pax_global_header00006660000000000000000000000064144420153500014510gustar00rootroot0000000000000052 comment=9814c75eea18fc7374cde884592233b6b7dc055b nccl-2.18.3-1/000077500000000000000000000000001444201535000126605ustar00rootroot00000000000000nccl-2.18.3-1/.gitignore000066400000000000000000000001351444201535000146470ustar00rootroot00000000000000# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. /build *.gcov /coverage/ nccl-2.18.3-1/LICENSE.txt000066400000000000000000000035471444201535000145140ustar00rootroot00000000000000 Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National Laboratory, the U.S. Department of Energy, nor the names of their contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The U.S. Department of Energy funded the development of this software under subcontract 7078610 with Lawrence Berkeley National Laboratory. This code also includes files from the NVIDIA Tools Extension SDK project. See: https://github.com/NVIDIA/NVTX for more information and license details. nccl-2.18.3-1/Makefile000066400000000000000000000012261444201535000143210ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # .PHONY : all clean default : src.build install : src.install BUILDDIR ?= $(abspath ./build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := src pkg clean: ${TARGETS:%=%.clean} test.build: src.build LICENSE_FILES := LICENSE.txt LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) lic: $(LICENSE_TARGETS) ${BUILDDIR}/%.txt: %.txt @printf "Copying %-35s > %s\n" $< $@ mkdir -p ${BUILDDIR} cp $< $@ src.%: ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} pkg.%: ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} pkg.debian.prep: lic pkg.txz.prep: lic nccl-2.18.3-1/README.md000066400000000000000000000047561444201535000141530ustar00rootroot00000000000000# NCCL Optimized primitives for inter-GPU communication. ## Introduction NCCL (pronounced "Nickel") is a stand-alone library of standard communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, as well as any send/receive based communication pattern. 
It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html). ## Build Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds. To build the library : ```shell $ cd nccl $ make -j src.build ``` If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with : ```shell $ make src.build CUDA_HOME= ``` NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set. By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform : ```shell $ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" ``` ## Install To install NCCL on the system, create a package then install it as root. Debian/Ubuntu : ```shell $ # Install tools to create debian packages $ sudo apt install build-essential devscripts debhelper fakeroot $ # Build NCCL deb package $ make pkg.debian.build $ ls build/pkg/deb/ ``` RedHat/CentOS : ```shell $ # Install tools to create rpm packages $ sudo yum install rpm-build rpmdevtools $ # Build NCCL rpm package $ make pkg.redhat.build $ ls build/pkg/rpm/ ``` OS-agnostic tarball : ```shell $ make pkg.txz.build $ ls build/pkg/txz/ ``` ## Tests Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests. ```shell $ git clone https://github.com/NVIDIA/nccl-tests.git $ cd nccl-tests $ make $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g ``` ## Copyright All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. nccl-2.18.3-1/ext-net/000077500000000000000000000000001444201535000142445ustar00rootroot00000000000000nccl-2.18.3-1/ext-net/README.md000066400000000000000000000426261444201535000155350ustar00rootroot00000000000000# NCCL Net Plugin Documentation This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL. # Overview To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins implement the NCCL network API, and decouple NCCL binary builds which are built against a particular version of the GPU stack (i.e. CUDA) from the network code which is built against a particular version of the networking stack. That way, we can easily integrate any CUDA version with any network stack version. NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library contains one or more implementations of the NCCL NET API, in the form of versioned structs, filled with pointers to all required functions. # Plugin architecture ## Plugin name and supporting multiple network plugins When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it, then look for symbols inside the library. The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. 
If set, NCCL will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore advised to name the library following that pattern, with a symlink pointing `libnccl-net.so` to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path, setting `NCCL_NET_PLUGIN` will allow users to select the right plugin. ## Struct versioning Once a library is found, NCCL will look for a symbol named `ncclNet_vX`, with `X` increasing over time. The versioning ensures that the plugin and the NCCL core are compatible. Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL versions. Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking for the latest ncclNet struct version, but also looking for older ones so that older plugins would still work. ## In-network collective operations, a.k.a. collNet Additionally to the ncclNet structure, network plugins can provide a collNet structure which implements in-network collective operations, if supported. That can be used by the NCCL collNet algorithm to accelerate inter-node reductions in allReduce. The collNet struct is a different, optional struct provided by the network plugin, but its versioning is tied to the ncclNet struct and many functions are common between the two to ease the implementation. ## Headers management To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions they support to their internal includes. An example is shown in `ext-net/example/` where we keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. # API (v6) Below is the main `ncclNet_v6` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
```

## Error codes

All plugin functions use NCCL error codes as their return value. `ncclSuccess` should be returned upon success. Otherwise, plugins can return one of the following:
- `ncclSystemError` is the most common error for network plugins, when a call to the linux kernel or a system library fails. This typically includes all network/hardware errors.
- `ncclInternalError` is returned when the NCCL core code is using the network plugin in an incorrect way, for example allocating more requests than it should, or passing an invalid argument to calls.
- `ncclInvalidUsage` should be returned when the error is most likely a user error. This can include misconfiguration, but also size mismatches.
- `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by the NCCL core layer.
- `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should not need to rely on CUDA, this should not be common.

## Operation overview

NCCL will call the `init` function first, then query the number of network devices with the `devices` function, getting the properties of each network device with `getProperties`.

To establish a connection between two network devices, NCCL will first call `listen` on the receiving side, pass the returned handle to the sender side of the connection, and call `connect` with that handle. Finally, `accept` will be called on the receiving side to finalize the connection establishment.

Once the connection is established, communication will be done using the functions `isend`, `irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.

In certain conditions, `iflush` will be called after a receive call completes to allow the network plugin to flush data and ensure the GPU will observe the newly written data.

To close the connections, NCCL will call `closeListen` to close the object returned by `listen`, `closeSend` to close the object returned by `connect`, and `closeRecv` to close the object returned by `accept`.

## API Functions

### Initialization

`name`

The `name` field should point to a character string with the name of the network plugin.
This will be used for all logging, especially when `NCCL_DEBUG=INFO` is set.

Note: setting `NCCL_NET=` will ensure a specific network implementation is used, with a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN` which defines a suffix to the `libnccl-net.so` library name to load.

`init`

As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function. This will allow the plugin to discover network devices and make sure they are usable. If the `init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on internal ones.

To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within the plugin code, adding the following definitions:

```
#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
```

`devices`

Once the plugin is initialized, NCCL will query the number of devices available. It should not be zero; otherwise NCCL initialization will fail. If no device is present or usable, the `init` function should not return `ncclSuccess`.

`getProperties`

Right after getting the number of devices, NCCL will query properties for each available network device. These properties are critical when multiple adapters are present to ensure NCCL uses each adapter in the most optimized way.

The `name` is only used for logging.

The `pciPath` is the base for all topology detection and should point to the PCI device directory in /sys. This is typically the directory pointed to by `/sys/class/net/eth0/device` or `/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should be `NULL`.

The `guid` field is used to determine when network adapters are connected to multiple PCI endpoints. For normal cases, it can be set to the device number. If multiple network devices have the same guid, then NCCL will consider they are sharing the same network port to the fabric, hence it will not use the port multiple times.

The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and provide a `regMrDmaBuf` function.

The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is important to ensure proper optimization of flows within the node.

The `port` field indicates the port number. This is important again for topology detection and flow optimization within the node when a NIC with a single PCI connection is connected to the fabric with multiple ports.

The `latency` field indicates the network latency in microseconds. This can be useful to improve the NCCL tuning and make sure NCCL switches from tree to ring at the right size.

The `maxComms` field indicates the maximum number of connections we can create.

The `maxRecvs` field indicates the maximum number of grouped receive operations (see grouped receive).

### Connection establishment

Connections are used in a unidirectional manner. There is therefore a sender side and a receiver side.

`listen`

To create a connection, NCCL will start by calling `listen` on the receiver side.
This function takes a device number as its input argument, and should return a local `listenComm` object as well as a `handle` to pass to the other side, so that the sender side can connect to the receiver. The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL. This call should never block, but contrary to `connect` and `accept`, `listenComm` should never be `NULL` if the call succeeds.

`connect`

NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call `connect` on the sender side on a given device index `dev`, providing the `handle`. `connect` should not block either, and instead set `sendComm` to `NULL` and return `ncclSuccess`. In that case, NCCL will call `connect` again until it succeeds.

`accept`

To finalize the connection, the receiver side will call `accept` on the `listenComm` returned previously by the `listen` call. If the sender did not connect yet, `accept` should not block. It should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds.

`closeListen`/`closeSend`/`closeRecv`

Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call `closeListen`/`closeSend`/`closeRecv` to free the associated resources.

### Communication

Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`. To support RDMA capabilities, buffer registration and flush functions are provided.

To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL, then queried with `test`. Each `sendComm` or `recvComm` must be able to handle `NCCL_NET_MAX_REQUESTS` requests in parallel.

Note: that value should be multiplied by the multi-receive capability of the plugin for the sender side, so that we can effectively have `NCCL_NET_MAX_REQUESTS` multi-receive operations happening in parallel. So, if we have a `maxRecvs` value of 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each `sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations.

`regMr`

Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for communication. It will provide a `sendComm` or `recvComm` as the `comm` argument, then the buffer pointer `data`, `size`, and `type`, the latter being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network supports CUDA pointers. The network plugin can use the output argument `mhandle` to keep any reference to that memory registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and `deregMr` calls.

`regMrDmaBuf`

If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf` instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`.

`deregMr`

When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin free resources. This function is used to deregister handles returned by both `regMr` and `regMrDmaBuf`.

`isend`

Data will be sent through the connection using `isend`, passing the `sendComm` previously created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be used if the network supports multi-receive operations (see `irecv`) to distinguish between different sends matching the same multi-receive. Otherwise it can be set to 0.

The `isend` operation returns a handle in the `request` argument for further calls to `test`. If the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call `isend` again later.
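To make this request life cycle concrete, here is a minimal, deliberately blocking sketch of how a caller could drive a plugin's `isend` and `test` entry points. It is not how NCCL itself schedules work (NCCL keeps many requests in flight asynchronously); the helper name `blockingSend` and the inclusion of the example `nccl/net.h` header are assumptions made for illustration.

```
/* Illustrative sketch only: drives the isend/test contract described above. */
#include <stddef.h>
#include "nccl/net.h"  // ncclNet_v6_t, ncclResult_t (header layout as in ext-net/example/nccl/)

ncclResult_t blockingSend(ncclNet_v6_t* net, void* sendComm,
                          void* data, int size, int tag, void* mhandle) {
  void* request = NULL;
  // isend may leave request == NULL if the send cannot be initiated yet;
  // in that case the caller simply retries later.
  while (request == NULL) {
    ncclResult_t res = net->isend(sendComm, data, size, tag, mhandle, &request);
    if (res != ncclSuccess) return res;
  }
  // Poll test() until the request completes; sizes may be NULL when the
  // byte count is not needed.
  int done = 0;
  while (done == 0) {
    ncclResult_t res = net->test(request, &done, NULL);
    if (res != ncclSuccess) return res;
  }
  // Once test() reports completion, the request handle must not be
  // passed to test() again.
  return ncclSuccess;
}
```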
`irecv` To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument `n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles` arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend` operations is received into the right buffer. If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer, otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are handled by a single request handle. The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation. The contrary (receive size being lower than the send size) is an error, however. Note: for a given connection, send/receive operations should always match in the order they were posted. Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag matching on any receive operation posted. `test` After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the real size sent or received, the latter being potentially lower than the size passed to `irecv`. In the case of a multi-receive, all receives will be considered as done as a single operation (the goal being to allow aggregation), hence they share a single request and a single `done` status. However, they can have different sizes, so when `done` is non-zero, the `sizes` array should contain the `n` sizes corresponding to the buffers passed to `irecv`. Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`). `iflush` After a receive operation completes, if the operation was targeting GPU memory and received a non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure the GPU can read it right after without seeing stale data. This flush operation is decoupled from the `test` code to improve latency of `LL*` protocols, as those are capable of determining when data is valid or not. `iflush` returns a request which needs to be queried with `test` until it completes. nccl-2.18.3-1/ext-net/example/000077500000000000000000000000001444201535000156775ustar00rootroot00000000000000nccl-2.18.3-1/ext-net/example/Makefile000066400000000000000000000005761444201535000173470ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # NCCL_HOME:=../../build/ CUDA_HOME:=/usr/local/cuda INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include PLUGIN_SO:=libnccl-net.so default: $(PLUGIN_SO) $(PLUGIN_SO): plugin.c $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ clean: rm -f $(PLUGIN_SO) nccl-2.18.3-1/ext-net/example/nccl/000077500000000000000000000000001444201535000166165ustar00rootroot00000000000000nccl-2.18.3-1/ext-net/example/nccl/err.h000066400000000000000000000007201444201535000175560ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/ #ifndef NCCL_ERR_H_ #define NCCL_ERR_H_ /* Error type for plugins */ typedef enum { ncclSuccess = 0, ncclUnhandledCudaError = 1, ncclSystemError = 2, ncclInternalError = 3, ncclInvalidArgument = 4, ncclRemoteError = 6 } ncclResult_t; #endif nccl-2.18.3-1/ext-net/example/nccl/net.h000066400000000000000000000017021444201535000175550ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_H_ #define NCCL_NET_H_ #include #include #include "err.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 #define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 8 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); #include "net_v6.h" #include "net_v5.h" #include "net_v4.h" #include "net_v3.h" #include "net_v2.h" #endif // end include guard nccl-2.18.3-1/ext-net/example/nccl/net_v2.h000066400000000000000000000050361444201535000201700ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V2_H_ #define NCCL_NET_V2_H_ typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Return the device path in /sys. NCCL will call free on this path. ncclResult_t (*pciPath)(int dev, char** path); // Return whether this device supports host pointers and/or CUDA pointers // as data from the current GPU. Supported types should be composed with // NCCL_PTR_HOST and NCCL_PTR_CUDA. ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
// May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v2_t; #endif // end include guard nccl-2.18.3-1/ext-net/example/nccl/net_v3.h000066400000000000000000000045141444201535000201710ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V3_H_ #define NCCL_NET_V3_H_ #define NCCL_NET_MAX_REQUESTS_V3 16 typedef ncclNetProperties_v4_t ncclNetProperties_v3_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v3_t; #endif // end include guard nccl-2.18.3-1/ext-net/example/nccl/net_v4.h000066400000000000000000000054751444201535000202010ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V4_H_ #define NCCL_NET_V4_H_ #define NCCL_NET_HANDLE_MAXSIZE_V4 64 typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. 
uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA int speed; // Port speed in Mbps. int port; // Port number. int maxComms; // Maximum number of comms we can create } ncclNetProperties_v4_t; // v4 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; #endif // end include guard nccl-2.18.3-1/ext-net/example/nccl/net_v5.h000066400000000000000000000054121444201535000201710ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V5_H_ #define NCCL_NET_V5_H_ typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. 
// This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v5_t; #endif // end include guard nccl-2.18.3-1/ext-net/example/nccl/net_v6.h000066400000000000000000000067411444201535000202000ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_NET_V6_H_ #define NCCL_NET_V6_H_ typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. }ncclNetProperties_v6_t; typedef ncclNetProperties_v6_t ncclNetProperties_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. 
// This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v6_t; #endif // end include guard nccl-2.18.3-1/ext-net/example/nccl/types.h000066400000000000000000000011431444201535000201320ustar00rootroot00000000000000/* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ #ifndef NCCL_ERR_H_ #define NCCL_ERR_H_ /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclUint8 = 1, ncclInt32 = 2, ncclInt = 2, ncclUint32 = 3, ncclInt64 = 4, ncclUint64 = 5, ncclFloat16 = 6, ncclHalf = 6, ncclFloat32 = 7, ncclFloat = 7, ncclFloat64 = 8, ncclDouble = 8, ncclBfloat16 = 9, } ncclDataType_t; #endif nccl-2.18.3-1/ext-net/example/plugin.c000066400000000000000000000165401444201535000173470ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include #define __hidden __attribute__ ((visibility("hidden"))) int max_requests = NCCL_NET_MAX_REQUESTS; __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) { //pluginPciPath(dev, &props.pciPath); //pluginPtrSupport(dev, &props.ptrSupport); return ncclInternalError; } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } __hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} __hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } #define PLUGIN_NAME "Plugin" const ncclNet_v6_t ncclNetPlugin_v6 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties, .listen = pluginListen, .connect = pluginConnect, .accept = pluginAccept, .regMr = pluginRegMr, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, .isend = pluginIsend, .irecv = pluginIrecv, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; /* v5 Compat */ const ncclNet_v5_t ncclNetPlugin_v5 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties, .listen = pluginListen, .connect = pluginConnect, .accept = pluginAccept, .regMr = pluginRegMr, .deregMr = pluginDeregMr, .isend = pluginIsend, .irecv = pluginIrecv, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; /* v4 Compat */ static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) { ncclNetProperties_v6_t props_v6; ncclResult_t ret = pluginGetProperties(dev, &props_v6); if (ret != ncclSuccess) return ret; props->name = props_v6.name; props->pciPath = props_v6.pciPath; 
props->guid = props_v6.guid; props->ptrSupport = props_v6.ptrSupport; props->speed = props_v6.speed; props->port = props_v6.port; props->maxComms = props_v6.maxComms; return ncclSuccess; } static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) { return pluginIsend(sendComm, data, size, 0, mhandle, request); } static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { int tag = 0; return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request); } static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { return pluginIflush(recvComm, 1, &data, &size, &mhandle, request); } static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) { ncclResult_t ret; do { ret = pluginConnect(dev, handle, sendComm); } while (ret == ncclSuccess && *sendComm == NULL); return ret; } static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) { ncclResult_t ret; do { ret = pluginAccept(listenComm, recvComm); } while (ret == ncclSuccess && *recvComm == NULL); return ret; } const ncclNet_v4_t ncclNetPlugin_v4 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, .getProperties = pluginGetProperties_v4, .listen = pluginListen, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr, .deregMr = pluginDeregMr, .isend = pluginIsend_v4, .irecv = pluginIrecv_v4, .iflush = pluginIflush_v4, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; /* v3 Compat */ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) { void* req; ncclResult_t ret = pluginIflush_v4(recvComm, data, size, mhandle, &req); int done = 0; while (ret == ncclSuccess && done == 0) { ret = pluginTest(req, &done, NULL); } return ret; } static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) { max_requests = NCCL_NET_MAX_REQUESTS_V3; return pluginInit(logFunction); } #include static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm); memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3); return ret; } static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3); return pluginConnect_v4(dev, &pluginHandle, sendComm); } const ncclNet_v3_t ncclNetPlugin_v3 = { .name = PLUGIN_NAME, .init = pluginInit_v3, .devices = pluginDevices, .getProperties = pluginGetProperties_v4, .listen = pluginListen_v3, .connect = pluginConnect_v3, .accept = pluginAccept_v4, .regMr = pluginRegMr, .deregMr = pluginDeregMr, .isend = pluginIsend_v4, .irecv = pluginIrecv_v4, .flush = pluginFlush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; /* v2 Compat */ const ncclNet_v2_t ncclNetPlugin_v2 = { .name = PLUGIN_NAME, .init = pluginInit_v3, .devices = pluginDevices, .pciPath = pluginPciPath, .ptrSupport = pluginPtrSupport, .listen = pluginListen, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr, .deregMr = pluginDeregMr, .isend = pluginIsend_v4, .irecv = pluginIrecv_v4, .flush = pluginFlush, .test = pluginTest, .closeSend = pluginCloseSend, .closeRecv = pluginCloseRecv, .closeListen = pluginCloseListen, }; 
nccl-2.18.3-1/ext-net/google-fastsocket/000077500000000000000000000000001444201535000176645ustar00rootroot00000000000000nccl-2.18.3-1/ext-net/google-fastsocket/Makefile000066400000000000000000000007701444201535000213300ustar00rootroot00000000000000CUDA_HOME?=/usr/local/cuda INC:=-I$(CUDA_HOME)/include PLUGIN_SO:=libnccl-net.so default: $(PLUGIN_SO) $(PLUGIN_SO): nccl-fastsocket/*.cc $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ nccl-fastsocket/*.cc: git clone https://github.com/google/nccl-fastsocket.git install: $(BUILDDIR)/lib/$(PLUGIN_SO) $(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO) @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(BUILDDIR)/lib install -m 644 $< $@ clean: rm -f $(PLUGIN_SO) rm -Rf nccl-fastsocket nccl-2.18.3-1/makefiles/000077500000000000000000000000001444201535000146205ustar00rootroot00000000000000nccl-2.18.3-1/makefiles/common.mk000066400000000000000000000074331444201535000164500ustar00rootroot00000000000000# # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 KEEP ?= 0 DEBUG ?= 0 TRACE ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) # You should define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. 
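# For example, to restrict compilation to a single target architecture (as
# also shown in the top-level README), one could run:
#   make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"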
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \ -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) # SM35 is deprecated from CUDA12.0 onwards CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) # Include Volta support if we're using CUDA9 or above else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) else NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) endif $(info NVCC_GENCODE is ${NVCC_GENCODE}) CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ -I $(CUDA_INC) \ $(CXXFLAGS) # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt ########## GCOV ########## GCOV ?= 0 # disable by default. GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1 CXXFLAGS += ${GCOV_FLAGS} NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} LDFLAGS += ${GCOV_FLAGS} NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %} # $(warning GCOV_FLAGS=${GCOV_FLAGS}) ########## GCOV ########## ifeq ($(DEBUG), 0) NVCUFLAGS += -O3 CXXFLAGS += -O3 -g else NVCUFLAGS += -O0 -G -g CXXFLAGS += -O0 -g -ggdb3 endif ifneq ($(VERBOSE), 0) NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra else .SILENT: endif ifneq ($(TRACE), 0) CXXFLAGS += -DENABLE_TRACE endif ifeq ($(NVTX), 0) CXXFLAGS += -DNVTX_DISABLE endif ifneq ($(KEEP), 0) NVCUFLAGS += -keep endif ifneq ($(PROFAPI), 0) CXXFLAGS += -DPROFAPI endif ifneq ($(RDMA_CORE), 0) CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 endif nccl-2.18.3-1/makefiles/formatting.mk000066400000000000000000000022361444201535000173260ustar00rootroot00000000000000# # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # # Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting # As this file defines a new target (format), it should be included at least after the definition of the # default target. 
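# Illustrative usage (paths and file list below are hypothetical): a component
# Makefile can define the files to format and then include this file, e.g.
#   FILESTOFORMAT := $(shell find src -name '*.cc' -o -name '*.h')
#   include ../makefiles/formatting.mk
# after which `make format` will run astyle over $(FILESTOFORMAT).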
ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none ASTYLEDIR := $(BUILDDIR)/contrib ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/ ASTYLEVER := 3.1 ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz" $(ASTYLEDIR) : @mkdir -p $(ASTYLEDIR) $(ASTYLETAR) : $(ASTYLEDIR) @wget -q -O $(ASTYLETAR) $(ASTYLEURL) $(ASTYLEBLD) : $(ASTYLETAR) @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR) $(ASTYLEBIN) : $(ASTYLEBLD) ${MAKE} -C $(ASTYLEBLD) .PHONY : format format : $(ASTYLEBIN) @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT) nccl-2.18.3-1/makefiles/version.mk000066400000000000000000000001471444201535000166400ustar00rootroot00000000000000##### version NCCL_MAJOR := 2 NCCL_MINOR := 18 NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 nccl-2.18.3-1/pkg/000077500000000000000000000000001444201535000134415ustar00rootroot00000000000000nccl-2.18.3-1/pkg/Makefile000066400000000000000000000010161444201535000150770ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # .PHONY : all clean default : build build : debian.build txz.build BUILDDIR ?= $(abspath ../build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := debian txz all: ${TARGETS:%=%.build} prep: ${TARGETS:%=%.prep} build: ${TARGETS:%=%.build} clean: ${TARGETS:%=%.clean} %.prep: ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR} %.build: ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR} %.clean: ${MAKE} -C $* clean nccl-2.18.3-1/pkg/debian/000077500000000000000000000000001444201535000146635ustar00rootroot00000000000000nccl-2.18.3-1/pkg/debian/.gitignore000066400000000000000000000001211444201535000166450ustar00rootroot00000000000000/*.debhelper.log /*.debhelper /*.substvars /tmp/ /files /libnccl1/ /libnccl-dev/ nccl-2.18.3-1/pkg/debian/Makefile000066400000000000000000000031501444201535000163220ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) DEBPREPDIR := $(BUILDDIR)/debian PKGDIR := $(BUILDDIR)/pkg/deb/ DEBGEN_IN := $(wildcard *.in) DEBGEN := $(DEBGEN_IN:.in=) DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) PKG_TIMESTAMP := $(shell date -R) PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) prep : $(DEBTARGETS) $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) build : prep $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) @printf "Building Debian package\n" (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/ clean: rm -Rf $(DEBPREPDIR) $(PKGDIR) $(DEBPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(DEBPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ $< > $@ $(DEBPREPDIR)/% : % @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(DEBPREPDIR) cp -f $< $@ nccl-2.18.3-1/pkg/debian/changelog.in000066400000000000000000000003471444201535000171460ustar00rootroot00000000000000nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium * Automatic Debian package from build -- cudatools ${pkg:Timestamp} nccl-2.18.3-1/pkg/debian/compat000066400000000000000000000000021444201535000160610ustar00rootroot000000000000009 nccl-2.18.3-1/pkg/debian/control.in000066400000000000000000000023511444201535000166740ustar00rootroot00000000000000Source: nccl Section: libs Maintainer: cudatools Priority: optional Build-depends: debhelper(>=9) Standards-Version: 3.9.5 Package: libnccl${nccl:Major} Section: libs Architecture: ${pkg:Arch} Depends: ${misc:Depends}, ${shlibs:Depends} Description: NVIDIA Collective Communication Library (NCCL) Runtime NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. Package: libnccl-dev Section: libdevel Architecture: ${pkg:Arch} Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) Description: NVIDIA Collective Communication Library (NCCL) Development Files NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. 
nccl-2.18.3-1/pkg/debian/copyright000077700000000000000000000000001444201535000210622../../LICENSE.txtustar00rootroot00000000000000nccl-2.18.3-1/pkg/debian/gbp.conf000066400000000000000000000001641444201535000163030ustar00rootroot00000000000000[DEFAULT] debian-branch = master upstream-branch = master ignore-new = True [git-buildpackage] no-purge = True nccl-2.18.3-1/pkg/debian/libnccl-dev.install.in000066400000000000000000000002241444201535000210400ustar00rootroot00000000000000include/nccl.h /usr/include include/nccl_net.h /usr/include lib/libnccl.so /usr/lib/${pkg:MultiArch} lib/libnccl_static.a /usr/lib/${pkg:MultiArch} nccl-2.18.3-1/pkg/debian/libnccl2.install.in000066400000000000000000000002121444201535000203430ustar00rootroot00000000000000lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch} lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch} nccl-2.18.3-1/pkg/debian/rules000077500000000000000000000003011444201535000157350ustar00rootroot00000000000000#!/usr/bin/make -f %: dh $@ --parallel override_dh_auto_install: PREFIX=debian/tmp dh_auto_install override_dh_auto_test: # Do not make test override_dh_auto_clean: # Do not make clean nccl-2.18.3-1/pkg/debian/source/000077500000000000000000000000001444201535000161635ustar00rootroot00000000000000nccl-2.18.3-1/pkg/debian/source/format000066400000000000000000000000151444201535000173720ustar00rootroot000000000000003.0 (native) nccl-2.18.3-1/pkg/redhat/000077500000000000000000000000001444201535000147105ustar00rootroot00000000000000nccl-2.18.3-1/pkg/redhat/Makefile000066400000000000000000000037041444201535000163540ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) RPMPREPDIR := $(BUILDDIR)/redhat PKGDIR := $(BUILDDIR)/pkg/rpm/ RPMGEN_IN := $(wildcard *.in) RPMGEN := $(RPMGEN_IN:.in=) RPMFILES := $(RPMGEN) RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES)) PKG_TIMESTAMP := $(shell date -R) ARCH := $(shell uname -m) PKG_ARCH ?= $(shell uname -m) PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) ifeq ($(PKG_MULTIARCH),) # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it PKG_MULTIARCH := $(ARCH)-linux-gnu endif prep : $(RPMTARGETS) $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) build : prep $(MAKE) -C ../.. 
src.build BUILDDIR=$(BUILDDIR) $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR) @printf "Building Redhat package\n" mkdir -p $(PKGDIR) rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \ --define "_rpmdir $(PKGDIR)" \ --define "_builddir $(PKGDIR)/build/" \ --define "_buildrootdir $(PKGDIR)/buildroot/" \ -bb $(BUILDDIR)/redhat/nccl.spec clean: rm -Rf $(RPMPREPDIR) $(PKGDIR) $(RPMPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(RPMPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \ $< > $@ $(RPMPREPDIR)/% : % @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(RPMPREPDIR) cp -f $< $@ nccl-2.18.3-1/pkg/redhat/nccl.spec.in000066400000000000000000000047631444201535000171220ustar00rootroot00000000000000Name: libnccl Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries License: BSD URL: http://developer.nvidia.com/nccl Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz Requires(pre,preun): /sbin/ldconfig %description NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. 
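# Note: pkg/redhat/Makefile instantiates this template with sed and then
# invokes rpmbuild with _sourcedir, _rpmdir, _builddir and _buildrootdir all
# redefined, so the whole RPM build stays under BUILDDIR/pkg/rpm; Source0
# above is the txz produced beforehand by the pkg/txz build. A rough sketch
# for inspecting the result after a build (the arch subdirectory and exact
# file names are assumptions that depend on the build host and version):
#   find build/pkg/rpm -name '*.rpm' -exec rpm -qpi {} \;
#   rpm -qpl build/pkg/rpm/x86_64/libnccl-devel-*.rpm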
%package devel Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries %description devel NCCL development files %package static Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries %description static NCCL static library %define debug_package %{nil} %prep %setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q %build %install rm -rf $RPM_BUILD_ROOT install -m 755 -d $RPM_BUILD_ROOT install -m 755 -d $RPM_BUILD_ROOT/%{_libdir} install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir} ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} # devel install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} %post -p /sbin/ldconfig %postun -p /sbin/ldconfig %post devel -p /sbin/ldconfig %postun devel -p /sbin/ldconfig %clean rm -rf $RPM_BUILD_ROOT %files devel %doc LICENSE.txt %defattr(-,root,root,-) %{_includedir}/nccl.h %{_includedir}/nccl_net.h %{_libdir}/libnccl.so %files static %doc LICENSE.txt %defattr(-,root,root,-) %{_libdir}/libnccl_static.a %files %doc LICENSE.txt %defattr(-,root,root,-) %{_libdir}/libnccl.so.${nccl:Major} %{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} %changelog nccl-2.18.3-1/pkg/srctxz/000077500000000000000000000000001444201535000147765ustar00rootroot00000000000000nccl-2.18.3-1/pkg/srctxz/Makefile000066400000000000000000000020511444201535000164340ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) TXZPREPDIR := $(BUILDDIR)/srctxz PKGDIR := $(BUILDDIR)/pkg/srctxz/ TXZGEN_IN := $(wildcard *.in) TXZGEN := $(TXZGEN_IN:.in=) TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) PKG_REVISION ?= 3 PKG_ARCH := $(shell uname -m) prep: $(TXZTARGETS) build: prep $(MAKE) -C ../../src clean @printf "Building source tar.xz package\n" (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) clean: rm -Rf $(TXZPREPDIR) $(PKGDIR) $(TXZPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(TXZPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ $< > $@ nccl-2.18.3-1/pkg/srctxz/create_srctxz.sh.in000066400000000000000000000015101444201535000206140ustar00rootroot00000000000000#!/bin/bash # # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # # To run from $BUILDDIR/ cd .. NCCLDIR=`basename $PWD` echo "Checking for unclean directory ..." git clean -x -i echo "Clean done." echo "Checking for uncommited files ..." if [ "`git status -s | wc -l`" != "0" ]; then git status -s echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)" read fi cd .. 
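#
# Note: at this point we are in the parent directory of the checkout
# ($NCCLDIR). The sed rule in pkg/srctxz/Makefile fills in the version
# placeholders below from makefiles/version.mk, and the tar --transform at
# the end renames the top-level directory inside the archive while excluding
# build/, .git* and pkg/srctxz. With the current version.mk values the
# archive would be named something like:
#
#   nccl-src_2.18.3-1.txz
#
# with the top-level directory inside it renamed to match, whatever the
# checkout directory happens to be called.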
NCCL_MAJOR=${nccl:Major} NCCL_MINOR=${nccl:Minor} NCCL_PATCH=${nccl:Patch} NCCL_SUFFIX=${nccl:Suffix} NCCL_BUILD=${pkg:Revision} NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" tar --exclude build \ --exclude ".git*" \ --exclude pkg/srctxz \ --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR nccl-2.18.3-1/pkg/txz/000077500000000000000000000000001444201535000142665ustar00rootroot00000000000000nccl-2.18.3-1/pkg/txz/Makefile000066400000000000000000000022751444201535000157340ustar00rootroot00000000000000# # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../makefiles/common.mk include ../../makefiles/version.mk BUILDDIR ?= $(abspath ../../build) TXZPREPDIR := $(BUILDDIR)/txz PKGDIR := $(BUILDDIR)/pkg/txz/ TXZGEN_IN := $(wildcard *.in) TXZGEN := $(TXZGEN_IN:.in=) TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN)) PKG_ARCH := $(shell uname -m) prep: $(TXZTARGETS) $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) build: prep $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR) @printf "Building tar.xz package\n" (cd $(BUILDDIR); bash txz/create_txz.sh) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../nccl*.txz $(PKGDIR) clean: rm -Rf $(TXZPREPDIR) $(PKGDIR) $(TXZPREPDIR)/% : %.in @printf "Generating %-35s > %s\n" $< $@ mkdir -p $(TXZPREPDIR) sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \ $< > $@ nccl-2.18.3-1/pkg/txz/create_txz.sh.in000066400000000000000000000012141444201535000173750ustar00rootroot00000000000000#!/bin/bash # # Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # # To run from $BUILDDIR/ BUILDDIR=`basename $PWD` cd .. NCCL_MAJOR=${nccl:Major} NCCL_MINOR=${nccl:Minor} NCCL_PATCH=${nccl:Patch} NCCL_SUFFIX=${nccl:Suffix} CUDA_MAJOR=${cuda:Major} CUDA_MINOR=${cuda:Minor} PKG_REVISION=${pkg:Revision} PKG_ARCH=${pkg:Arch} NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt nccl-2.18.3-1/src/000077500000000000000000000000001444201535000134475ustar00rootroot00000000000000nccl-2.18.3-1/src/Makefile000066400000000000000000000122151444201535000151100ustar00rootroot00000000000000# # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # include ../makefiles/common.mk include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \ misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \ misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \ misc/ipcsocket.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \ collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc ##### lib files LIBNAME := libnccl.so STATICLIBNAME := libnccl_static.a ##### pkgconfig files PKGCONFIGFILE := nccl.pc ##### dirs BUILDDIR ?= $(abspath ../build) INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig ##### target files CUDARTLIB ?= cudart_static ifeq ($(CUDARTLIB), cudart_static) # Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658 LIBSRCFILES += enhcompat.cc endif INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) PKGTARGET := $(PKGCONFIGFILE) LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a ##### rules build : lib staticlib lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) $(DEVICELIB): ALWAYS_REBUILD $(INCTARGETS) $(MAKE) -C collectives/device # Empty target to force rebuild ALWAYS_REBUILD: -include $(DEPFILES) $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z)) @$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) mkdir -p $(INCDIR) @printf "Generating %-35s > %s\n" $< $@ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \ $< > $@ $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) null := space := $(null) # comma := , $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ mkdir -p $(LIBDIR) printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M $(PKGDIR)/nccl.pc : nccl.pc.in mkdir -p $(PKGDIR) @printf "Generating %-35s > %s\n" $< $@ sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \ -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ $< > $@ $(INCDIR)/%.h : %.h @printf "Grabbing %-35s > %s\n" 
$< $@ mkdir -p $(INCDIR) install -m 644 $< $@ $(INCDIR)/nccl_%.h : include/nccl_%.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) install -m 644 $< $@ $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) install -m 644 $< $@ $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) @rm -f $(@:%.o=%.d.tmp) clean : $(MAKE) -C collectives/device clean rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} install : build mkdir -p $(PREFIX)/lib mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well # as the BUILDDIR variable. include ../makefiles/formatting.mk nccl-2.18.3-1/src/bootstrap.cc000066400000000000000000000531311444201535000157760ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "core.h" #include "utils.h" #include "bootstrap.h" #include "net.h" #include #include #include "proxy.h" struct bootstrapRootArgs { struct ncclSocket* listenSock; uint64_t magic; }; /* Init functions */ static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress bootstrapNetIfAddr; static int bootstrapNetInitDone = 0; pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { pthread_mutex_lock(&bootstrapNetLock); if (bootstrapNetInitDone == 0) { char* env = getenv("NCCL_COMM_ID"); if (env) { union ncclSocketAddress remoteAddr; if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); return ncclInvalidArgument; } if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { WARN("NET/Socket : No usable listening interface found"); return ncclSystemError; } } else { int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); return ncclInternalError; } } char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2]; sprintf(line, " %s:", bootstrapNetIfName); ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line)); INFO(NCCL_INIT, "Bootstrap : Using%s", line); bootstrapNetInitDone = 1; } pthread_mutex_unlock(&bootstrapNetLock); } return ncclSuccess; } /* Socket Interface Selection type */ enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; // Additional sync functions static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) { 
NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int))); NCCLCHECK(ncclSocketSend(sock, data, size)); return ncclSuccess; } static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) { int recvSize; NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { WARN("Message truncated : received %d bytes instead of %d", recvSize, size); return ncclInternalError; } NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size))); return ncclSuccess; } struct extInfo { int rank; int nranks; union ncclSocketAddress extAddressListenRoot; union ncclSocketAddress extAddressListen; }; #include static ncclResult_t setFilesLimit() { struct rlimit filesLimit; SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); filesLimit.rlim_cur = filesLimit.rlim_max; SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit"); return ncclSuccess; } static void *bootstrapRoot(void* rargs) { struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs; struct ncclSocket* listenSock = args->listenSock; uint64_t magic = args->magic; ncclResult_t res = ncclSuccess; int nranks = 0, c = 0; struct extInfo info; union ncclSocketAddress *rankAddresses = NULL; union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange union ncclSocketAddress *zero = NULL; NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out); setFilesLimit(); TRACE(NCCL_INIT, "BEGIN"); /* Receive addresses from all ranks */ do { struct ncclSocket sock; NCCLCHECKGOTO(ncclSocketInit(&sock), res, out); NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out); NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); NCCLCHECKGOTO(ncclSocketClose(&sock), res, out); if (c == 0) { nranks = info.nranks; NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out); NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out); } if (nranks != info.nranks) { WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks); goto out; } if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); goto out; } // Save the connection handle for that rank memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress)); memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress)); ++c; TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); } while (c < nranks); TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks); // Send the connect handle for the next rank in the AllGather ring for (int r=0; raddr, handle->magic, ncclSocketTypeBootstrap, NULL, 0)); NCCLCHECK(ncclSocketListen(listenSock)); NCCLCHECK(ncclSocketGetAddr(listenSock, &handle->addr)); NCCLCHECK(ncclCalloc(&args, 1)); args->listenSock = listenSock; args->magic = handle->magic; NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0); ncclSetThreadName(thread, "NCCL BootstrapR"); NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d return ncclSuccess; } ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) { memset(handle, 0, sizeof(ncclBootstrapHandle)); NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); char* env = getenv("NCCL_COMM_ID"); if (env) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: 
: or []: or :"); return ncclInvalidArgument; } } else { memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); NCCLCHECK(bootstrapCreateRoot(handle, false)); } return ncclSuccess; } struct unexConn { int peer; int tag; struct ncclSocket sock; struct unexConn* next; }; struct bootstrapState { struct ncclSocket listenSock; struct ncclSocket ringRecvSocket; struct ncclSocket ringSendSocket; union ncclSocketAddress* peerCommAddresses; union ncclSocketAddress* peerProxyAddresses; struct unexConn* unexpectedConnections; int cudaDev; int rank; int nranks; uint64_t magic; volatile uint32_t *abortFlag; }; ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) { int rank = comm->rank; int nranks = comm->nRanks; struct bootstrapState* state; struct ncclSocket* proxySocket; ncclSocketAddress nextAddr; struct ncclSocket sock, listenSockRoot; struct extInfo info = { 0 }; NCCLCHECK(ncclCalloc(&state, 1)); state->rank = rank; state->nranks = nranks; state->abortFlag = comm->abortFlag; comm->bootstrap = state; comm->magic = state->magic = handle->magic; TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); info.rank = rank; info.nranks = nranks; // Create socket for other ranks to contact me NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketListen(&state->listenSock)); NCCLCHECK(ncclSocketGetAddr(&state->listenSock, &info.extAddressListen)); // Create socket for root to contact me NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketListen(&listenSockRoot)); NCCLCHECK(ncclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); // stagger connection times to avoid an overload of the root if (nranks > 128) { long msec = rank; struct timespec tv; tv.tv_sec = msec / 1000; tv.tv_nsec = 1000000 * (msec % 1000); TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); (void) nanosleep(&tv, NULL); } // send info on my listening socket to root NCCLCHECK(ncclSocketInit(&sock, &handle->addr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(&sock)); NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); NCCLCHECK(ncclSocketClose(&sock)); // get info on my "next" rank in the bootstrap ring from root NCCLCHECK(ncclSocketInit(&sock)); NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot)); NCCLCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union ncclSocketAddress))); NCCLCHECK(ncclSocketClose(&sock)); NCCLCHECK(ncclSocketClose(&listenSockRoot)); NCCLCHECK(ncclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(&state->ringSendSocket)); // Accept the connect request from the previous rank in the AllGather ring NCCLCHECK(ncclSocketInit(&state->ringRecvSocket)); NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock)); // AllGather all listen handlers NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks)); NCCLCHECK(ncclSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank)); NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress))); // Create the service proxy NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks)); // proxy is aborted through a message; don't set abortFlag NCCLCHECK(ncclCalloc(&proxySocket, 1)); NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, 
ncclSocketTypeProxy, comm->abortFlag)); NCCLCHECK(ncclSocketListen(proxySocket)); NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank)); NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress))); NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses)); TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) { ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; int prev, next; ncclSocketAddress listenAddr, tmpAddr; struct ncclSocket* proxySocket; struct bootstrapState* state; NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail); state->rank = rank; state->nranks = nranks; state->abortFlag = comm->abortFlag; comm->bootstrap = state; comm->magic = state->magic = handle->magic; prev = parentRanks[(rank-1+nranks)%nranks]; next = parentRanks[(rank+1)%nranks]; // Setup my sockets for the allgather ring and other p2p connections NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); // Create socket for other ranks to contact me NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail); // Get addr from next rank NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail); NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail); NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail); // Accept the connect request from the previous rank in the AllGather ring NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail); // AllGather all listen handlers NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail); memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress)); NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail); if (parent->config.splitShare) { /* map local rank to top parent local rank. 
*/ for (int i = 0; i < nranks; ++i) { comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; } comm->proxyState = parent->sharedRes->proxyState; ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount); } else { // Create the service proxy NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail); NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail); NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail); NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail); memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress)); NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail); NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail); } INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next); exit: return ret; fail: goto exit; } ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { struct bootstrapState* state = (struct bootstrapState*)commState; char* data = (char*)allData; int rank = state->rank; int nranks = state->nranks; TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size); /* Simple ring based AllGather * At each step i receive data from (rank-i-1) from left * and send previous step's data from (rank-i) to right */ for (int i=0; iringSendSocket, data+sslice*size, size)); // Recv slice from the left NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size)); } TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); return ncclSuccess; } ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) { ncclResult_t ret = ncclSuccess; struct bootstrapState* state = (struct bootstrapState*)commState; struct ncclSocket sock; NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail); NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail); exit: NCCLCHECK(ncclSocketClose(&sock)); return ret; fail: goto exit; } ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) { if (nranks == 1) return ncclSuccess; TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); /* Simple intra process barrier * * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ int data[1]; for (int mask=1; maskpeer = peer; unex->tag = tag; memcpy(&unex->sock, sock, sizeof(struct ncclSocket)); // Enqueue struct unexConn* list = state->unexpectedConnections; if (list == NULL) { state->unexpectedConnections = unex; return ncclSuccess; } while (list->next) list = list->next; list->next = unex; return ncclSuccess; } ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; *found = 0; while (elem) { if (elem->peer == peer && elem->tag == tag) { if (prev == NULL) { 
state->unexpectedConnections = elem->next; } else { prev->next = elem->next; } memcpy(sock, &elem->sock, sizeof(struct ncclSocket)); free(elem); *found = 1; return ncclSuccess; } prev = elem; elem = elem->next; } return ncclSuccess; } static void unexpectedFree(struct bootstrapState* state) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; while (elem) { prev = elem; elem = elem->next; free(prev); } return; } // We can't know who we'll receive from, so we need to receive everything at once ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) { ncclResult_t ret = ncclSuccess; struct bootstrapState* state = (struct bootstrapState*)commState; struct ncclSocket sock; int newPeer, newTag; // Search unexpected connections first int found; NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found)); if (found) { NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); goto exit; } // Then look for new connections while (1) { NCCLCHECKGOTO(ncclSocketInit(&sock), ret, fail); NCCLCHECKGOTO(ncclSocketAccept(&sock, &state->listenSock), ret, fail); NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail); if (newPeer == peer && newTag == tag) { NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); goto exit; } // Unexpected connection. Save for later. NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail); } exit: NCCLCHECK(ncclSocketClose(&sock)); return ret; fail: goto exit; } ncclResult_t bootstrapClose(void* commState) { struct bootstrapState* state = (struct bootstrapState*)commState; if (state->unexpectedConnections != NULL) { unexpectedFree(state); if (*state->abortFlag == 0) { WARN("Unexpected connections are not empty"); return ncclInternalError; } } NCCLCHECK(ncclSocketClose(&state->listenSock)); NCCLCHECK(ncclSocketClose(&state->ringSendSocket)); NCCLCHECK(ncclSocketClose(&state->ringRecvSocket)); free(state->peerCommAddresses); free(state); return ncclSuccess; } ncclResult_t bootstrapAbort(void* commState) { struct bootstrapState* state = (struct bootstrapState*)commState; if (commState == NULL) return ncclSuccess; NCCLCHECK(ncclSocketClose(&state->listenSock)); NCCLCHECK(ncclSocketClose(&state->ringSendSocket)); NCCLCHECK(ncclSocketClose(&state->ringRecvSocket)); free(state->peerCommAddresses); free(state->peerProxyAddresses); free(state); return ncclSuccess; } nccl-2.18.3-1/src/channel.cc000066400000000000000000000160501444201535000153700ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "channel.h" #include "param.h" #include "gdrwrap.h" ncclResult_t initChannel(struct ncclComm* comm, int channelId) { struct ncclChannel* channel = &comm->channels[channelId]; if (channel->id != -1) return ncclSuccess; int nRanks = comm->nRanks; int nPeers = nRanks + 1 /* Collnet */ + comm->localRanks /* NVLS */; channel->id = channelId; channel->workFifoSent = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); if (channel->peers == NULL) { // The extra on nRanks+1 is for collnet root (i.e. 
network) // Allocate everything related to sharedRes with ncclCalloc as this can be // shared between communicators hence should not be tied to comm. if (sharedRes->peers[channelId] == NULL) { NCCLCHECK(ncclCalloc(sharedRes->peers + channelId, sharedRes->tpNRanks)); } channel->peers = ncclMemoryStackAlloc(&comm->memPermanent, nPeers); for (int r = 0; r < nRanks; r++) { channel->peers[r] = comm->sharedRes->peers[channelId] + comm->topParentRanks[r]; ncclAtomicRefCountIncrement(&channel->peers[r]->refCount); } } if (channel->devPeers == NULL) { if (sharedRes->devPeers[channelId] == NULL) { NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); } /* channel->devPeers is not shared, so just free it when calling commFree() */ NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); ncclCommPushCudaFree(comm, channel->devPeers); for (int r = 0; r < nRanks; r++) { uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); } } channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); ncclCommPushCudaFree(comm, channel->devRingUserRanks); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; if (channel->nvlsPeers != NULL) return ncclSuccess; if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); if (share) { channel->nvlsPeers = parent->channels[channelId].nvlsPeers; channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers; for (int r = 0; r < comm->localRanks; ++r) { int tr = comm->topParentLocalRanks[r]; uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); } } else { NCCLCHECK(ncclCalloc(&channel->nvlsPeers, comm->localRanks)); NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, comm->localRanks, sharedRes->deviceStream.cudaStream)); for (int r = 0; r < comm->localRanks; ++r) { uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); } } NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; uintptr_t addr; if (channel->collnetPeers != NULL) return ncclSuccess; if (channel->id == -1) 
NCCLCHECK(initChannel(comm, channelId)); NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); if (share) { channel->collnetPeers = parent->channels[channelId].collnetPeers; channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); } else { NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); addr = (uintptr_t)channel->collnetDevPeers; channel->peers[comm->nRanks] = channel->collnetPeers; NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); } NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) { int nPeers = nRanks + collnetNRanks + nvlsNRanks; /* channel peers are only valid when async init thread completes commAlloc() and * the channel is intialized with initChannel(); if either is not done, this channel * should never be free. */ if (channel->id == -1 || channel->peers == NULL) return ncclSuccess; // Free transport proxy resources // Note: free all send resources first due to CollNet arrangement for (int r = 0; r < nPeers; r++) { struct ncclChannelPeer* peer = channel->peers[r]; if (peer) { if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) { for (int b=0; bsend[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); } if (r == nRanks) { free(channel->collnetPeers); ncclCudaFree(channel->collnetDevPeers); } else if (r == nPeers - 1) { free(channel->nvlsPeers); ncclCudaFree(channel->nvlsDevPeers); } } } } return ncclSuccess; } nccl-2.18.3-1/src/collectives/000077500000000000000000000000001444201535000157635ustar00rootroot00000000000000nccl-2.18.3-1/src/collectives/all_gather.cc000066400000000000000000000022301444201535000203710ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "collectives.h" NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { // Just pass the size of one message and not the total bytes sent/received. 
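// The NVTX payload schema below has a single "message size" entry, and
// msgsize is the per-rank byte count, i.e. sendcount * ncclTypeSize(datatype).
// For example, with sendcount = 1M elements of ncclFloat (4 bytes each), the
// recorded size is 4 MB per rank, independent of the number of ranks.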
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} }; size_t msgsize = sendcount * ncclTypeSize(datatype); NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; return ncclEnqueueCheck(&info); } nccl-2.18.3-1/src/collectives/all_reduce.cc000066400000000000000000000025571444201535000204020ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "nccl.h" NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { struct NvtxParamsAllReduce { size_t bytes; ncclRedOp_t op; }; // Just pass the size of one message and not the total bytes sent/received. static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsAllReduce, op)} }; NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; return ncclEnqueueCheck(&info); } nccl-2.18.3-1/src/collectives/broadcast.cc000066400000000000000000000032131444201535000202330ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "collectives.h" NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { struct NvtxParamsBroadcast { size_t bytes; int root; }; constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} }; NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; return ncclEnqueueCheck(&info); } /* Deprecated original "in place" function, similar to MPI */ NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); } nccl-2.18.3-1/src/collectives/device/000077500000000000000000000000001444201535000172225ustar00rootroot00000000000000nccl-2.18.3-1/src/collectives/device/Makefile000066400000000000000000000045751444201535000206750ustar00rootroot00000000000000# # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # include ../../../makefiles/common.mk include ../../../makefiles/version.mk BUILDDIR ?= $(abspath ../../../build) OBJDIR := $(BUILDDIR)/obj/collectives/device LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu LIBSRCFILES += functions.cu DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) DEPENDFILES:= $(DEPFILES:%.d=%.dep) STATICLIB := $(OBJDIR)/colldevice.a DEVOBJ := $(OBJDIR)/devlink.o RULESFILE := $(OBJDIR)/Makefile.rules NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" all: $(STATICLIB) # Dummy rule so that the extra dependency (%.dep) files are preserved by make all_deps: $(DEPENDFILES) # Auto-generating the rules per op/reduction/datatype/algorithm $(RULESFILE) : gen_rules.sh @printf "Generating %-35s > %s\n" rules $@ @mkdir -p $(OBJDIR) @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ -include $(RULESFILE) LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o -include $(DEPFILES) $(STATICLIB): $(LIBOBJ) $(DEVOBJ) @printf "Archiving %-35s > %s\n" objects $@ ar cr $@ $^ # We do not want make to build *.d when running make clean. # So we only provide targets for .dep which will produce .dep and .d, # with only .d being included, and .dep keeping track of what needs to # be regenerated. $(OBJDIR)/%.dep : %.cu @mkdir -p $(OBJDIR) @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@ @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $@ @rm -f $@.tmp @cp $@ $(@:.dep=.d) # Compiled kernels and collectives with relocatable device code ... 
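# The explicit rules below compile each .cu with `nvcc -dc`, so the objects
# contain relocatable device code; $(DEVOBJ) is then produced with
# `nvcc -dlink`, resolving device symbols across all of $(LIBOBJ) into a
# single devlink.o. The archive rule above packs both the per-file objects
# and devlink.o into colldevice.a. As a rough sketch of the equivalent
# command sequence (not the literal recipes):
#   nvcc -dc    all_reduce.cu -o all_reduce.o
#   nvcc -dlink all_reduce.o functions.o ... -o devlink.o
#   ar cr colldevice.a all_reduce.o functions.o ... devlink.o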
$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` $(NVCC) $(NVCUFLAGS) -dc $< -o $@ $(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` $(NVCC) $(NVCUFLAGS) -dc $< -o $@ # ... and create the device-side linked object with all those. $(DEVOBJ) : $(LIBOBJ) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ clean: rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB) nccl-2.18.3-1/src/collectives/device/all_gather.cu000066400000000000000000000005501444201535000216550ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "all_gather.h" #include "common.h" #include "collectives.h" IMPL_COLL_C(AllGather); nccl-2.18.3-1/src/collectives/device/all_gather.h000066400000000000000000000127321444201535000215020ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*int(chunkSize); const ssize_t size = args->count; T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 1, Proto, 0> prims (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; if (Proto::Id == NCCL_PROTO_SIMPLE) { realChunkSize = min(chunkSize, divUp(size-gridOffset,nChannels)); realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); realChunkSize = int(realChunkSize); ssize_t chunkOffset = gridOffset + int(bid*realChunkSize); /////////////// begin AllGather steps /////////////// ssize_t offset; int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ringRanks[0]; offset = chunkOffset + rankDest * size; if (inputBuf + chunkOffset == outputBuf + offset) { // In place prims.directSend(chunkOffset, offset, nelem); } else { prims.directCopySend(chunkOffset, offset, nelem); } // k-2 steps: copy to next GPU for (int j=1; j struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { const int tid = threadIdx.x; const int bid = args->bid; const int nChannels = args->nChannels; struct ncclNvls* nvls = &ncclShmem.channel.nvls; const ssize_t chunkSize = int(args->lastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*chunkSize; const int nThreadsGather = 128; const int nThreadsBcast = 384 + WARP_SIZE; const int tidEndGather = nThreadsGather; const int tidEndBcast = tidEndGather + nThreadsBcast; using Proto = ProtoSimple<1, 1>; if (tid < tidEndGather) { // Gather Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0); } } else if (tid < tidEndBcast) { // Bcast through NVLS Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL, args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } } } }; nccl-2.18.3-1/src/collectives/device/all_reduce.cu000066400000000000000000000005501444201535000216520ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "all_reduce.h" #include "common.h" #include "collectives.h" IMPL_COLL_R(AllReduce); nccl-2.18.3-1/src/collectives/device/all_reduce.h000066400000000000000000000740331444201535000215010ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1)); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*nranks*chunkSize; const ssize_t size = args->count; int minChunkSize; if (Proto::Id == NCCL_PROTO_LL) minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T)); if (Proto::Id == NCCL_PROTO_LL128) { // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2; } Primitives, 1, Proto, 0> prims (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; if (Proto::Id == NCCL_PROTO_SIMPLE) { realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks)); realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize); realChunkSize = int(realChunkSize); auto calcOffset = [&]__device__(int chunk)->ssize_t { if (Proto::Id == NCCL_PROTO_SIMPLE) return gridOffset + bid*nranks*realChunkSize + chunk*realChunkSize; else return gridOffset + (chunk*nChannels + bid)*realChunkSize; }; auto modRanks = [&]__device__(int r)->int { return r - (r >= nranks ? nranks : 0); }; ssize_t offset; int nelem; int chunk; // step 0: push data to next GPU chunk = modRanks(ringIx + nranks-1); offset = calcOffset(chunk); nelem = min(realChunkSize, size-offset); prims.send(offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; ssize_t chunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize /* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T)); const ssize_t minChunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? 
(nthreads-2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T)) /* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = int(nChannels*chunkSize); const ssize_t size = args->count; if (loopSize > size) chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize); { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) Primitives, /*Direct=*/0, Proto, 0> prims (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); } } else if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.recvReduceSend(offset, nelem); } } } { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> prims (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directSendFromOutput(offset, nelem); } } else if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecv(offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecvCopySend(offset, nelem); } } } } template __device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; ssize_t chunkSize = int( Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize : Proto::calcBytePerStep()/sizeof(T)); const ssize_t minChunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads - 2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T)) : Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T)) /* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8); const ssize_t loopSize = int(nChannels*chunkSize); const ssize_t size = args->count; int nthreadsSplit; if (Proto::Id == NCCL_PROTO_SIMPLE) { nthreadsSplit = nthreads/2; if (nthreadsSplit >= 256) nthreadsSplit += 64; } else { // LL & LL128 // Receiving from up to 3 sources is more compute intensive than sending // to 3 dests. Use 70% for reduce and 30% for bcast. nthreadsSplit = (nthreads*7/(10*WARP_SIZE))*WARP_SIZE; } if (loopSize > size) chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize); if (tree->up == -1) { // Reduce and broadcast. 
Max number of recv is 2, max number of send is 2 Primitives, /*Direct=*/1, Proto, 0> prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true); } } else if (tid < nthreadsSplit) { /* Reduce up. Max number of recv is 3, max number of send is 1 (binary tree + local). * Why Direct=1???? * Answer: Because despite not performing any direct operations, the ctor * must assume Direct so that it can exchange direct pointers with remote ctors * that are Direct, otherwise it hangs. A cleaner solution would be to seperate * into DirectRecv and DirectSend capabilities, this ctor would have both=0, * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. */ Primitives, /*Direct=*/1, Proto, 0> prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.recvReduceSend(offset, nelem); } } } else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) Primitives, /*Direct=*/1, Proto, 0> prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecv(offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecvCopySend(offset, nelem); } } } } } template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { #if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800 runTreeUpDown>(args); #else runTreeSplit>(args); #endif } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { static constexpr int COLLNET_COPY_THREADS = 96; const int tid = threadIdx.x; const int bid = args->bid; const int nChannels = args->nChannels; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; const ssize_t chunkSize = int(args->lastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*direct->nHeads*chunkSize; const int hasUp = (direct->up[0] >= 0) ? 1 : 0; const int hasDn = (direct->down[0] >= 0) ? 1 : 0; const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0); const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0); const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 
0 : 2*COLLNET_COPY_THREADS); const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; using Proto = ProtoSimple<1, 1>; if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff, args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); if (args->regUsed) { prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } else { prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } } } else if (tid >= tidStartReduce && direct->out != -1) { if (hasDn) { // Reduce, send to network Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); if (args->regUsed) { prims.directRecvReduceSend(offset, nelem); } else { prims.recvReduceSend(offset, nelem); } } } else { // Directly send to network Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff, args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } } } else if (tid < tidStartBcast && hasUp) { // Gather Primitives, /*Direct=*/1, Proto, 0> prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; int nelem = min(direct->nHeads*chunkSize, size-offset); prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); } } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) { if (hasDn) { // Recv from network, broadcast Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true); } } else { // Recv from network (no post thread needed) Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recv(offset, nelem, 
/*postOp=*/true); } } } } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { #if NCCL_NVLS_ENABLED const int tid = threadIdx.x; const int bid = args->bid; const int nChannels = args->nChannels; struct ncclNvls* nvls = &ncclShmem.channel.nvls; const ssize_t chunkSize = int(args->lastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize; const int nranks = ncclShmem.comm.nRanks; const bool hasOut = nvls->out != -1; const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5; const int bcastWarps = hasOut ? 2 : 0; const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2; const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2; const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; const int nThreadsReduce = reduceWarps*WARP_SIZE; const int nThreadsBcast = (bcastWarps)*WARP_SIZE; const int tidEndScatter = nThreadsScatter; const int tidEndGather = tidEndScatter + nThreadsGather; const int tidEndReduce = tidEndGather + nThreadsReduce; const int tidEndBcast = tidEndReduce + nThreadsBcast; if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; int nelem = min(nvls->nHeads*chunkSize, size-offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; int nelem = min(nvls->nHeads*chunkSize, size-offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { if (!hasOut) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvSend(nelem); } } else { // Reduce, send to network using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvSend(nelem); } } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t 
offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvSend(nelem); } } #endif // NCCL_NVLS_ENABLED } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { #if NCCL_NVLS_ENABLED const int tid = threadIdx.x; const int bid = args->bid; const int nChannels = args->nChannels; struct ncclNvls* nvls = &ncclShmem.channel.nvls; const int treeUp = nvls->treeUp; const int* treeDown = nvls->treeDown; const ssize_t chunkSize = int(args->lastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize; const int nranks = ncclShmem.comm.nRanks; const bool hasUp = treeUp != -1; const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5; const int bcastWarps = hasUp ? 4 : 0; const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2; const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2; const int nThreadsScatter = scatterWarps*WARP_SIZE; const int nThreadsGather = gatherWarps*WARP_SIZE; const int nThreadsReduce = reduceWarps*WARP_SIZE; const int nThreadsBcast = (bcastWarps)*WARP_SIZE; const int tidEndScatter = nThreadsScatter; const int tidEndGather = tidEndScatter + nThreadsGather; const int tidEndReduce = tidEndGather + nThreadsReduce; const int tidEndBcast = tidEndReduce + nThreadsBcast; if (tid < tidEndScatter) { // Scatter using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; int nelem = min(nvls->nHeads*chunkSize, size-offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize; int nelem = min(nvls->nHeads*chunkSize, size-offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { if (!hasUp) { // Reduce and Broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL, args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvSend(nelem); } } else { // Reduce, send to network using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL, args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvSend(nelem); } } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndReduce, 
nThreadsBcast, &treeUp, treeDown, NULL, NULL, args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); prims.recvSend(nelem); } } #endif // NCCL_NVLS_ENABLED } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.collnetChain; ssize_t chunkSize = int(args->lastChunkSize); const ssize_t loopSize = int(nChannels*chunkSize); const int nranks = ncclShmem.comm.nRanks; const ssize_t size = args->count; int nthreadsSplit = nthreads/2; if (nthreadsSplit >= 256) nthreadsSplit += 64; int group, connIndex, send, recv, groupTid, groupNthreads; using Proto = ProtoSimple<1, 1>; if (tid < nthreadsSplit) { // Reduce up the chain group = 0; connIndex = 1; recv = tree->down[0]; send = tree->up; groupTid = tid; groupNthreads = nthreadsSplit; } else { // Broadcast down the chain group = 1; connIndex = 0; recv = tree->up; send = tree->down[0]; groupTid = tid - nthreadsSplit; groupNthreads = nthreads-nthreadsSplit; } Primitives, /*Direct=*/1, Proto, 0> prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff, args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex); if (tid < nthreadsSplit) { if (recv == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.send(offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.recvReduceSend(offset, nelem); } } } else { if (recv == nranks) { // I'm the first in the broadcast chain, I need to perform the division (postOp) if (send == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.recv(offset, nelem, /*postOp*/true); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.recvCopyDirectSend(offset, nelem, /*postOp*/true); } } } else { if (send == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecv(offset, nelem); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); prims.directRecvCopySend(offset, nelem); } } } } } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runTreeSplit(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runTreeSplit(args); } }; 
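// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of NCCL): the ring all-reduce in
// runRing above proceeds in two phases per outer chunk loop: a reduce-scatter
// phase (one send, then recvReduceSend steps, then a pivot step that produces
// the fully reduced chunk) and an all-gather phase (recvCopySend steps and a
// final recv). The host-side helper below only prints that schedule for one
// rank so the modRanks(ringIx + nranks - j) chunk indexing is easier to follow.
// The function name printRingAllReduceSchedule is hypothetical; the step names
// follow the pattern begun above and the standard ring all-reduce, not
// necessarily the exact primitive variants the kernel calls.
#if 0  // example only; not compiled into the library
#include <cstdio>

static void printRingAllReduceSchedule(int ringIx, int nranks) {
  // Mirrors the modRanks lambda in runRing: wrap r back into [0, nranks).
  auto modRanks = [&](int r) { return r - (r >= nranks ? nranks : 0); };

  // Reduce-scatter phase.
  std::printf("step 0: send chunk %d\n", modRanks(ringIx + nranks - 1));
  for (int j = 2; j < nranks; ++j)  // nranks-2 steps
    std::printf("reduce step: recvReduceSend chunk %d\n", modRanks(ringIx + nranks - j));
  std::printf("pivot step: recvReduceCopySend chunk %d (final value for this chunk)\n", ringIx);

  // All-gather phase.
  for (int j = 1; j < nranks - 1; ++j)  // nranks-2 steps
    std::printf("copy step: recvCopySend chunk %d\n", modRanks(ringIx + nranks - j));
  std::printf("last step: recv chunk %d\n", modRanks(ringIx + 1));
}
#endif
// ---------------------------------------------------------------------------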
nccl-2.18.3-1/src/collectives/device/broadcast.cu000066400000000000000000000005471444201535000215230ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "broadcast.h" #include "common.h" #include "collectives.h" IMPL_COLL_C(Broadcast); nccl-2.18.3-1/src/collectives/device/broadcast.h000066400000000000000000000062111444201535000213350ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; const int rank = ring->userRanks[0]; const int nextRank = ring->userRanks[1]; const int root = args->root; T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; if (Proto::Id == NCCL_PROTO_SIMPLE) { realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); realChunkSize = int(realChunkSize); ssize_t offset = gridOffset + int(bid*realChunkSize); int nelem = min(realChunkSize, size-offset); if (rank == root) { if (inputBuf == outputBuf) { prims.send(offset, nelem); } else { prims.copySend(offset, offset, nelem); } } else if (nextRank == root) { prims.recv(offset, nelem); } else { prims.recvCopySend(offset, nelem); } } } } template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; nccl-2.18.3-1/src/collectives/device/common.h000066400000000000000000000256451444201535000206770ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_DEVICE_COMMON_H_ #define NCCL_DEVICE_COMMON_H_ #include "collectives.h" #include "devcomm.h" #include "op128.h" #define COLL_UNROLL (ncclCollUnroll()) typedef void(*ncclKern_t)(); extern __device__ ncclKern_t ncclFuncs[]; struct ncclShmemGroup { ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY]; ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY]; void* srcs[NCCL_MAX_NVLS_ARITY+1]; void* dsts[NCCL_MAX_NVLS_ARITY+1]; }; struct ncclShmemData { struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1]; int channelId; int aborted; alignas(16) struct ncclDevComm comm; alignas(16) struct ncclDevChannel channel; alignas(16) struct ncclWork work; }; static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); extern __shared__ ncclShmemData ncclShmem; #if __CUDA_ARCH__ >= 700 extern __shared__ ulong2 ncclShmemPerWarp[/*ncclShmemDynamicSize()/sizeof(ulong2)*/]; #else extern __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; #endif __device__ inline void* ncclScratchForWarp(int warp) { return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize(); } __device__ inline bool barrierReduceAny(int bit) { uint32_t popc; asm ("{" ".reg .pred barr_pred;" "setp.eq.u32 barr_pred, %1, 1;" "bar.red.popc.u32 %0, 2, barr_pred;" "}" : "=r"(popc) : "r"(bit)); return popc != 0; } // Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads. inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) { int offset = 16*tid; if (offset < bytes) { uint64_t a=0, b=0; asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset)); asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b)); } } template struct RunWorkElement { __device__ void run(ncclWorkElem*) { // Put NOT IMPLEMENTED behavior here. } }; template struct RunWork { // This __forceinline__ is necessary. The compiler was inserting a function call // here from the LL ncclKernel. __device__ __forceinline__ void run(ncclWork *w) { int wid = threadIdx.x / WARP_SIZE; ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0]; int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem); #pragma unroll 1 while ((char*)we + stride <= (char*)(w+1) && we->isUsed) { if (wid < we->nWarps) { RunWorkElement().run(we); } we = (ncclWorkElem*)((char*)we + stride); } } }; static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { if (we->isUsed && we->redOpArgIsPtr) { /* redOpArg is a pointer to the scalar value, so we'll dereference it * here so that redOpArg holds the bits of the scalar going forward. * The tricky thing is we don't know its type T since that's encoded in * the funcIndex. Because it would be difficult to get sizeof(T) from * funcIndex, we'll cheat and just dereference the largest possible size * given the alignment of the pointer. We might be reading in more bytes * than we need but that's harmless. 
*/ if (we->redOpArg%2 != 0) we->redOpArg = *reinterpret_cast(we->redOpArg); else if (we->redOpArg%4 != 0) we->redOpArg = *reinterpret_cast(we->redOpArg); else if (we->redOpArg%8 != 0) we->redOpArg = *reinterpret_cast(we->redOpArg); else we->redOpArg = *reinterpret_cast(we->redOpArg); } } template __device__ void ncclKernel( struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead ) { int tid = threadIdx.x; // To map blockId to channelId, we need the n'th set bit of channelMask which // is the inverse of counting the number of set bits among the the first n. if (tid < WARP_SIZE) { int x = tid; if (channelMask & (1ull<channels[channelId]; bytes = sizeof(ncclDevChannel); static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); break; case 2: dst = &ncclShmem.work; src = workHead + blockIdx.x; bytes = sizeof(ncclWork); static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn."); break; default: bytes = 0; break; } copyToShmem16(tid%WARP_SIZE, dst, src, bytes); } __syncthreads(); // publish ncclShmem while (true) { // Notify host that all fifo reads are complete. if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) { *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks; } __syncwarp(); if (ncclShmem.work.header.type == ncclWorkTypeColl) { if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]); } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) { if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem); } __syncthreads(); if (ncclShmem.work.header.funcIndex == FnIndex) { RunWork().run(&ncclShmem.work); } else { ncclFuncs[ncclShmem.work.header.funcIndex](); } int workIxNext = ncclShmem.work.header.workNext; __syncthreads(); if (ncclShmem.work.header.isLast) break; copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork)); { // Check whether the last operation was aborted and make sure all threads exit int aborted = tid == 0 ? 
*comm->abortFlag : 0; if (barrierReduceAny(aborted)) // publish ncclShmem.work break; } } } // Only generate kernels for SUM #if NCCL_OP == 0 #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \ struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \ ) { \ ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \ (comm, channelMask, workHead); \ } #else #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded) #endif // Examples : AllReduce, RING, LL, Sum, uint8 #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ } // Only generate inline kernels for LL #define IMPL_COLL4(func, algo, devredop, type, ncclType) \ IMPL_COLL_FUNC(func, algo, LL, devredop, type) \ IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \ IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \ IMPL_COLL_KERN(func, algo, LL, devredop, type, FUNC_INDEX(ncclFunc##func, ncclDev##devredop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \ #define IMPL_COLL3(func, devredop, type, ncclType) \ IMPL_COLL4(func, TREE, devredop, type, ncclType) \ IMPL_COLL4(func, RING, devredop, type, ncclType) \ IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \ IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \ IMPL_COLL4(func, NVLS, devredop, type, ncclType) \ IMPL_COLL4(func, NVLS_TREE, devredop, type, ncclType) #if NCCL_TYPE == 0 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t, ncclInt8) #elif NCCL_TYPE == 1 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint8_t, ncclUint8) #elif NCCL_TYPE == 2 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int32_t, ncclInt32) #elif NCCL_TYPE == 3 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint32_t, ncclUint32) #elif NCCL_TYPE == 4 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int64_t, ncclInt64) #elif NCCL_TYPE == 5 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint64_t, ncclUint64) #elif NCCL_TYPE == 6 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, half, ncclFloat16) #elif NCCL_TYPE == 7 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, float, ncclFloat32) #elif NCCL_TYPE == 8 #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, double, ncclFloat64) #elif NCCL_TYPE == 9 && defined(__CUDA_BF16_TYPES_EXIST__) #define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, __nv_bfloat16, ncclBfloat16) #endif // Reduction define all functions #if NCCL_OP == 0 #define IMPL_COLL_R(func) IMPL_COLL2(func, Sum); #elif NCCL_OP == 1 #define IMPL_COLL_R(func) IMPL_COLL2(func, Prod); #elif NCCL_OP == 2 #define IMPL_COLL_R(func) IMPL_COLL2(func, Min); #elif NCCL_OP == 3 #define IMPL_COLL_R(func) IMPL_COLL2(func, Max); #elif NCCL_OP == 4 #define IMPL_COLL_R(func) IMPL_COLL2(func, PreMulSum); #elif NCCL_OP == 5 #if NCCL_TYPE < 6 #define IMPL_COLL_R(func) IMPL_COLL2(func, SumPostDiv); #else #define IMPL_COLL_R(func) // skip SumPostDiv for floating point #endif #endif #if NCCL_OP == 0 && NCCL_TYPE == 0 // Copy primitives only define one function for copy #define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8); // Point-to-point primitives only have one function/kernel. 
#define IMPL_COLL_P(func) \ IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \ IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P); #else #define IMPL_COLL_C(func) #define IMPL_COLL_P(func) #endif #define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP)) #endif nccl-2.18.3-1/src/collectives/device/common_kernel.h000066400000000000000000000224021444201535000222230ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ #include "devcomm.h" #include "op128.h" #include "reduce_kernel.h" #include #include #include // Define min for ssize_t inline __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; } inline __device__ int loadInt(int* ptr) { int v; asm volatile("ld.volatile.global.u32 %0, [%1];" : "=r"(v) : "l"(ptr)); return v; } template __device__ __forceinline__ void reduceCopyPacks( int nThreads, int &thread, uint64_t redArg, uint64_t *preOpArgs, bool postOp, int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs, IntBytes &nBytesBehind, IntBytes &nBytesAhead ) { static_assert(std::is_signed::value, "IntBytes must be a signed integral type."); if (BytePerPack == 0) __trap(); // A hunk is the amount of contiguous data a warp consumes per loop iteration // assuming all threads partake. constexpr int BytePerHunk = Unroll*WARP_SIZE*BytePerPack; int nWarps = nThreads/WARP_SIZE; int warp = thread/WARP_SIZE; int lane = thread%WARP_SIZE; // This thread's initial position. IntBytes threadBytesBehind = nBytesBehind + (warp*BytePerHunk + lane*BytePerPack); IntBytes threadBytesAhead = nBytesAhead - (warp*BytePerHunk + lane*BytePerPack); // Number of hunks to be consumed over all warps. IntBytes nHunksAhead = nBytesAhead/(BytePerHunk + !BytePerHunk); // Advance collective position. nBytesBehind += nHunksAhead*BytePerHunk; nBytesAhead -= nHunksAhead*BytePerHunk; if (Unroll==1 && BytePerPack <= nBytesAhead) { // Only Unroll=1 can do partial hunks (where not all threads partake). nHunksAhead += 1; nBytesBehind += nBytesAhead - (nBytesAhead%(BytePerPack + !BytePerPack)); nBytesAhead = nBytesAhead%(BytePerPack + !BytePerPack); } nHunksAhead -= warp; RedFn redFn(redArg); uintptr_t minSrcs[MinSrcs + !MinSrcs]; uintptr_t minDsts[MinDsts + !MinDsts]; #pragma unroll for (int s=0; s < MinSrcs; s++) minSrcs[s] = cvta_to_global(srcPtrs[s]) + threadBytesBehind; #pragma unroll for (int d=0; d < MinDsts; d++) minDsts[d] = cvta_to_global(dstPtrs[d]) + threadBytesBehind; // We dictate loop termination condition according to whether partial hunks // can be handled or not. while (Unroll==1 ? (BytePerPack <= threadBytesAhead) : (0 < nHunksAhead)) { BytePack acc[Unroll]; { RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (0 < MultimemSrcs) { // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. acc[u] = applyLoadMultimem(preFn, minSrcs[0]); } else { // Use volatile loads in case credits are polled for with volatile (instead of acquire). acc[u] = ld_volatile_global(minSrcs[0]); } minSrcs[0] += WARP_SIZE*BytePerPack; if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]); } } #pragma unroll (MinSrcs-1 + !(MinSrcs-1)) for (int s=1; s < MinSrcs; s++) { BytePack tmp[Unroll]; RedFn preFn(s < PreOpSrcs ? 
preOpArgs[s] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (s < MultimemSrcs) { // applyLoadMultimem uses relaxed semantics for same reason we use volatile below. acc[u] = applyLoadMultimem(preFn, minSrcs[s]); } else { // Use volatile loads in case credits are polled for with volatile (instead of acquire). tmp[u] = ld_volatile_global(minSrcs[s]); } minSrcs[s] += WARP_SIZE*BytePerPack; } #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]); acc[u] = applyReduce(redFn, acc[u], tmp[u]); } } for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) { uintptr_t src = cvta_to_global(srcPtrs[s]) + threadBytesBehind; BytePack tmp[Unroll]; RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { // Use volatile loads in case credits are polled for with volatile (instead of acquire). tmp[u] = ld_volatile_global(src); src += WARP_SIZE*BytePerPack; } #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]); acc[u] = applyReduce(redFn, acc[u], tmp[u]); } } if (postOp) { #pragma unroll Unroll for (int u=0; u < Unroll; u++) acc[u] = applyPostOp(redFn, acc[u]); } #pragma unroll (MinDsts + !MinDsts) for (int d=0; d < MinDsts; d++) { #pragma unroll Unroll for (int u=0; u < Unroll; u++) { if (d < MultimemDsts) { multimem_st_global(minDsts[d], acc[u]); } else { st_global(minDsts[d], acc[u]); } minDsts[d] += WARP_SIZE*BytePerPack; } } for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) { uintptr_t dst = cvta_to_global(dstPtrs[d]) + threadBytesBehind; #pragma unroll Unroll for (int u=0; u < Unroll; u++) { st_global(dst, acc[u]); dst += WARP_SIZE*BytePerPack; } } nWarps = nThreads/WARP_SIZE; #pragma unroll for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk; #pragma unroll for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk; threadBytesBehind += nWarps*BytePerHunk; threadBytesAhead -= nWarps*BytePerHunk; nHunksAhead -= nWarps; } nWarps = nThreads/WARP_SIZE; warp = thread/WARP_SIZE; lane = thread%WARP_SIZE; // The last loop iteration could have been partial, i.e. not taken by all // threads. The threads that weren't included need an extra subtraction to // make the value warp uniform. if (Unroll==1 && nHunksAhead > 0) nHunksAhead -= nWarps; // Rotate warps so the warp which got the least work here will be warp 0. // This effectively assigns: warp = (warp-nHunks+nWarps)%nWarps; warp = -nHunksAhead; thread = warp*WARP_SIZE + lane; } template __device__ __forceinline__ void reduceCopy( int thread, int nThreads, uint64_t redArg, uint64_t *preOpArgs, bool postOp, int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs, IntBytes nElts ) { static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values."); //int nWarps = nThreads/WARP_SIZE; //int warp = thread/WARP_SIZE; int lane = thread%WARP_SIZE; // If a multimem src is present then our biggest pack size is limited to what // is supported for this redfn/type. constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize::BigPackSize; IntBytes nBytesBehind = 0; IntBytes nBytesAhead = nElts*sizeof(T); #if __cpp_if_constexpr if constexpr (BigPackSize > sizeof(T)) { #else if (BigPackSize > sizeof(T)) { #endif // Check that all pointers are BigPackSize aligned. 
bool aligned = true; if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize); if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize); aligned = __all_sync(~0u, aligned); if (aligned) { reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); if (nBytesAhead == 0) return; reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); if (nBytesAhead == 0) return; } } reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); if (nBytesAhead == 0) return; reduceCopyPacks (nThreads, /*&*/thread, redArg, preOpArgs, postOp, nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead); } #endif // COMMON_KERNEL_H_ nccl-2.18.3-1/src/collectives/device/functions.cu000066400000000000000000000120001444201535000215540ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "common.h" __shared__ ncclShmemData ncclShmem; #if __CUDA_ARCH__ < 700 __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)]; #endif #define NCCL_FUNC5(func, algo, devredop, type, nullify) \ MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \ MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) #define NCCL_FUNC4(func, devredop, type, nullify) \ NCCL_FUNC5(func, TREE, devredop, type, nullify), \ NCCL_FUNC5(func, RING, devredop, type, nullify), \ NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \ NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \ NCCL_FUNC5(func, NVLS, devredop, type, nullify), \ NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify) #if defined(__CUDA_BF16_TYPES_EXIST__) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(func, devredop, nullForFloat) \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, uint8_t, 0), \ NCCL_FUNC4(func, devredop, int32_t, 0), \ NCCL_FUNC4(func, devredop, uint32_t, 0), \ NCCL_FUNC4(func, devredop, int64_t, 0), \ NCCL_FUNC4(func, devredop, uint64_t, 0), \ NCCL_FUNC4(func, devredop, half, nullForFloat), \ NCCL_FUNC4(func, devredop, float, nullForFloat), \ NCCL_FUNC4(func, devredop, double, nullForFloat), \ NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat) #define NCCL_FUNCS3B(func, devredop) \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0) #else // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(func, devredop, nullForFloat) \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, uint8_t, 0), \ NCCL_FUNC4(func, devredop, int32_t, 0), \ NCCL_FUNC4(func, devredop, uint32_t, 0), \ 
NCCL_FUNC4(func, devredop, int64_t, 0), \ NCCL_FUNC4(func, devredop, uint64_t, 0), \ NCCL_FUNC4(func, devredop, half, nullForFloat), \ NCCL_FUNC4(func, devredop, float, nullForFloat), \ NCCL_FUNC4(func, devredop, double, nullForFloat) #define NCCL_FUNCS3B(func, devredop) \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0), \ NCCL_FUNC4(func, devredop, int8_t, 0) #endif // Must be consistent with ncclRedOp_t #define NCCL_FUNCS2A(func) \ NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \ NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \ NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \ NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \ NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \ NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1) #define NCCL_FUNCS2B(func) \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum), \ NCCL_FUNCS3B(func, Sum) // Must be consistent with the ncclFuncSet enum __device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. #if __CUDA_ARCH__ NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t), NCCL_ONERANK_REDUCE_NAME(PreMulSum, half), NCCL_ONERANK_REDUCE_NAME(PreMulSum, float), NCCL_ONERANK_REDUCE_NAME(PreMulSum, double), #if defined(__CUDA_BF16_TYPES_EXIST__) NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16), #endif NCCL_FUNCS2B(Broadcast), NCCL_FUNCS2A(Reduce), NCCL_FUNCS2B(AllGather), NCCL_FUNCS2A(ReduceScatter), NCCL_FUNCS2A(AllReduce) #endif }; // Workaround for https://reviews.llvm.org/D55580 __device__ void ncclWorkaroundClangD55580() {} nccl-2.18.3-1/src/collectives/device/gen_rules.sh000077500000000000000000000024151444201535000215460ustar00rootroot00000000000000#!/bin/bash # # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # dir=$1 datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64" if [ "$CUDA_MAJOR" -ge 11 ] then datatypes+=" bf16" fi targets="GENOBJS := \\\\\n" for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do opn=0 for op in sum prod min max premulsum sumpostdiv; do dtn=0 # Order must match that of the ncclDataType_t enum for dt in ${datatypes}; do # Generate a unique filename for each compilation unit, # otherwise the __nv_module_id may conflict at link time echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu" echo " @printf \"Copying %-35s > %s\\\\n\" \$< \$@" echo " cp \$< \$@" echo "" # Compile the file echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep" echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" echo " mkdir -p ${dir}" echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@" echo "" targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" dtn=$(($dtn + 1)) done opn=$(($opn + 1)) done done echo -e "$targets" nccl-2.18.3-1/src/collectives/device/onerank_reduce.cu000066400000000000000000000041011444201535000225330ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "common_kernel.h" #include "common.h" namespace { template __device__ __forceinline__ void oneRankReduce() { ncclWork *w = &ncclShmem.work; int tid = threadIdx.x; int tn = blockDim.x; #pragma unroll 1 for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) { ncclWorkElem *we = &w->elems[e]; intptr_t eltN = we->count; int bid = we->bid; int bn = we->nChannels; T const *src = (T const*)we->sendbuff; T *dst = (T*)we->recvbuff; // each block/channel gets a roughly equal segment of 16 byte packs constexpr int EltPerPack = 16/sizeof(T); intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack; intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn); intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn); i0 *= EltPerPack; i0 = i0 < eltN ? i0 : eltN; i1 *= EltPerPack; i1 = i1 < eltN ? i1 : eltN; src += i0; dst += i0; void *vsrc = (void*)src; void *vdst = (void*)dst; reduceCopy (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0); } } } #define INSTANTIATE(devredop, type) \ __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ oneRankReduce>(); \ } INSTANTIATE(PreMulSum, int8_t) INSTANTIATE(PreMulSum, uint8_t) INSTANTIATE(PreMulSum, int32_t) INSTANTIATE(PreMulSum, uint32_t) INSTANTIATE(PreMulSum, int64_t) INSTANTIATE(PreMulSum, uint64_t) INSTANTIATE(PreMulSum, half) #if defined(__CUDA_BF16_TYPES_EXIST__) INSTANTIATE(PreMulSum, __nv_bfloat16) #endif INSTANTIATE(PreMulSum, float) INSTANTIATE(PreMulSum, double) nccl-2.18.3-1/src/collectives/device/op128.h000066400000000000000000000334731444201535000202560ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef OP128_H_ #define OP128_H_ #include inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v0), "=l"(v1) : "l"(ptr)); } inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" :: "l"(v0), "l"(v1), "l"(ptr)); } inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { uint64_t* shmemAsmPtr; asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); return shmemAsmPtr; } inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); } inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); } template inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) { union { uint32_t tmp4[4]; uint64_t tmp8[2]; }; if(sizeof(T) < 4) { uint32_t *ptr4 = reinterpret_cast(reinterpret_cast(ptr) & -uintptr_t(4)); #pragma unroll for(int e=0; e < 4; e++) { // Produce 4 bytes of sub-register type by reading 2 4-byte // aligned values and shifting. uint32_t lo, hi; asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0)); asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1)); tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast(ptr))%4)); } } else if(sizeof(T) == 4) { #pragma unroll for(int e=0; e < 4; e++) asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e)); } else /*sizeof(T)==8*/ { #pragma unroll for(int e=0; e < 2; e++) asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e)); } v0 = tmp8[0]; v1 = tmp8[1]; } template __device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) { return (uint32_t)__cvta_generic_to_shared(ptr); } template __device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) { return (uintptr_t)__cvta_generic_to_global(ptr); } template __device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) { T* ans; asm("cvta.shared.u64 %0, %1;" : "=l"(ans) : "l"(uint64_t(shptr))); return ans; } template __device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) { T* ans; asm("cvta.global.u64 %0, %1;" : "=l"(ans) : "l"(gptr)); return ans; } //////////////////////////////////////////////////////////////////////////////// // BytePack: struct of bytes. 
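// Editor's note (illustrative, not part of NCCL): BytePack<Size> below is a
// union that lets kernels move Size bytes as a single register-resident value
// while still exposing u8/u16/u32/u64 views of the same bytes, and
// toPack/fromPack reinterpret a typed value as such a pack bit-for-bit.
// A hypothetical use, assuming only the definitions that follow:
//
//   float x = 1.0f;
//   BytePack<4> p = toPack<float>(x);   // p.u32 holds the bits of x (0x3f800000)
//   float y = fromPack<float>(p);       // y == x; no numeric conversion happens
//
// The load/store helpers further down (ld_global<Bytes>, st_global<Bytes>,
// multimem_st_global<Bytes>) all traffic in these packs, so the same code path
// can move anywhere from 1 to 16 bytes per thread.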
template union BytePack; template<> union BytePack<0> {}; template<> union BytePack<1> { uint8_t u8, native; }; template<> union BytePack<2> { BytePack<1> half[2]; uint8_t u8[2]; uint16_t u16, native; }; template<> union BytePack<4> { BytePack<2> half[2]; uint8_t u8[4]; uint16_t u16[2]; uint32_t u32, native; }; template<> union BytePack<8> { BytePack<4> half[2]; uint8_t u8[8]; uint16_t u16[4]; uint32_t u32[2]; uint64_t u64, native; }; template<> union alignas(16) BytePack<16> { BytePack<8> half[2]; uint8_t u8[16]; uint16_t u16[8]; uint32_t u32[4]; uint64_t u64[2]; ulong2 ul2, native; }; template struct BytePackOf { static constexpr int Size = sizeof(T); using Pack = BytePack; }; template<> struct BytePackOf> { static constexpr int Size = 0; using Pack = BytePack<0>; }; template __device__ __forceinline__ typename BytePackOf::Pack toPack(T value) { union { typename BytePackOf::Pack p; T v; }; v = value; return p; } template __device__ __forceinline__ T fromPack(typename BytePackOf::Pack pack) { union { typename BytePackOf::Pack p; T v; }; p = pack; return v; } //////////////////////////////////////////////////////////////////////////////// // Load/store of BytePack using integral addresses. template __device__ BytePack ld_global(uintptr_t addr); template __device__ BytePack ld_volatile_global(uintptr_t addr); template __device__ BytePack ld_shared(uint32_t addr); template __device__ BytePack ld_volatile_shared(uint32_t addr); template __device__ void st_global(uintptr_t addr, BytePack value); template __device__ void st_shared(uint32_t addr, BytePack value); template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; } template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; } template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {} template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {} // Used to define implementations for above prototypes. #define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \ template<> \ __device__ __forceinline__ BytePack ld_##space(addr_cxx_ty addr) { \ data_cxx_ty tmp; \ asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \ BytePack ans; \ ans.native = tmp; \ return ans; \ } \ template<> \ __device__ __forceinline__ BytePack ld_volatile_##space(addr_cxx_ty addr) { \ data_cxx_ty tmp; \ asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \ BytePack ans; \ ans.native = tmp; \ return ans; \ } \ template<> \ __device__ __forceinline__ void st_##space(addr_cxx_ty addr, BytePack value) { \ data_cxx_ty tmp = value.native; \ asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \ } // Single-byte types use 4-byte registers since there is no 1-byte register // character for asm blocks. 
See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l) DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r) DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l) DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r) DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l) DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r) DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l) DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r) #undef DEFINE_ld_st #define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \ template<> \ __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \ BytePack<16> ans; \ asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \ return ans; \ } \ template<> \ __device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \ BytePack<16> ans; \ asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \ return ans; \ } \ template<> \ __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \ asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \ } DEFINE_ld_st_16(global, uintptr_t, l) DEFINE_ld_st_16(shared, uint32_t, r) #undef DEFINE_ld_st_16 //////////////////////////////////////////////////////////////////////////////// // Atomic load/store using c++ pointers. __device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) { uint64_t ans; asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); return ans; } __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #else asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #endif return ans; } __device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) { uint64_t ans; #if __CUDA_ARCH__ >= 700 asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); #else asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr))); #endif return ans; } __device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) { asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); } __device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) { #if __CUDA_ARCH__ >= 700 asm volatile("st.relaxed.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #else asm volatile("st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #endif } __device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) { #if __CUDA_ARCH__ >= 700 asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #else asm volatile("membar.sys; st.volatile.global.u64 [%0], %1;" :: "l"(cvta_to_global(ptr)), "l"(val) : "memory"); #endif } __device__ __forceinline__ void fence_acq_rel_sys() { #if __CUDA_ARCH__ >= 700 asm volatile("fence.acq_rel.sys;" ::: "memory"); #else asm volatile("membar.sys;" ::: "memory"); #endif } __device__ __forceinline__ void fence_acq_rel_gpu() { #if __CUDA_ARCH__ >= 700 asm volatile("fence.acq_rel.gpu;" ::: "memory"); #else asm volatile("membar.gl;" ::: "memory"); #endif } 
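// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of NCCL): the helpers above are
// meant to be used in pairs, so that a release store to a flag publishes all
// earlier payload stores to a system-scope acquire load of the same flag; on
// architectures before sm_70 they fall back to volatile accesses plus a memory
// barrier. The demoProducer/demoConsumer kernels below are hypothetical and
// only sketch that pairing, assuming payload and flag point to the same
// system-visible buffer on both sides.
#if 0  // example only; not compiled into the library
__global__ void demoProducer(uint64_t* payload, uint64_t* flag) {
  st_volatile_global(payload, 42ull);  // write the data first
  st_release_sys_global(flag, 1ull);   // then publish it; release orders the payload store before the flag
}

__global__ void demoConsumer(uint64_t* payload, uint64_t* flag, uint64_t* out) {
  while (ld_acquire_sys_global(flag) == 0) { }  // acquire pairs with the release above
  *out = ld_volatile_global(payload);           // the payload write is now guaranteed visible
}
#endif
// ---------------------------------------------------------------------------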
//////////////////////////////////////////////////////////////////////////////// // Multimem stores of BytePack. template __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack val); #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 template<> __device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) { // nop } template<> __device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) { asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) { asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) { asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) { asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) { asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" :: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3]) : "memory"); } #else template __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack val) { // nop } #endif // Warp-uniform memory copy from shared address (not generic) to global memory. // The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value // is interpeted as zero. EltSize is the guaranteed alignment of the addresses and sizes. template __device__ __forceinline__ void copyGlobalShared_WarpUnrolled( int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead ) { static_assert(std::is_signed::value, "`IntBytes` must be a signed integral type."); int nBytes = min(nBytesAhead, (IntBytes)MaxBytes); int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16); int nMiddleBytes = (nBytes-nFrontBytes) & -16; int nBackBytes = (nBytes-nFrontBytes) % 16; { int backLane = WARP_SIZE-1 - lane; bool hasFront = lane*EltSize < nFrontBytes; bool hasBack = backLane*EltSize < nBackBytes; int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize); if (hasFront | hasBack) { BytePack tmp = ld_shared(srcAddr+offset); // Can't use multimem_st since it doesn't support EltSize==2 st_global(dstAddr+offset, tmp); } } srcAddr += nFrontBytes; int srcMisalign = EltSize < 4 ? 
(srcAddr%4) : 0; srcAddr += -srcMisalign + lane*16; dstAddr += nFrontBytes + lane*16; nMiddleBytes -= lane*16; #pragma unroll for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) { if (nMiddleBytes <= 0) break; union { BytePack<4> b4[4]; BytePack<16> b16; }; b4[0] = ld_shared<4>(srcAddr + 0*4); b4[1] = ld_shared<4>(srcAddr + 1*4); b4[2] = ld_shared<4>(srcAddr + 2*4); b4[3] = ld_shared<4>(srcAddr + 3*4); if (srcMisalign != 0) { BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4); b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8); b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8); b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8); b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8); } if (Multimem) multimem_st_global<16>(dstAddr, b16); else st_global<16>(dstAddr, b16); srcAddr += WARP_SIZE*16; dstAddr += WARP_SIZE*16; nMiddleBytes -= WARP_SIZE*16; } } #endif nccl-2.18.3-1/src/collectives/device/primitives.h000066400000000000000000000126751444201535000216010ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PRIMITIVES_H_ #define NCCL_PRIMITIVES_H_ #include #include "reduce_kernel.h" // for reduction funcs #include "common_kernel.h" #include "common.h" #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 /* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128 * We use these as template args to the Primtiives class instead of integral * enums (e.g. NCCL_PROTO_LL) because for SIMPLE we need to carry a few extra * numbers. Also these types hold methods which let us compute numbers important * to how that protocol operates with a consistent interface so that our * algorithm code can operate protocol parametrically. */ template struct ProtoSimple { static constexpr int Id = NCCL_PROTO_SIMPLE; static constexpr int SlicePerChunk = SlicePerChunk_1; static constexpr int StepPerSlice = StepPerSlice_1; static constexpr int Unroll = Unroll_1; static constexpr int MultimemSrcs = MultimemSrcs_1; static constexpr int MultimemDsts = MultimemDsts_1; // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { return sizeof(uint64_t); // Bogus value? Nobody queries this metric for simple. } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 2; }; struct ProtoLL { static constexpr int Id = NCCL_PROTO_LL; // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { return sizeof(uint64_t); // One 16-byte line has 8-bytes of data } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 1; }; struct ProtoLL128 { static constexpr int Id = NCCL_PROTO_LL128; // Data bytes (no flags etc) in one step of the fifo queue. 
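// Each LL128 line is NCCL_LL128_LINEELEMS 8-byte words, of which
// NCCL_LL128_DATAELEMS carry payload and the remainder carries the
// synchronization flag, hence the DATAELEMS/LINEELEMS scaling of the raw
// buffer size below.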
__device__ static int calcBytePerStep() { return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS; } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { return NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_DATAELEMS*sizeof(uint64_t)/NCCL_LL128_LINEELEMS; } // Group width is how many consecutive group values a subchannel occupies. static constexpr int MaxGroupWidth = 1; }; /* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template * arguments are static bounds on the maximum values. Asymmetric counts are * independent. Symmetric is a static guarantee that nrecv==nsend, so it only * stores one value at runtime. This optimization save 32-bit register, but more * importantly uses fewer predicate registers when unrolling loops. */ template struct FanAsymmetric { static constexpr int MaxRecv = MaxRecv_, MaxSend = MaxSend_; int nr, ns; FanAsymmetric() = default; __device__ FanAsymmetric(int nrecv, int nsend): nr(nrecv), ns(nsend) { // assert(nrecv <= MaxRecv && nsend <= MaxSend); } __device__ int nrecv() const { return MaxRecv ? nr : 0; } __device__ int nsend() const { return MaxSend ? ns : 0; } }; template struct FanSymmetric { static constexpr int MaxRecv = MaxArity, MaxSend = MaxArity; int n; FanSymmetric() = default; __device__ FanSymmetric(int nrecv, int nsend): n(nrecv) { // assert(nrecv == nsend && nrecv <= MaxArity); } __device__ int nrecv() const { return n; } __device__ int nsend() const { return n; } }; // The primitives class. Specialized per protocol in the other headers. template class Primitives; // Used by LL & LL128 to implement direct members in the naive way. template struct PrimitivesWithoutDirect { __device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) { static_cast(this)->send(inpIx, eltN); } __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->copySend(inpIx, outIx, eltN, postOp); } __device__ void directRecvCopySend(intptr_t outIx, int eltN) { static_cast(this)->recvCopySend(outIx, eltN, /*postOp=*/false); } __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part static_cast(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp); } }; #include "prims_simple.h" #include "prims_ll.h" #include "prims_ll128.h" #endif nccl-2.18.3-1/src/collectives/device/prims_ll.h000066400000000000000000000326401444201535000212210ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ template class Primitives: public PrimitivesWithoutDirect> { // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases static constexpr int MaxRecv = Fan::MaxRecv > 1 ? 
Fan::MaxRecv : 1; static constexpr int MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; RedOp redOp; const int tid; const int nthreads; const int wid; const int group; const int stepLines; Fan fan; T *userBufs[2]; struct ncclConnInfo* recvConn = NULL; volatile uint64_t* recvConnHeadPtr = NULL; uint64_t recvConnHead; struct ncclConnInfo* sendConn = NULL; volatile int* sendConnFifoPtr = NULL; volatile uint64_t* sendConnHeadPtr = NULL; uint64_t sendConnHead; uint64_t sendConnHeadCache; // Cache last seen value uint64_t recvStep[MaxRecv]; uint64_t sendStep[MaxSend]; union ncclLLFifoLine* recvBuff[MaxRecv]; union ncclLLFifoLine* sendBuff[MaxSend]; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; } inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } inline __device__ void barrier() { if (nthreads == WARP_SIZE) __syncwarp(); else asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); } uint32_t abort = 0; inline __device__ int checkAbort(int &spins, int send) { spins++; if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { abort = *ncclShmem.comm.abortFlag; spins = 0; } return abort; } inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; if (checkAbort(spins, 1)) break; } if (sendConnFifoPtr) { int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes; sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size; } sendConnHead += 1; } barrier(); } inline __device__ void incRecv(int i) { recvStep[i] += 1; } inline __device__ void postRecv() { barrier(); if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; } inline __device__ void incSend(int i, int offset) { // LL Cleanup : write all flags in the slice to make sure we don't have // data corruption when flag loops over. 
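    // Each ncclLLFifoLine is 16 bytes laid out as {data1, flag1, data2, flag2};
    // receivers poll both flags for NCCL_LL_FLAG(step+1) before trusting the data
    // (see readLL below), so once the step counter reaches the clean mask every
    // line of the slice is rewritten to keep stale flags from matching a later step.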
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { for (int o = offset; oi4)); if (checkAbort(spins, 0)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); return val64; } template __device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) { #pragma unroll for (int i=BeginIx; i < MaxRecv; i++) { if (i < fan.nrecv()) { union ncclLLFifoLine* src = recvPtr(i) + offset; asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); } } } __device__ uint64_t readLLFinish(int offset, ncclLLFifoLine(&line)[MaxRecv], int i) { union ncclLLFifoLine* src = recvPtr(i) + offset; uint32_t flag = recvFlag(i); int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4)); if (checkAbort(spins, 0)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); return val64; } __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); } static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T); template __device__ static U load(U *src) { union { U elt; uint16_t u2; uint32_t u4; uint64_t u8; }; if(sizeof(U) == 1) asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src)); else if(sizeof(U) == 2) asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src)); else if(sizeof(U) == 4) asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src)); else asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src)); return elt; } template __device__ static void store(U *dst, U val) { union { U elt; uint16_t u2; uint32_t u4; uint64_t u8; }; elt = val; if(sizeof(U) == 1) asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4)); else if(sizeof(U) == 2) asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2)); else if(sizeof(U) == 4) asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4)); else asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8)); } struct DataLoader { int misalign; union { uint32_t u4[sizeof(T) <= 2 ? 3 : 2]; uint64_t u8; T elt[EltPerLine]; }; __device__ void loadBegin(T *src, int eltN) { if (sizeof(T) <= 2) { misalign = reinterpret_cast(src)%4; uint32_t *p = reinterpret_cast(reinterpret_cast(src) & -uintptr_t(4)); u4[0] = load(p+0); u4[1] = misalign + eltN*sizeof(T) > 4 ? load(p+1) : 0; // u4[2] would be simpler, but that throws warnings on some compilers u4[sizeof(T) <= 2 ? 2 : 0] = misalign + eltN*sizeof(T) > 8 ? load(p+2) : 0; } else { #pragma unroll for(int i=0; i < EltPerLine; i++) { if(i==0 || i < eltN) elt[i] = load(src + i); } } } __device__ uint64_t loadFinish() { if (sizeof(T) <= 2) { u4[0] = __funnelshift_r(u4[0], u4[1], 8*misalign); // u4[2] would be simpler, but that throws warnings on some compilers u4[1] = __funnelshift_r(u4[1], u4[sizeof(T) <= 2 ? 
2 : 0], 8*misalign); } return u8; } }; __device__ void storeData(T *dst, uint64_t val, int eltN) { union { uint64_t u8; T elt[EltPerLine]; }; u8 = val; #pragma unroll for(int i=0; i < EltPerLine; i++) { if (i==0 || i < eltN) //store(dst+i, elt[i]); dst[i] = elt[i]; } } template __device__ __forceinline__ void LLGenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; constexpr int DST = DstBuf != -1 ? 1 : 0; T *srcElts = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx; T *dstElts = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx; // Always waitSend in case of cleanup nelem = nelem < 0 ? 0 : nelem; if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine)); nelem -= tid*EltPerLine; srcElts += tid*EltPerLine; dstElts += tid*EltPerLine; int offset = tid; int eltPerTrip = nthreads*EltPerLine; while (nelem > 0) { int eltInLine = EltPerLine < nelem ? EltPerLine : nelem; DataLoader dl; ncclLLFifoLine line[MaxRecv]; uint64_t data, peerData; if (SRC) { dl.loadBegin(srcElts, eltInLine); srcElts += eltPerTrip; } if (RECV) { readLLBeginAll<1>(offset, line); peerData = readLL(offset, 0); } if (SRC) { data = dl.loadFinish(); if (SrcBuf == Input) data = applyPreOp(redOp, data); } if (RECV) { data = !SRC ? peerData : applyReduce(redOp, peerData, data); #pragma unroll MaxRecv for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) { peerData = readLLFinish(offset, line, i); data = applyReduce(redOp, peerData, data); } } if (postOp) data = applyPostOp(redOp, data); // Send : inter-node, then intra-node, then local if (SEND) { for (int i=1; i < MaxSend && i < fan.nsend(); i++) storeLL(sendPtr(i)+offset, data, sendFlag(i)); storeLL(sendPtr(0)+offset, data, sendFlag(0)); } if (DST) { storeData(dstElts, data, eltInLine); dstElts += eltPerTrip; } nelem -= eltPerTrip; offset += nthreads; } if (RECV) { for (int i=0; i < MaxRecv; i++) incRecv(i); postRecv(); } if (SEND) { for (int i=1; i < MaxSend && i < fan.nsend(); i++) incSend(i, offset); incSend(0, offset); } } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { recvBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL]; recvStep[i] = conn->step; if (wid == i) recvConn = conn; } __device__ __forceinline__ void loadRecvSync() { if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) { recvConnHeadPtr = recvConn->head; recvConnHead = recvConn->step; } } __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { sendBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL]; sendStep[i] = conn->step; if (wid == i) sendConn = conn; } __device__ __forceinline__ void loadSendSync() { if (tid < fan.nsend()) { sendConnHeadPtr = sendConn->head; sendConnHeadCache = *sendConnHeadPtr; sendConnHead = sendConn->step; sendConnFifoPtr = sendConn->sizesFifo; } } public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, uint8_t connIndexRecv=0, uint8_t connIndexSend=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { auto *channel = &ncclShmem.channel; // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1 while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) { 
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend); nsend++; } this->fan = Fan(nrecv, nsend); loadRecvSync(); loadSendSync(); setDataPtrs(inputBuf, outputBuf); } __device__ ~Primitives() { // Save steps for the next operation if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) recvConn->step = recvConnHead; if (tid < fan.nsend()) sendConn->step = sendConnHead; // Ensure all steps written back barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf) { userBufs[Input] = (T*)inputBuf; userBufs[Output] = (T*)outputBuf; } __device__ void moveDataPtrs(intptr_t delta) { userBufs[Input] += delta; userBufs[Output] += delta; } __device__ void send(intptr_t inpIx, int eltN) { return LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ void sendFromOutput(intptr_t outIx, int eltN) { return LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); } __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); } __device__ void recvReduceSend(intptr_t inpIx, int eltN) { return LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } }; nccl-2.18.3-1/src/collectives/device/prims_ll128.h000066400000000000000000000370731444201535000214610ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "op128.h" #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) template class Primitives: public PrimitivesWithoutDirect> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; RedOp redOp; const int tid; // thread index in primitives group const int nthreads; // thread count in primitives group const int wid; // lane index in warp const int stepSize; const int warp; // warp index in primitives group const int warpInBlock; // warp index in thread block const bool flagThread; const int group; Fan fan; T *userBufs[2]; struct ncclConnInfo* recvConn = NULL; volatile uint64_t* recvConnHeadPtr = NULL; uint64_t recvConnHead; struct ncclConnInfo* sendConn = NULL; volatile int* sendConnFifoPtr = NULL; volatile uint64_t* sendConnTailPtr = NULL; uint64_t sendConnTail; volatile uint64_t* sendConnHeadPtr = NULL; uint64_t sendConnHead; uint64_t sendConnHeadCache; // Cache last seen value uint64_t recvStep[MaxRecv]; uint64_t sendStep[MaxSend]; uint64_t* recvBuff[MaxRecv]; uint64_t* sendBuff[MaxSend]; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; } inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } inline __device__ void barrier() { asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); } uint32_t abort = 0; inline __device__ int checkAbort(int &spins, int i, int send) { spins++; if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { abort = *ncclShmem.comm.abortFlag; spins = 0; } return abort; } inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; if (checkAbort(spins, wid, 1)) break; } if (sendConnFifoPtr) { sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes; } sendConnHead += 1; } } inline __device__ void postRecv() { if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; } inline __device__ void postSend() { if (sendConnTailPtr) { #if __CUDA_ARCH__ >= 900 __threadfence_system(); #else __threadfence(); #endif *sendConnTailPtr = sendConnTail += 1; } } template __device__ __forceinline__ void loadRegsBegin(uint64_t(®s)[WordPerThread], T const *src, int eltN) { constexpr int EltPer16B = 16/sizeof(T); if(reinterpret_cast(src)%16 == 0) { /* We are aligned to 16 bytes, so load directly to registers no shmem. * Flag threads load half as much data which gets shuffled to the even * registers during Finish. The point of splitting into two phases is to * defer that shuffle, which incurs a dependency stall, until after other * memops are launched by the caller. */ #pragma unroll for(int g=0; g < WordPerThread/2; g++) { int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8); if(!flagThread || g%2==0) { if(ix*EltPer16B < eltN) load128((uint64_t*)(src + ix*EltPer16B), regs[2*g+0], regs[2*g+1]); } } } else { // Not aligned. Stage the smallest 16 byte aligned region subsuming the // buffer into shmem. 
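      // For example, if src sits 4 bytes past a 16-byte boundary, the warp first
      // copies the enclosing aligned region into its shmem scratch with aligned
      // 128-bit loads/stores, then re-reads it starting at that 4-byte offset so
      // the register layout ends up identical to the aligned path above.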
int misalignment = reinterpret_cast(src) % 16; uint64_t *src8 = reinterpret_cast(reinterpret_cast(src) & -uintptr_t(16)); uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock)); #pragma unroll for(int g=0; g < WordPerThread/2; g++) if((g*WARP_SIZE + wid)*16 < misalignment + eltN*sizeof(T)) load128(src8 + 2*(g*WARP_SIZE + wid), regs[2*g+0], regs[2*g+1]); #pragma unroll for(int g=0; g < WordPerThread/2; g++) storeShmem128(shm8 + 2*(g*WARP_SIZE + wid), regs[2*g+0], regs[2*g+1]); __syncwarp(); // Now load from shmem stage to regs. Preserve the same pre-shuffled layout // as the aligned case since Finish() will be applied regardless. T *shm = (T*)shm8 + misalignment/sizeof(T); #pragma unroll for(int g=0; g < WordPerThread/2; g++) { int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8); if(!flagThread || g%2==0) { if(ix*EltPer16B < eltN) loadShmemMisaligned128(shm + ix*EltPer16B, regs[2*g+0], regs[2*g+1]); } } } } template __device__ __forceinline__ void loadRegsFinish(uint64_t(®s)[WordPerThread]) { // Move data out of flag registers into the vacant registers. #pragma unroll for (int g=1; g < WordPerThread/2; g+=2) { if (flagThread) regs[2*g] = regs[2*g-1]; } } template __device__ __forceinline__ void storeRegs(T *dst, uint64_t(®s)[WordPerThread], int eltN) { constexpr int EltPer16B = 16/sizeof(T); // Reverse Finish() register permuatation. #pragma unroll for (int g=1; g < WordPerThread/2; g+=2) { if (flagThread) regs[2*g-1] = regs[2*g]; } // Write to dst if 16-byte aligned, shmem otherwise. int misalignment = reinterpret_cast(dst)%16; uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock)); #pragma unroll for(int g=0; g < WordPerThread/2; g++) { int ix = g*WARP_SIZE - 4*(g/2) + wid - (g%2)*(wid/8); if (!flagThread || g%2==0) { if(misalignment == 0 && (ix+1)*EltPer16B <= eltN) store128((uint64_t*)(dst + ix*EltPer16B), regs[2*g+0], regs[2*g+1]); else storeShmem128(shm8+2*ix, regs[2*g+0], regs[2*g+1]); } } __syncwarp(); // Write rest from shmem to dst. No need to coalesce stores to 16-bytes, // the hardware keeps up fine. T *shm = (T*)ncclScratchForWarp(warpInBlock); int skip = misalignment == 0 ? eltN & -EltPer16B : 0; for(int i=skip+wid; i < eltN; i += WARP_SIZE) dst[i] = shm[i]; } #define WARP_MASK 0xffffffff template __device__ __forceinline__ void recvReduceSendCopy(uint64_t(&v)[ELEMS_PER_THREAD], int ll128Offset, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; uint64_t vr[ELEMS_PER_THREAD]; __syncwarp(); /************************ Wait first recv ********************/ if (RECV) { uint64_t* ptr = recvPtr(0)+ll128Offset; uint64_t flag = recvFlag(0); bool needReload; int spins = 0; do { needReload = false; #pragma unroll for (int u=0; u __device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) { constexpr int SRC = SrcBuf != -1 ? 1 : 0; constexpr int DST = DstBuf != -1 ? 1 : 0; T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx; T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx; int wireOffset = WireWordPerSlice*warp + 2*wid; const int nwarps = nthreads/WARP_SIZE; nelem = nelem < 0 ? 
0 : nelem; if (SEND) waitSend(divUp(nelem, DataEltPerSlice)*WireWordPerSlice*sizeof(uint64_t)); barrier(); nelem -= DataEltPerSlice*warp; srcPtr += DataEltPerSlice*warp; dstPtr += DataEltPerSlice*warp; while (nelem > 0) { const int eltInSlice = min(nelem, DataEltPerSlice); uint64_t regs[NCCL_LL128_SHMEM_ELEMS_PER_THREAD]; if (SRC) loadRegsBegin(regs, srcPtr, eltInSlice); recvReduceSendCopy(regs, wireOffset, postOp); if (DST) storeRegs(dstPtr, regs, eltInSlice); wireOffset += WireWordPerSlice*nwarps; srcPtr += DataEltPerSlice*nwarps; dstPtr += DataEltPerSlice*nwarps; nelem -= DataEltPerSlice*nwarps; } barrier(); if (SEND) for (int i=0; i < MaxSend; i++) sendStep[i] += 1; if (SEND) postSend(); if (RECV) for (int i=0; i < MaxRecv; i++) recvStep[i] += 1; if (RECV) postRecv(); } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { recvBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128]; recvStep[i] = conn->step; if (wid == i) recvConn = conn; } __device__ __forceinline__ void loadRecvSync() { if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) { recvConnHeadPtr = recvConn->head; recvConnHead = recvConn->step; } } __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { sendBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128]; sendStep[i] = conn->step; if (wid == i) sendConn = conn; } __device__ __forceinline__ void loadSendSync() { if (tid < fan.nsend()) { sendConnHeadPtr = sendConn->head; sendConnHeadCache = *sendConnHeadPtr; sendConnHead = sendConn->step; sendConnFifoPtr = sendConn->sizesFifo; } if (tid >= nthreads-WARP_SIZE && widsizesFifo) { sendConnTailPtr = sendConn->tail; sendConnTail = sendConn->step; } } } public: __device__ Primitives( const int tid, const int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, uint8_t connIndexRecv=0, uint8_t connIndexSend=0 ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%8)==7), group(group), stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) { auto *channel = &ncclShmem.channel; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv); nrecv++; } while (nsend < MaxSend && sendPeers[nsend] >= 0) { loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend); nsend++; } this->fan = Fan(nrecv, nsend); loadRecvSync(); loadSendSync(); setDataPtrs(inputBuf, outputBuf); } __device__ ~Primitives() { // Save steps for the next operation if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) recvConn->step = recvConnHead; if (tid < fan.nsend()) sendConn->step = sendConnHead; // Ensure all steps written back barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf) { userBufs[Input] = (T*)inputBuf; userBufs[Output] = (T*)outputBuf; } __device__ void moveDataPtrs(intptr_t delta) { userBufs[Input] += delta; userBufs[Output] += delta; } __device__ void send(intptr_t inpIx, int eltN) { return GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ void sendFromOutput(intptr_t outIx, int eltN) { return GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); } __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); } __device__ void recvReduceSend(intptr_t inpIx, int eltN) { return GenericOp<1, 1, Input, -1>(inpIx, 
-1, eltN, false); } __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { return GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } }; nccl-2.18.3-1/src/collectives/device/prims_simple.h000066400000000000000000000717141444201535000221100ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ template class Primitives< T, RedOp, Fan, Direct, ProtoSimple, P2p > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; static constexpr int RoleInput = 0x01, RoleOutput = 0x02, RoleWaitRecv = 0x04, RoleWaitSend = 0x08, RolePostSend = 0x10, RolePostRecv = 0x20, Aborted = 0x40, OffsFifoEnabled = 0x80, SizesFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, ThreadsSynced = 0x800, NvlsMinPolling = 0x1000; const int tid, tidInBlock; const int nthreads; int nworkers; const int stepSize; Fan fan; int index; // Peer index I'm responsible for int flags; int group; uint64_t step; int *connOffsFifoPtr; // (flags & OffsFifoEnabled) union { T *userBuff; // (flags & (RoleInput|RoleOutput)) T *connEltsFifo; // !(flags & (RoleInput|RoleOutput)) }; union { int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled) T *directBuff; // !(flags & SizesFifoEnabled) }; uint64_t *connStepPtr; uint64_t connStepCache; // Cache last seen value of (*connStepPtr) // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { flags |= ThreadsSynced; if (nthreads == WARP_SIZE) __syncwarp(); else { int bar = 15-group; asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory"); } } __device__ void subBarrier() { if (nworkers == WARP_SIZE) __syncwarp(); else { int bar = (nworkers==nthreads ? 15 : 8) - group; asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory"); } } __device__ bool barrierAny(int vote) { flags |= ThreadsSynced; if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); } else { int ans, bar = 15-group; asm volatile( "{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " bar.red.or.pred p, %2, %3, p; " " selp.s32 %0, 1, 0, p; }" : "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory"); return ans != 0; } } __device__ bool subBarrierAny(int vote) { if (nworkers == WARP_SIZE) { return __any_sync(~0u, vote); } else { int ans, bar = (nworkers==nthreads ? 
15 : 8) - group; asm volatile( "{ .reg .pred p;" " setp.ne.s32 p, %1, 0;" " bar.red.or.pred p, %2, %3, p; " " selp.s32 %0, 1, 0, p; }" : "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory"); return ans != 0; } } inline __device__ bool checkAbort(int &spins) { spins++; if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { if (*ncclShmem.comm.abortFlag) { flags |= Aborted; ncclShmem.aborted = 1; } spins = 0; } return flags & Aborted; } inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { uint64_t ans; asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr))); return ans; } #endif // volatile is faster than acquire but not as correct. Make sure reduceCopy // loads data using volatile so it doesn't see stale data in L1. return ld_volatile_global(ptr); } template __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) { const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || ((flags & (Send*RoleWaitSend)) && !noSendWait)) { int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); if (checkAbort(spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } } if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { if (isSendNotRecv && (flags & SizesFifoEnabled)) connSizesFifoPtr[step%NCCL_STEPS] = nelts*sizeof(T); void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); if (flags & OffsFifoEnabled) ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T); else if (isSendNotRecv && DirectSend) { if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } else if (!isSendNotRecv && DirectRecv) { if (flags & DirectRead) { ptrs[index] = directBuff + srcIx + offset; } else if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } step += StepPerSlice; } } template inline __device__ void postPeer(bool dataStored) { if (flags & (Recv*RolePostRecv | Send*RolePostSend)) { step += StepPerSlice; if (Send && (flags & RolePostSend) && dataStored) fence_acq_rel_sys(); st_relaxed_sys_global(connStepPtr, step); } } template __device__ __forceinline__ void genericOp( intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp ) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; constexpr int Src = SrcBuf != -1; constexpr int Dst = DstBuf != -1; nelem = nelem < 0 ? 
0 : nelem; int sliceSize = stepSize*StepPerSlice; sliceSize = max(divUp(nelem, 16*SlicePerChunk)*16, sliceSize/32); int slice = 0; int offset = 0; if (tid < nworkers && offset < nelem) { // Worker-only loop for non-empty slices. Non-workers and empty slices are // processed in the loop following this if block. The benefit of splitting // the loop like this is we pull two branches out of the critical path. // Using "number of branch insns (taken or not) encountered dynamically" // as the performance metric, then: // perf_orig = 2*numslices // perf_new = 2+numslices // So the new code and old code behave the same for numslices=2, and for // numslices>2 the new code is superior. And note that in the case // numslices=1, the loop is trivially unrollable (single iteration) so we // don't incur that that tail branch and we still have perf_new=2. // // ORIGINAL CODE: // unrolled for(slices) { // if(worker) { // This branch removed // wait(); // subBarrier(); // if(slice not empty) // This branch removed // ReduceCopyMulti(); // } // barrier(); // post(); // } // Since we no longer unroll, new branch added here #if __CUDA_ARCH__ < 700 // Above doesn't matter on older hardware. #pragma unroll SlicePerChunk #else #pragma unroll 1 #endif do { sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput))) ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset; if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput))) ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset; waitPeer(srcIx, dstIx, offset, sliceSize); subBarrier(); /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size * to 0 to avoid unnecessary workload. */ int workSize = ncclShmem.aborted ? 0 : sliceSize; if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { reduceCopy (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false, 1, ncclShmem.groups[group].srcs, fan.nsend(), ncclShmem.groups[group].dsts+1, workSize); } } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) { // For broadcast in CollNet to do empty send reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp, Recv, ncclShmem.groups[group].srcs, Dst, ncclShmem.groups[group].dsts, workSize); } else { constexpr int PreOpSrcs = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; reduceCopy (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs, Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts, workSize); } barrier(); // This barrier has a counterpart in following loop postPeer(0 < sliceSize); offset += sliceSize; slice += 1; } while (slice < SlicePerChunk && offset < nelem); } // Non-workers come straight here. Workers too but only once the remaining // slices are all empty. Since empty slices are the uncommon case, and // worker perf is the limiter, perf-wise this loop is effectively unentered, // hence just a single branch insn. #pragma unroll 1 while (slice < SlicePerChunk) { sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; { // Only workers could have Wait roles so we know the slice must be empty // since we've exited the loop above. waitPeer(0, 0, 0, 0); } barrier(); // Has couterpart in preceding worker-only loop. 
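      // Even for empty slices the wait/post roles still go through waitPeer/postPeer,
      // so every role advances its step counter by the full SlicePerChunk*StepPerSlice
      // per chunk and stays in lockstep with the worker loop above.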
postPeer(0 < sliceSize); offset += sliceSize; slice += 1; } } // Scatter/Gather generic op // skip: my own rank order in the buffer chunks // shift: peer offset to avoid all ranks sending to or receiving from same peer template __device__ __forceinline__ void ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; int offset = 0; // slice offset int sliceSize = stepSize*StepPerSlice; int dataSize = max(DIVUP(peerElem, 16*SlicePerChunk)*16, sliceSize/32); // per-peer slice size #pragma unroll for (int slice=0; slice(0, inpIx, offset, realSize); subBarrier(); #pragma unroll // Loop over peers for (int j=0; j= 0 && i >= skip) pOffset += peerElem; void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset; int realPeerSize = min(realSize, totalElem-pOffset); if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize); // Mark for threadfence at the end fenceNeeded |= true; } } } else if (Recv) { if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset; int pOffset = index*peerOffset; if (skip >= 0 && index >= skip) pOffset += peerElem; // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer waitPeer(outIx, outIx+pOffset, offset, realSize); subBarrier(); #pragma unroll for (int j=0; j= 0 && i >= skip) pOffset += peerElem; void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset; int realPeerSize = min(realSize, totalElem-pOffset); if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0; if (realPeerSize > 0) reduceCopy(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); } } } fenceNeeded = barrierAny(fenceNeeded); postPeer(fenceNeeded); offset += realSize; } } __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { auto *conn = &peer->recv[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostRecv) { connStepPtr = conn->head; *connStepPtr = step; // Return credits in case we rounded up. } if (flags & RoleWaitRecv) { ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; connStepPtr = conn->tail; connStepCache = loadStepValue(connStepPtr); flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0; if (Direct) { // User buffers have been registered if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->flags & NCCL_DIRECT_WRITE) ? 
DirectWrite : 0; } } } if (flags & OffsFifoEnabled) connOffsFifoPtr = conn->offsFifo; connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; } } } __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitSend|RolePostSend)) { auto *conn = &peer->send[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostSend) { connStepPtr = conn->tail; } if (flags & RoleWaitSend) { ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0; connStepPtr = conn->head; connStepCache = loadStepValue(connStepPtr); flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0; if (flags & OffsFifoEnabled) connOffsFifoPtr = conn->offsFifo; connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; if (conn->sizesFifo != nullptr) { flags |= SizesFifoEnabled; connSizesFifoPtr = conn->sizesFifo; } else if (Direct) { // User buffers have been registered if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0; } } } } } } public: __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { // For send operations, we need an extra warp to overlap the threadfence and the copy this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? 
WARP_SIZE : 0); int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++; this->fan = Fan(nrecv, nsend); constexpr int ThreadPerSync = 8; static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers"); int g = tid / ThreadPerSync; int ng = nthreads / ThreadPerSync; index = tid % ThreadPerSync; flags = 0; if (g == 0) { if (index < nrecv) flags |= RoleWaitRecv; if (index == nrecv) flags |= RoleInput; } else if (g == 1) { if (index < nsend) flags |= RoleWaitSend; if (index == nsend) flags |= RoleOutput; } else if (g == ng - 2) { if (index < nrecv) flags |= RolePostRecv; } else if (g == ng - 1) { if (index < nsend) flags |= RolePostSend; } int peer = 0; if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e); loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e); setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } __device__ ~Primitives() { // Ensure ncclShmem.groups[].send/recvConns are available if (!(flags & ThreadsSynced)) barrier(); // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) { auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns; conns[index]->step = step; } // Make sure all threads are done writing back conn->step and done using // ncclShmem.groups[group] barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) { if (flags & RoleInput) { userBuff = (T*)inputBuf; ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } if (flags & RoleOutput) userBuff = (T*)outputBuf; bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite); bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite); bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer int regUsed = e != nullptr ? e->elem.regUsed : 0; if (Direct && recvProvider) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; // Wait for consumer to consume previous value before trampling it. while (*slot != nullptr && !checkAbort(spins)); directBuff = (T*)outputBuf; // Encode pointer by XOR'ing against some address they definitely wouldn't send // since we want to allow them sending us nullptr while not colliding with // the empty slot value. *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); } if (Direct && sendAcceptor) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; void *ptr; while (true) { ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } directBuff = regUsed ? (T*)(e->dnOutputs[index]) : reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); *slot = nullptr; } if (Direct && sendProvider) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1; // Wait for consumer to consume previous value before trampling it. 
while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; // Exchange pre-scalers for use in direct pull *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg; *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32); // Encode pointer by XOR'ing against some address they definitely wouldn't send // since we want to allow them sending us nullptr while not colliding with // the empty slot value. *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); } if (Direct && recvAcceptor) { int spins = 0; void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1; void *ptr; while (true) { ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) : reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); if (MaxSend != 0) { // reduce group rather than gather group // Store scalers for remote inputs uint64_t arg0, arg1; while (true) { arg0 = *argSlot0; arg1 = *argSlot1; if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; } ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff); } *argSlot0 = 0; *argSlot1 = 0; *slot = nullptr; } } __device__ void moveDataPtrs(intptr_t delta) { if (flags & (RoleInput|RoleOutput)) userBuff += delta; } __device__ __forceinline__ void send(intptr_t inpIx, int eltN) { genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false); } __device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) { genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false); } __device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false); } __device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) { genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false); } __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false); } __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvSend(int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp); } __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) { genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false); } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } __device__ __forceinline__ void 
recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) { ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) { ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) { ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp); } __device__ __forceinline__ void directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) { ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } }; nccl-2.18.3-1/src/collectives/device/reduce.cu000066400000000000000000000005411444201535000210220ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "reduce.h" #include "common.h" #include "collectives.h" IMPL_COLL_R(Reduce); nccl-2.18.3-1/src/collectives/device/reduce.h000066400000000000000000000072041444201535000206450ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
REDUCE_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; const int rank = ncclShmem.comm.rank; const int prevRank = ring->userRanks[nranks-1]; const int root = args->root; Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { int realChunkSize; if (Proto::Id == NCCL_PROTO_SIMPLE) { realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); return realChunkSize; }; if (prevRank == root) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = calcChunkSize(gridOffset); ssize_t offset = gridOffset + bid*realChunkSize; int nelem = min(realChunkSize, size-offset); prims.send(offset, nelem); } } else if (rank == root) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = calcChunkSize(gridOffset); ssize_t offset = gridOffset + bid*realChunkSize; int nelem = min(realChunkSize, size-offset); prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true); } } else { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = calcChunkSize(gridOffset); ssize_t offset = gridOffset + bid*realChunkSize; int nelem = min(realChunkSize, size-offset); prims.recvReduceSend(offset, nelem); } } } } template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; nccl-2.18.3-1/src/collectives/device/reduce_kernel.h000066400000000000000000000576511444201535000222200ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_REDUCE_KERNEL_H_ #define NCCL_REDUCE_KERNEL_H_ #include "op128.h" #include #include //////////////////////////////////////////////////////////////////////////////// // The reduction function classes. All classes must: // 1. Expose the `EltType` typedef. // 2. Have constructor taking no arguments (default constructible). // 3. Have constructor taking `uint64_t opArg`. 
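// For illustration, a hypothetical functor meeting requirements 1-3 above (the
// real ones follow); a single constructor with a defaulted opArg covers both 2 and 3:
//   template<typename T>
//   struct FuncExample {
//     using EltType = T;                           // (1) element type
//     __device__ FuncExample(uint64_t opArg=0) {}  // (2)+(3)
//   };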
template struct FuncNull { using EltType = T; __device__ FuncNull(uint64_t opArg=0) {}; }; template struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; }; template struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; }; template struct FuncMin { using EltType = T; __device__ FuncMin(uint64_t opArg=0) {}; }; template struct FuncMax { using EltType = T; __device__ FuncMax(uint64_t opArg=0) {}; }; template struct FuncPreMulSum; template struct FuncSumPostDiv; //////////////////////////////////////////////////////////////////////////////// // Trait classes for reduction functions. Given a function (FuncSum, etc.) // and a number of elements in a pack, will reduce, preOp, or postOp a pack // of elements. These classes are intended to be specialized for specific // combinations of reduction function and pack size. template struct Apply_Reduce /*{ static BytePack reduce( Fn fn, BytePack a, BytePack b ); }*/; template struct Apply_PreOp/*{ static constexpr bool IsIdentity; static BytePack preOp(Fn fn, BytePack a); }*/; template struct Apply_PostOp/*{ static constexpr bool IsIdentity; static BytePack postOp(Fn fn, BytePack a); }*/; template struct LoadMultimem_BigPackSize/*{ // If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem, // otherwise there are no valid pack sizes for LoadMultimem. static constexpr int BigPackSize = 0; }*/; template struct Apply_LoadMultimem/*{ static BytePack load(Fn fn, uintptr_t addr); }*/; //////////////////////////////////////////////////////////////////////////////// // Public API for calling the trait classes. These take the data elements as a // pack of any type, which could be a BytePack or any integral type (uint64_t, // uint32_t, etc.), and will return a new pack where each element has been // transformed appropriately. template __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { return fromPack( Apply_Reduce::Size/sizeof(typename Fn::EltType)> ::reduce(fn, toPack(a), toPack(b)) ); } template __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { return fromPack( Apply_PreOp::Size/sizeof(typename Fn::EltType)> ::preOp(fn, toPack(a)) ); } template __device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) { return fromPack( Apply_PostOp::Size/sizeof(typename Fn::EltType)> ::postOp(fn, toPack(a)) ); } template __device__ __forceinline__ BytePack applyLoadMultimem(Fn fn, uintptr_t addr) { return Apply_LoadMultimem::load(fn, addr); } //////////////////////////////////////////////////////////////////////////////// // Apply_Reduce // Nonsensical base case template struct Apply_Reduce { __device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { return {}; } }; // General recursive definition (EltPerPack > 1). This is how we iterate over // all elements in a pack of any size, by breaking it into halves. Eventually // we'll hit a base case (a more specific template specialization which takes // precedence). 
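// Illustrative walk-through (editor's addition): reducing two BytePack<16>
// packs of float starts at Apply_Reduce<FuncSum<float>, EltPerPack=4>, which
// splits each pack into two BytePack<8> halves (EltPerPack=2), splits those
// again into BytePack<4> quarters (EltPerPack=1), and only then reaches the
// FuncSum base case below -- i.e. one 16-byte pack is reduced as four
// independent float additions.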
template struct Apply_Reduce { template __device__ static BytePack reduce(Fn fn, BytePack a, BytePack b) { a.half[0] = Apply_Reduce::reduce(fn, a.half[0], b.half[0]); a.half[1] = Apply_Reduce::reduce(fn, a.half[1], b.half[1]); return a; } }; // Base case definitions (EltPerPack == 1) template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { return a; } }; template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { return toPack(fromPack(a) + fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncProd fn, BytePack a, BytePack b) { return toPack(fromPack(a) * fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncMin fn, BytePack a, BytePack b) { return toPack(min(fromPack(a), fromPack(b))); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncMax fn, BytePack a, BytePack b) { return toPack(max(fromPack(a), fromPack(b))); } }; // Optimizations for specfic types and element count combinations: template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { constexpr uint32_t lo = 0x00ff00ff; constexpr uint32_t hi = ~lo; uint32_t x = a.u32; uint32_t y = b.u32; a.u32 = (((x&lo) + (y&lo))&lo) + (((x&hi) + (y&hi))&hi); return a; } }; template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { return Apply_Reduce, 4>::reduce(FuncSum(), a, b); } }; #if 300 <= __CUDA_ARCH__ && __CUDA_ARCH__ < 500 template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncMin fn, BytePack<4> a, BytePack<4> b) { uint32_t z=0; asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); return a; } }; template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncMin fn, BytePack<4> a, BytePack<4> b) { int32_t z=0; asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); return a; } }; template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncMax fn, BytePack<4> a, BytePack<4> b) { uint32_t z=0; asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); return a; } }; template<> struct Apply_Reduce, /*EltPerPack=*/4> { __device__ static BytePack<4> reduce(FuncMax fn, BytePack<4> a, BytePack<4> b) { int32_t z=0; asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z)); return a; } }; #endif #define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_x_y) \ template<> \ struct Apply_Reduce, EltPerPack> { \ __device__ __forceinline__ static BytePack reduce( \ Fn fn, BytePack a, BytePack b \ ) { \ Vec x = fromPack(a); \ Vec y = fromPack(b); \ return toPack(expr_of_x_y); \ } \ }; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y)) SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y)) SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y)) SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y)) #else SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y))) SPECIALIZE_REDUCE(FuncProd, half, 1, half, __float2half(__half2float(x) * __half2float(y))) #endif #if __CUDA_ARCH__ >= 800 SPECIALIZE_REDUCE(FuncMin, half, 1, 
half, __hmin(x, y)) SPECIALIZE_REDUCE(FuncMin, half, 2, half2, __hmin2(x, y)) SPECIALIZE_REDUCE(FuncMax, half, 1, half, __hmax(x, y)) SPECIALIZE_REDUCE(FuncMax, half, 2, half2, __hmax2(x, y)) #else SPECIALIZE_REDUCE(FuncMin, half, 1, half, __float2half(fminf(__half2float(x), __half2float(y)))) SPECIALIZE_REDUCE(FuncMax, half, 1, half, __float2half(fmaxf(__half2float(x), __half2float(y)))) #endif #if defined(__CUDA_BF16_TYPES_EXIST__) #if __CUDA_ARCH__ >= 800 SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y)) SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y)) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y)) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y)) SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __hmin(x, y)) SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 2, __nv_bfloat162, __hmin2(x, y)) SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __hmax(x, y)) SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 2, __nv_bfloat162, __hmax2(x, y)) #else SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y))) SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) * __bfloat162float(y))) SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fminf(__bfloat162float(x), __bfloat162float(y)))) SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fmaxf(__bfloat162float(x), __bfloat162float(y)))) #endif #endif #undef SPECIALIZE_REDUCE //////////////////////////////////////////////////////////////////////////////// // Apply_PreOp // General recursive definition (EltPerPack > 1) template struct Apply_PreOp { static constexpr bool IsIdentity = Apply_PreOp::IsIdentity; template __device__ static BytePack preOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else if (!IsIdentity) { #endif // The `if (!IsIdentity)` condition is not strictly necessary, but it may help // compiler in that it won't have to tear a register apart for no reason // just to put it back together again. a.half[0] = Apply_PreOp::preOp(fn, a.half[0]); a.half[1] = Apply_PreOp::preOp(fn, a.half[1]); } return a; } }; // Base case definition (EltPerPack == 1), by default is identity function. template struct Apply_PreOp { static constexpr bool IsIdentity = true; template __device__ static BytePack preOp(Fn fn, BytePack a) { return a; } }; // Base case definition (EltPerPack == 0), is nonsense! template struct Apply_PreOp { static constexpr bool IsIdentity = true; __device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) { return {}; } }; //////////////////////////////////////////////////////////////////////////////// // Apply_PostOp // General recursive definition (EltPerPack > 1) template struct Apply_PostOp { static constexpr bool IsIdentity = Apply_PostOp::IsIdentity; template __device__ static BytePack postOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else if (!IsIdentity) { #endif // The `if (!IsIdentity)` condition is not strictly necessary, but it may help // compiler in that it won't have to tear a register apart for no reason // just to put it back together again. a.half[0] = Apply_PostOp::postOp(fn, a.half[0]); a.half[1] = Apply_PostOp::postOp(fn, a.half[1]); } return a; } }; // Base case definition (EltPerPack == 1), by default is identity function. 
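// (Editor's note: only ops that genuinely need a post step override this
// identity; e.g. FuncSumPostDiv further below replaces it with a division by
// the divisor carried in the op argument, which is how ncclAvg is realized
// for integral types.)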
template struct Apply_PostOp { static constexpr bool IsIdentity = true; template __device__ static BytePack postOp(Fn fn, BytePack a) { return a; } }; // Base case definition (EltPerPack == 0), is nonsense! template struct Apply_PostOp { static constexpr bool IsIdentity = true; __device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) { return {}; } }; //////////////////////////////////////////////////////////////////////////////// // FuncPreMulSum // General definition for all integral types, float, and double. template struct FuncPreMulSum { using EltType = T; T scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; T val; }; u64 = opArg; scalar = val; } }; template<> struct FuncPreMulSum { using EltType = half; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 half2 scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; half val; }; u64 = opArg; scalar.x = val; scalar.y = val; } #else float scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; half val; }; u64 = opArg; scalar = __half2float(val); } #endif }; #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct FuncPreMulSum<__nv_bfloat16> { using EltType = __nv_bfloat16; #if __CUDA_ARCH__ >= 800 __nv_bfloat162 scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar.x = val; scalar.y = val; } #else float scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar = __bfloat162float(val); } #endif }; #endif template struct Apply_Reduce, /*EltPerPack=*/1> { __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { // FuncPreMulSum reduce dispatches to FuncSum. return Apply_Reduce, 1>::reduce(FuncSum(), a, b); } }; // PreOp of FuncPreMulSum for integral types, float, and double. template struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(fromPack(a) * fn.scalar); } }; //////////////////////////////////////////////////////////////////////////////// // Apply_PreOp of FuncPreMulSum for float16. template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 return toPack(__hmul(fromPack(a), fn.scalar.x)); #else return toPack(__float2half(__half2float(fromPack(a)) * fn.scalar)); #endif } }; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(__hmul2(fromPack(a), fn.scalar)); } }; #endif //////////////////////////////////////////////////////////////////////////////// // Apply_PreOp of FuncPreMulSum for bfloat16. 
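// (Editor's note: this mirrors the half-precision case above -- on sm_80 and
// newer the scalar is kept as a native __nv_bfloat162 so the two-element
// specialization can multiply with __hmul2; otherwise the scalar is kept as a
// float and applied one element at a time through conversions.)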
#if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { #if __CUDA_ARCH__ >= 800 return toPack<__nv_bfloat16>(__hmul(fromPack<__nv_bfloat16>(a), fn.scalar.x)); #else return toPack<__nv_bfloat16>(__float2bfloat16(__bfloat162float(fromPack<__nv_bfloat16>(a)) * fn.scalar)); #endif } }; #if __CUDA_ARCH__ >= 800 template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; __device__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { return toPack<__nv_bfloat162>(__hmul2(fromPack<__nv_bfloat162>(a), fn.scalar)); } }; #endif #endif //////////////////////////////////////////////////////////////////////////////// // FuncSumPostDiv template struct IsFloatingPoint: std::false_type {}; template<> struct IsFloatingPoint: std::true_type {}; #if defined(__CUDA_BF16_TYPES_EXIST__) template<> struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; #endif template<> struct IsFloatingPoint: std::true_type {}; template<> struct IsFloatingPoint: std::true_type {}; template::value> struct FuncSumPostDiv_IntOnly; template struct FuncSumPostDiv: FuncSumPostDiv_IntOnly { __device__ FuncSumPostDiv(uint64_t opArg=0): FuncSumPostDiv_IntOnly(opArg) { } }; template struct FuncSumPostDiv_IntOnly: FuncSum { using EltType = T; int divisor; __device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {} }; template struct FuncSumPostDiv_IntOnly { static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types."); }; template struct Apply_Reduce, /*EltPerPack=*/1>: Apply_Reduce, 1> { __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { // FuncSumPostDiv reduce dispatches to FuncSum. return Apply_Reduce, 1>::reduce(FuncSum(), a, b); } }; template struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { return toPack(fromPack(a) / fn.divisor); } }; //////////////////////////////////////////////////////////////////////////////// // Apply_LoadMultimem #define SIZEOF_BytePack_field_u16 2 #define PTX_REG_BytePack_field_u16 "h" #define SIZEOF_BytePack_field_u32 4 #define PTX_REG_BytePack_field_u32 "r" #define SIZEOF_BytePack_field_u64 8 #define PTX_REG_BytePack_field_u64 "l" #define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ __device__ static BytePack load(Fn fn, uintptr_t addr) { \ BytePack ans; \ asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ : "l"(addr)); \ return ans; \ } \ }; #define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ __device__ static BytePack load(Fn fn, uintptr_t addr) { \ BytePack ans; \ asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." 
#ptx_ty " {%0,%1,%2,%3}, [%4];" \ : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ : "l"(addr)); \ return ans; \ } \ }; #define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \ DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \ template<> \ struct Apply_LoadMultimem, sizeof(T)> { \ __device__ static BytePack load(Fn fn, uintptr_t addr) { \ BytePack<2*sizeof(T)> tmp; \ asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \ : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ : "l"(addr & -uintptr_t(sizeof(T)))); \ return tmp.half[(addr/sizeof(T))%2]; \ } \ }; template struct Apply_LoadMultimem { __device__ static BytePack load(Fn fn, uintptr_t addr) { __trap(); return {}; } }; #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 template struct LoadMultimem_BigPackSize { using T = typename Fn::EltType; static constexpr bool IsSum = std::is_same>::value || std::is_same>::value || std::is_same>::value; static constexpr bool IsMinOrMax = std::is_same>::value || std::is_same>::value; static constexpr bool IsFloat = IsFloatingPoint::value; static constexpr int BigPackSize = IsFloat && IsSum && sizeof(T) < 8 ? 16 : IsFloat && IsSum ? 8 : IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 : !IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) : /*multimem.ld_reduce not supported:*/ 0; }; DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32) DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32) DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32) DEFINE_Apply_LoadMultimem(FuncSum, int32_t, add, s32, u32) DEFINE_Apply_LoadMultimem(FuncMin, int32_t, min, s32, u32) DEFINE_Apply_LoadMultimem(FuncMax, int32_t, max, s32, u32) DEFINE_Apply_LoadMultimem(FuncSum, uint64_t, add, u64, u64) DEFINE_Apply_LoadMultimem(FuncMin, uint64_t, min, u64, u64) DEFINE_Apply_LoadMultimem(FuncMax, uint64_t, max, u64, u64) DEFINE_Apply_LoadMultimem(FuncSum, int64_t, add, u64, u64) DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64) DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64) DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32) DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32) DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64) DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32) DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32) DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32) #if defined(__CUDA_BF16_TYPES_EXIST__) DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32) DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32) DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32) #endif #else template struct LoadMultimem_BigPackSize { static constexpr int BigPackSize = 0; }; #endif #undef DEFINE_Apply_LoadMultimem #undef DEFINE_Apply_LoadMultimem_v4 #undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf #undef SIZEOF_BytePack_field_u64 #undef PTX_REG_BytePack_field_u64 #undef SIZEOF_BytePack_field_u32 #undef PTX_REG_BytePack_field_u32 #undef SIZEOF_BytePack_field_u16 #undef PTX_REG_BytePack_field_u16 #endif // REDUCE_KERNEL_H_ 
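// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the NCCL tree): how a device-side
// caller is expected to chain the helpers from reduce_kernel.h for an average.
// ncclAvg on floating-point types maps to FuncPreMulSum, with the host
// encoding the scalar (1/nRanks for ncclAvg) into the 64-bit opArg. The names
// `acc`, `peer` and `opArg` are hypothetical.
//
//   FuncPreMulSum<float> fn(opArg);        // scalar decoded from opArg bits
//   BytePack<16> acc  = /* 4 local floats */;
//   BytePack<16> peer = /* 4 floats received from a peer */;
//   acc  = applyPreOp(fn, acc);            // scale each contribution once
//   peer = applyPreOp(fn, peer);
//   acc  = applyReduce(fn, acc, peer);     // PreMulSum reduce dispatches to FuncSum
//   acc  = applyPostOp(fn, acc);           // identity for PreMulSum
// ---------------------------------------------------------------------------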
nccl-2.18.3-1/src/collectives/device/reduce_scatter.cu000066400000000000000000000005601444201535000225500ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "reduce_scatter.h" #include "common.h" #include "collectives.h" IMPL_COLL_R(ReduceScatter); nccl-2.18.3-1/src/collectives/device/reduce_scatter.h000066400000000000000000000125221444201535000223710ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "primitives.h" namespace { template __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; int const *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; if (Proto::Id == NCCL_PROTO_SIMPLE) { realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels)); realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); realChunkSize = int(realChunkSize); ssize_t chunkOffset = gridOffset + bid*int(realChunkSize); /////////////// begin ReduceScatter steps /////////////// ssize_t offset; int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ringRanks[nranks-1]; offset = chunkOffset + rankDest * size; prims.send(offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { __device__ __forceinline__ void run(ncclWorkElem *args) { const int tid = threadIdx.x; const int bid = args->bid; const int nChannels = args->nChannels; struct ncclNvls* nvls = &ncclShmem.channel.nvls; const ssize_t chunkSize = int(args->lastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*chunkSize; const int nThreadsScatter = 128 + WARP_SIZE; const int nThreadsReduce = 384; const int tidEndScatter = nThreadsScatter; const int tidEndReduce = tidEndScatter + nThreadsReduce; using Proto = ProtoSimple<1, 1>; if (tid < tidEndScatter) { // Scatter Primitives, /*Direct=*/0, Proto, 0> prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL, args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0); } } else if (tid < tidEndReduce) { // Reduce through NVLS Primitives, /*Direct=*/0, Proto, 0> prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff, args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); prims.recv(offset, nelem); } } } }; nccl-2.18.3-1/src/collectives/device/sendrecv.cu000066400000000000000000000005451444201535000213700ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "sendrecv.h" #include "common.h" #include "collectives.h" IMPL_COLL_P(SendRecv); nccl-2.18.3-1/src/collectives/device/sendrecv.h000066400000000000000000000076061444201535000212150ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "devcomm.h" #include "collectives.h" #include "primitives.h" template struct RunWork { template __device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); if (args->peer == ncclShmem.comm.rank) { struct ncclWorkElemP2p* recvArgs = args-1; void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); if (buff != recvBuff) { reduceCopy (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count); } } else { int chunkSize = args->chunkSize/sizeof(T); if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; int const peer = args->peer; Primitives, 1, Proto, 1> prims (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1); size_t offset = 0; do { int nelem = min(size_t(chunkSize), count-offset); prims.directSend(offset, offset, nelem); offset += nelem; } while(offset < count); } } template __device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) { if (args->peer != ncclShmem.comm.rank) { void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); int chunkSize = args->chunkSize/sizeof(T); if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize int const peer = args->peer; Primitives, 1, Proto, 1> prims (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1); size_t offset = 0; do { int nelem = min(size_t(chunkSize), count-offset); prims.directRecv(offset, nelem); offset += nelem; } while(offset < count); } } __device__ __forceinline__ void run(ncclWork *work) { struct ncclWorkElemP2p* args = work->p2pElems; int ngroups = args->ngroups; int tid = threadIdx.x; int wid = tid / WARP_SIZE; // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3 // warps for send, 2 warps for recv). // warpStarts were rounded thanks to int division, but for group number we need to round the other way around // So we mirror wid then mirror again the group. #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; args += group; tid -= args->warpStart * WARP_SIZE; int nthreads = args->nWarps * WARP_SIZE; if (args->p2pType == ncclWorkP2pTypeUnused) return; if (tid >= nthreads || args->peer == -1) return; // Select Proto here // This is to allow the same kernel to run multiple primitives on different warps (thread groups) if ((group%2) == 0) { if (args->proto == NCCL_PROTO_LL) { runRecv(tid, nthreads, group, args); } else { runRecv>(tid, nthreads, group, args); } } else { if (args->proto == NCCL_PROTO_LL) { runSend(tid, nthreads, group, args); } else { runSend>(tid, nthreads, group, args); } } } }; nccl-2.18.3-1/src/collectives/reduce.cc000066400000000000000000000026301444201535000175420ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "collectives.h" #include "nccl.h" NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { struct NvtxParamsReduce { size_t bytes; int root; ncclRedOp_t op; }; constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsReduce, op)} }; NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; return ncclEnqueueCheck(&info); } nccl-2.18.3-1/src/collectives/reduce_scatter.cc000066400000000000000000000025631444201535000212740ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "collectives.h" #include "nccl.h" NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { struct NvtxParamsReduceScatter { size_t bytes; ncclRedOp_t op; }; constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, offsetof(NvtxParamsReduceScatter, op)} }; NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; return ncclEnqueueCheck(&info); } nccl-2.18.3-1/src/collectives/sendrecv.cc000066400000000000000000000037241444201535000201110ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "collectives.h" #include "argcheck.h" // Need some checks here since we access comm struct NvtxParamsSendRecv { size_t bytes; int peer; }; constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} }; NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; NCCLCHECK(ncclGroupStart()); ret = ncclEnqueueCheck(&info); NCCLCHECK(ncclGroupEnd()); return ret; } NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; NCCLCHECK(ncclGroupStart()); ret = ncclEnqueueCheck(&info); NCCLCHECK(ncclGroupEnd()); return ret; } nccl-2.18.3-1/src/debug.cc000066400000000000000000000163531444201535000150540ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "nccl_net.h" #include #include #include int ncclDebugLevel = -1; static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV FILE *ncclDebugFile = stdout; pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; std::chrono::steady_clock::time_point ncclEpoch; static __thread int tid = -1; void ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = getenv("NCCL_DEBUG"); int tempNcclDebugLevel = -1; if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { tempNcclDebugLevel = NCCL_LOG_VERSION; } else if (strcasecmp(nccl_debug, "WARN") == 0) { tempNcclDebugLevel = NCCL_LOG_WARN; } else if (strcasecmp(nccl_debug, "INFO") == 0) { tempNcclDebugLevel = NCCL_LOG_INFO; } else if (strcasecmp(nccl_debug, "ABORT") == 0) { tempNcclDebugLevel = NCCL_LOG_ABORT; } else if (strcasecmp(nccl_debug, "TRACE") == 0) { tempNcclDebugLevel = NCCL_LOG_TRACE; } /* Parse the NCCL_DEBUG_SUBSYS env var * This can be a comma separated list such as INIT,COLL * or ^INIT,COLL etc */ char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS"); if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } ncclDebugMask = invert ? ~0ULL : 0ULL; char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv); char *subsys = strtok(ncclDebugSubsys, ","); while (subsys != NULL) { uint64_t mask = 0; if (strcasecmp(subsys, "INIT") == 0) { mask = NCCL_INIT; } else if (strcasecmp(subsys, "COLL") == 0) { mask = NCCL_COLL; } else if (strcasecmp(subsys, "P2P") == 0) { mask = NCCL_P2P; } else if (strcasecmp(subsys, "SHM") == 0) { mask = NCCL_SHM; } else if (strcasecmp(subsys, "NET") == 0) { mask = NCCL_NET; } else if (strcasecmp(subsys, "GRAPH") == 0) { mask = NCCL_GRAPH; } else if (strcasecmp(subsys, "TUNING") == 0) { mask = NCCL_TUNING; } else if (strcasecmp(subsys, "ENV") == 0) { mask = NCCL_ENV; } else if (strcasecmp(subsys, "ALLOC") == 0) { mask = NCCL_ALLOC; } else if (strcasecmp(subsys, "CALL") == 0) { mask = NCCL_CALL; } else if (strcasecmp(subsys, "PROXY") == 0) { mask = NCCL_PROXY; } else if (strcasecmp(subsys, "NVLS") == 0) { mask = NCCL_NVLS; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } if (mask) { if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; } subsys = strtok(NULL, ","); } free(ncclDebugSubsys); } // Cache pid and hostname getHostName(hostname, 1024, '.'); pid = getpid(); /* Parse and expand the NCCL_DEBUG_FILE path and * then create the debug file. 
But don't bother unless the * NCCL_DEBUG level is > VERSION */ const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE"); if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { int c = 0; char debugFn[PATH_MAX+1] = ""; char *dfn = debugFn; while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) { if (ncclDebugFileEnv[c++] != '%') { *dfn++ = ncclDebugFileEnv[c-1]; continue; } switch (ncclDebugFileEnv[c++]) { case '%': // Double % *dfn++ = '%'; break; case 'h': // %h = hostname dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid dfn += snprintf(dfn, PATH_MAX, "%d", pid); break; default: // Echo everything we don't understand *dfn++ = '%'; *dfn++ = ncclDebugFileEnv[c-1]; break; } } *dfn = '\0'; if (debugFn[0] != '\0') { FILE *file = fopen(debugFn, "w"); if (file != nullptr) { setbuf(file, nullptr); // disable buffering ncclDebugFile = file; } } } ncclEpoch = std::chrono::steady_clock::now(); __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); pthread_mutex_unlock(&ncclDebugLock); } /* Common logging function used by the INFO, WARN and TRACE macros * Also exported to the dynamically loadable Net transport modules so * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit(); if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } // Save the last error (WARN) as a human readable string if (level == NCCL_LOG_WARN) { pthread_mutex_lock(&ncclDebugLock); va_list vargs; va_start(vargs, fmt); (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); va_end(vargs); pthread_mutex_unlock(&ncclDebugLock); } if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return; if (tid == -1) { tid = syscall(SYS_gettid); } int cudaDev; if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) { cudaGetDevice(&cudaDev); } char buffer[1024]; size_t len = 0; if (level == NCCL_LOG_WARN) { len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line); } else if (level == NCCL_LOG_INFO) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid); } else if (level == NCCL_LOG_TRACE) { auto delta = std::chrono::steady_clock::now() - ncclEpoch; double timestamp = std::chrono::duration_cast>(delta).count()*1000; len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line); } if (len) { va_list vargs; va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); buffer[len++] = '\n'; fwrite(buffer, 1, len, ncclDebugFile); } } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); void ncclSetThreadName(pthread_t thread, const char *fmt, ...) 
{ // pthread_setname_np is nonstandard GNU extension // needs the following feature test macro #ifdef _GNU_SOURCE if (ncclParamSetThreadName() != 1) return; char threadName[NCCL_THREAD_NAMELEN]; va_list vargs; va_start(vargs, fmt); vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs); va_end(vargs); pthread_setname_np(thread, threadName); #endif } nccl-2.18.3-1/src/enhcompat.cc000066400000000000000000000026111444201535000157340ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ /* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ enum cudaError_t { cudaErrorStubLibrary = 34 }; extern "C" { cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; } } nccl-2.18.3-1/src/enqueue.cc000066400000000000000000002171761444201535000154430ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" #include "argcheck.h" #include "coll_net.h" #include "gdrwrap.h" #include "bootstrap.h" #include "channel.h" #include "cudawrap.h" #include // std::memcpy #include // PRIx64 static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t); struct ncclKernelMatch { void* kernelFn; bool specialized; }; // Only generate inline kernels for LL #define NCCL_FUNC5(func, algo, devredop, dtype, specialized) \ /*LL */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), true && specialized}, \ /*LL128 */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}, \ /*SIMPLE*/{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized} #define NCCL_FUNC4(func, devredop, type, specialized) \ NCCL_FUNC5(func, TREE, devredop, type, specialized), \ NCCL_FUNC5(func, RING, devredop, type, specialized), \ NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \ NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, specialized), \ NCCL_FUNC5(func, NVLS, devredop, type, specialized), \ NCCL_FUNC5(func, NVLS_TREE, devredop, type, specialized) #ifdef __CUDA_BF16_TYPES_EXIST__ #define HAVE_BFLOAT16 1 #else #define HAVE_BFLOAT16 0 #endif // Must be consistent with ncclDataType_t #define NCCL_FUNCS3(func, devredop, reduction, specialized) \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int8_t, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint8_t, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int32_t, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint32_t, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int64_t, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint64_t, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, half, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, float, int8_t), specialized), \ NCCL_FUNC4(func, devredop, MACRO_IF(reduction, double, int8_t), specialized) \ MACRO_IF(HAVE_BFLOAT16, \ SINGLE_ARG(, NCCL_FUNC4(func, devredop, MACRO_IF(reduction, __nv_bfloat16, int8_t), specialized)), \ /*nothing*/ \ ) // Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums. 
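// Editor's note (illustrative): the NCCL_FUNC* macros enumerate entries in the
// order function -> device red-op -> data type -> algorithm -> protocol, so
// past the 1 + ncclNumTypes special entries at the front the table behaves as
// a dense 5-D array, indexed roughly as
//   1 + ncclNumTypes +
//   (((func*ncclNumDevRedOps + redop)*ncclNumTypes + dtype)*NCCL_NUM_ALGORITHMS
//     + algo)*NCCL_NUM_PROTOCOLS + proto
// Only the (Sum, LL) slots are marked specialized=true; every other slot
// aliases an already-emitted kernel and relies on the in-kernel dispatch to
// run the requested variant, as the NCCL_FUNCS2 expansion just below shows.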
#define NCCL_FUNCS2(func, reduction) \ NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/1), /*Sum*/ \ NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Prod*/ \ NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Max*/ \ NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Min*/ \ NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*PreMulSum*/ \ NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0) /*SumPostDiv*/ // Must be consistent with the ncclFuncSet enum static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { {(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true}, // We don't bake special kernels for the one-rank reductions {/*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, {/*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, #if HAVE_BFLOAT16 {/*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false}, #endif NCCL_FUNCS2(Broadcast, /*reduction=*/0), NCCL_FUNCS2(Reduce, /*reduction=*/1), NCCL_FUNCS2(AllGather, /*reduction=*/0), NCCL_FUNCS2(ReduceScatter, /*reduction=*/1), NCCL_FUNCS2(AllReduce, /*reduction=*/1) }; static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */); NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]); ncclResult_t result = ncclSuccess; if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); // Keep track if we already visited a function pointer. 
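// (Editor's note: a two-entry cache works well here because aliased pointers
// -- the unspecialized slots that reuse an already-emitted kernel -- tend to
// occur in consecutive runs inside ncclKerns; missing one merely repeats a
// harmless cudaFuncGetAttributes/cudaFuncSetAttribute call.)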
void* lru[2] = {nullptr, nullptr}; for (int i=0; i < KernelCount; i++) { void* fn = ncclKerns[i].kernelFn; if (fn == lru[0] || fn == lru[1]) goto next_kernel; lru[1] = lru[0]; lru[0] = fn; if (maxStackSize) { cudaFuncAttributes attr = {0}; CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; ignore0:; } if (carveout) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributePreferredSharedMemoryCarveout, carveout), result, ignore1); ignore1:; } if (ncclShmemDynamicSize(cudaArch) != 0) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)), result, next_kernel); } next_kernel:; } return result; } /*****************************************************************************/ /* Launch system : synchronization and CUDA kernel launch */ /*****************************************************************************/ static void appendWorkElemColl( struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, int funcIndex, struct ncclWorkElem const *elem, int bid ) { struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); if (q && funcIndex == q->work.header.funcIndex && elem->nWarps == q->work.elems[0].nWarps && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS) { int e = chan->nWorkElem++; q->work.elems[e] = *elem; // C++ struct assignment q->work.elems[e].bid = bid; q->work.elems[e].isUsed = 1; return; } q = ncclMemoryStackAlloc(&comm->memScoped); q->work.header.type = ncclWorkTypeColl; q->work.header.funcIndex = funcIndex; q->work.elems[0] = *elem; // C++ struct assignment q->work.elems[0].bid = bid; q->work.elems[0].isUsed = 1; chan->nWorkElem = 1; chan->nWork += 1; ncclIntruQueueEnqueue(&chan->workQueue, q); } static void appendWorkElemColl( struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, int funcIndex, struct ncclWorkElemReg const *elem, int bid ) { struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); if (q && funcIndex == q->work.header.funcIndex && elem->elem.nWarps == q->work.regElems[0].elem.nWarps && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS_REG) { int e = chan->nWorkElem++; q->work.regElems[e] = *elem; // C++ struct assignment q->work.regElems[e].elem.bid = bid; q->work.regElems[e].elem.isUsed = 1; return; } q = ncclMemoryStackAlloc(&comm->memScoped); q->work.header.type = ncclWorkTypeRegColl; q->work.header.funcIndex = funcIndex; q->work.regElems[0] = *elem; // C++ struct assignment q->work.regElems[0].elem.bid = bid; q->work.regElems[0].elem.isUsed = 1; chan->nWorkElem = 1; chan->nWork += 1; ncclIntruQueueEnqueue(&chan->workQueue, q); } static void finishWorkP2p(struct ncclWork* work) { int nElem = 0; for (int e=0; e < NCCL_MAX_WORK_ELEMENTS_P2P; e++) { if (work->p2pElems[e].p2pType != ncclWorkP2pTypeUnused) nElem = e+1; } int nGroup = 1; while (nGroup < nElem) nGroup *= 2; int nWarp = 1; while (nWarp*nGroup <= (NCCL_MAX_NTHREADS/WARP_SIZE)/2) nWarp *= 2; for (int i=0; i < nGroup; i++) { work->p2pElems[i].ngroups = nGroup; work->p2pElems[i].warpStart = i*(NCCL_MAX_NTHREADS/WARP_SIZE)/nGroup; int extraWarp = nWarp >= 2 ? 
i%2 : 0; work->p2pElems[i].nWarps = nWarp + extraWarp; } } static void finishWork(struct ncclWork* work) { if (work->header.type == ncclWorkTypeP2p) { finishWorkP2p(work); } } static void appendWorkElemP2p( struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId, struct ncclWorkElemP2p const *elem, bool fuseOk ) { constexpr int funcIndex = FUNC_INDEX_P2P; struct ncclKernelPlan::Channel* chan = &plan->channels[channelId]; struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue); if (q && funcIndex == q->work.header.funcIndex) { if (!fuseOk) goto NewWork; if (chan->p2pTailElem[elem->p2pType-1] < NCCL_MAX_WORK_ELEMENTS_P2P) { for (int e = -2 + chan->p2pTailElem[elem->p2pType-1]; e >= 0; e -= 2) { // Can't have multiple elements of the same ncclWork communicate with the // same peer otherwise they would attempt to use that connection concurrently. if (q->work.p2pElems[e].peer == elem->peer) goto NewWork; } int e = chan->p2pTailElem[elem->p2pType-1]; q->work.p2pElems[e] = *elem; // C++ struct assignment chan->p2pTailElem[elem->p2pType-1] += 2; return; } NewWork: finishWorkP2p(&q->work); } q = ncclMemoryStackAlloc(&comm->memScoped); q->work.header.type = ncclWorkTypeP2p; q->work.header.funcIndex = FUNC_INDEX_P2P; chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0; chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1; q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment chan->p2pTailElem[elem->p2pType-1] += 2; chan->nWork += 1; ncclIntruQueueEnqueue(&chan->workQueue, q); } static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { bool needed = true; NCCLCHECK(ncclProxySaveOp(comm, op, &needed)); if (needed) { struct ncclProxyOp* q = ncclMemoryPoolAlloc(&comm->memPool_ncclProxyOp, &comm->memPermanent); *q = *op; // C++ struct assignment ncclIntruQueueEnqueue(&plan->channels[op->channelId].proxyOpQueue, q); } return ncclSuccess; } // Put coll workelem & proxyOp in plan assuming nWorkBudget permits, so please // ensure *nWorkBudget >= nBids upon entry. static ncclResult_t addCollToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex, struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp, int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[] ) { struct ncclKernelPlan::Channel *chans = plan->channels; // Choose the `nBid` least loaded channels to do the work. This ensures // all bids go to different channels in case they need to synchronize. int least[/*nBid*/MAXCHANNELS]; least[0] = 0; int maxIndexInLeast = 0; size_t maxBytesInLeast = chans[0].collBytes; // Initialize least[] such that the first nBid channels are accounted for. for (int b=1; b < nBid; b++) { least[b] = b; if (maxBytesInLeast < chans[b].collBytes) { maxIndexInLeast = b; maxBytesInLeast = chans[b].collBytes; } } // Sort in the rest of the channels. If a channel has less work than the max // member of least[], replace that member and compute the new max. 
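// Worked example (editor's addition, illustrative): with nBid=2 and per-channel
// collBytes {c0:0, c1:96, c2:32, c3:8}, least[] starts as {0,1} with the max
// (96) at index 1; channel 2 (32<96) replaces it giving {0,2} with max 32,
// then channel 3 (8<32) replaces that giving {0,3} -- so the two bids land on
// the two emptiest channels, 0 and 3.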
for (int c=nBid; c < nCollChannels; c++) { if (chans[c].collBytes < maxBytesInLeast) { least[maxIndexInLeast] = c; maxBytesInLeast = chans[least[0]].collBytes; maxIndexInLeast = 0; for (int b=1; b < nBid; b++) { if (maxBytesInLeast < chans[least[b]].collBytes) { maxIndexInLeast = b; maxBytesInLeast = chans[least[b]].collBytes; } } } } uint64_t opCount = uint64_t(plan->collOpCount++)<<1 | 0; bytes /= nBid; for (int bid=0; bid < nBid; bid++) { int c = least[bid]; chans[c].collBytes += bytes; // Add work elem *nWorkBudget += chans[c].nWork; if (!regBufUsed) { appendWorkElemColl(comm, plan, c, funcIndex, workElem, bid); } else { // Buffer registration in play which could only for CollNet at the moment. struct ncclChannel* channel = &comm->channels[c]; struct ncclWorkElemReg workElemReg; workElemReg.elem = *workElem; // C++ struct assignment workElemReg.elem.regUsed = 1; for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { int peer = channel->collnetDirect.down[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Get intra-node slot workElemReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer workElemReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer } for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) { int peer = channel->collnetDirect.up[i]; if (peer == -1) break; int j = comm->rankToLocalRank[peer]; // Output buffer of root peer workElemReg.upOutputs[i] = regBufRecv[j]; } appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid); } *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork // Add proxy task. Empty collectives do not make it to the proxy thread // since they don't imply synchronization for the user like p2p. if (proxyOp->nsteps != 0) { struct ncclProxyOp tmp = *proxyOp; // C++ struct assignment tmp.channelId = c; tmp.opCount = opCount; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &tmp)); } } return ncclSuccess; } NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384); // Put p2p op in plan assuming there is space in nWorkBudget, so you must // ensure *nWorkBudget >= 1 upon entry. static ncclResult_t addP2pToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, bool fuseOk ) { struct ncclInfo info = { isSendNotRecv ? ncclFuncSend : ncclFuncRecv, isSendNotRecv ? "Send" : "Recv", nullptr, addr, bytes, ncclInt8, ncclSum, peer, comm, (cudaStream_t)0, /*Args*/1, 1 }; int channelId; NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId)); info.channelId = channelId; // 1 is connIndex struct ncclConnInfo* conn = isSendNotRecv ? &comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn; info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold()) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; struct ncclProxyOp proxyOp = {}; NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp)); struct ncclWorkElemP2p elem = {0}; elem.proto = info.protocol; elem.peer = peer; elem.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; elem.p2pType = isSendNotRecv ? 
ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv; elem.buffLo32 = uint32_t(reinterpret_cast(addr)); elem.buffHi32 = reinterpret_cast(addr)>>32; elem.countLo32 = uint32_t(bytes); elem.countHi32 = bytes>>32; elem.chunkSize = info.chunkSize; // computed by ncclProxyComputeP2p *nWorkBudget += plan->channels[channelId].nWork; appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk); *nWorkBudget -= plan->channels[channelId].nWork; // Calculate the opCount after appendWorkElemP2p since it will always return // with channel->nWork equal to one plus the work index this p2p settled in. proxyOp.opCount = uint64_t(plan->channels[channelId].nWork)<<1 | 1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); return ncclSuccess; } static void finishPlan(struct ncclKernelPlan* plan) { int channelUbound = 0; int channelCount = 0; uint64_t channelMask = 0; bool hasProxyOps = false; for (int c=0; c < MAXCHANNELS; c++) { struct ncclWorkList* tail = ncclIntruQueueTail(&plan->channels[c].workQueue); if (tail != nullptr) { channelUbound = c+1; channelCount += 1; channelMask |= 1ull<work.header.isLast = 1; finishWork(&tail->work); } hasProxyOps |= !ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue); } plan->channelUbound = channelUbound; plan->channelCount = channelCount; plan->channelMask = channelMask; plan->hasProxyOps = hasProxyOps; plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE); } static ncclResult_t registerIntraNodeBuffers( struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info, bool* outRegBufUsed, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS] ) { *outRegBufUsed = false; ncclResult_t result = ncclSuccess; #if CUDART_VERSION >= 11030 int localRank = comm->localRank; if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; struct HandlePair { cudaIpcMemHandle_t ipc[2]; // {send, recv} size_t offset[2]; // {send, recv} }; struct HandlePair handles[NCCL_MAX_LOCAL_RANKS]; CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback); CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback); void *baseSend, *baseRecv; size_t size; CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff)); handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff)); handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv; NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair))); // Open handles locally for (int i=0; i < comm->localRanks; i++) { if (i == localRank) { // Skip self outRegBufSend[i] = nullptr; outRegBufRecv[i] = nullptr; } else { for (int sr=0; sr < 2; sr++) { // Get base address of mapping void* base; CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess)); // Get real buffer address by adding offset in the mapping (sr==0 ? 
outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr]; // Enqueue reminder to close memory handle struct ncclPointerList* q = ncclMemoryPoolAlloc(&comm->memPool_ncclPointerList, &comm->memPermanent); q->ptr = base; ncclIntruQueueEnqueue(&plan->ipcMemQueue, q); } } } *outRegBufUsed = true; fallback: #endif return result; } NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0); static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport); static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps); static ncclResult_t scheduleCollTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget ) { struct ncclTasks* tasks = &comm->tasks; size_t bytePerChannel[/*collNetSupport*/2]; if (comm->channelSize > 0) { // Set by user bytePerChannel[/*collNetSupport=*/0] = comm->channelSize; bytePerChannel[/*collNetSupport=*/1] = comm->channelSize; } else { // Latency increases as scale increases // We would thus want to increase the chunk size to compensate for the lost efficiency bytePerChannel[/*collNetSupport=*/0] = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); bytePerChannel[/*collNetSupport=*/1] = 256<<10; // Hand-tuned } for (int collNetSupport=0; collNetSupport < 2; collNetSupport++) { while (tasks->collBytesTotal < bytePerChannel[collNetSupport]*comm->nChannels && bytePerChannel[collNetSupport] > NCCL_MIN_CHANNEL_SIZE) { // Reduce per-channel size so we utilize all channels. bytePerChannel[collNetSupport] /= 2; } } while (tasks->nTasksColl != 0) { struct ncclTaskColl* head = ncclIntruQueueHead(&tasks->collQueue); struct ncclInfo aggInfo = {}; aggInfo.comm = comm; aggInfo.coll = head->func; aggInfo.datatype = head->datatype; aggInfo.opFull = head->op; aggInfo.op = (ncclRedOp_t)(int)head->op.op; aggInfo.count = head->count; int nAggChannels = 0; int nAggOps = 1; struct ncclTaskColl* aggEnd = head->next; int collNetSupport = 0; NCCLCHECK(getCollNetSupport(&aggInfo, &collNetSupport)); // Find a range of ops that can be aggregated together. while (aggEnd != nullptr && aggEnd->func == aggInfo.coll && aggEnd->datatype == aggInfo.datatype && aggEnd->op.op == aggInfo.opFull.op) { aggInfo.count += aggEnd->count; int nc = DIVUP(aggEnd->count*ncclTypeSize(aggInfo.datatype), bytePerChannel[collNetSupport]); nc = std::max(1, std::min(nc, comm->nChannels)); nAggChannels += nc; nAggOps++; aggEnd = aggEnd->next; } if (nAggOps > 1) { NCCLCHECK(ncclInfoSetDerived(&aggInfo, comm->nRanks)); aggInfo.nChannels = std::min(comm->nChannels, nAggChannels); int opPerChannel = DIVUP(nAggChannels, aggInfo.nChannels); NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, opPerChannel)); } while (head != aggEnd) { struct ncclInfo info = {}; info.comm = comm; info.coll = head->func; info.sendbuff = head->sendbuff; info.recvbuff = head->recvbuff; info.count = head->count; info.root = head->root; info.datatype = head->datatype; info.opFull = head->op; // C++ struct assignment info.op = (ncclRedOp_t)(int)head->op.op; info.chunkSteps = head->chunkSteps; info.sliceSteps = head->sliceSteps; NCCLCHECK(ncclInfoSetDerived(&info, comm->nRanks)); if (nAggOps > 1) { int maxChannels = aggInfo.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? 
comm->nvlsChannels : comm->nChannels; info.nChannels = DIVUP(info.nBytes, bytePerChannel[collNetSupport]); info.nChannels = std::max(1, std::min(info.nChannels, maxChannels)); info.algorithm = aggInfo.algorithm; info.protocol = aggInfo.protocol; info.nThreads = aggInfo.nThreads; } int workFuncIndex; struct ncclWorkElem workElem = {}; struct ncclProxyOp proxyOp = {}; NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp)); if (*nWorkBudget < info.nChannels) return ncclSuccess; // Ensure room for addCollToPlan() bool regBufUsed = false; void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; if (plan->persistent && ncclParamGraphRegister() && info.algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, ®BufUsed, regBufSend, regBufRecv)); } int maxChannels = info.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels; NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp, maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv)); tasks->nTasksColl -= 1; tasks->collBytesTotal -= info.nBytes; ncclIntruQueueDequeue(&tasks->collQueue); head = ncclIntruQueueHead(&tasks->collQueue); plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads); if (!plan->kernelSpecialized) { plan->kernelFn = ncclKerns[workFuncIndex].kernelFn; plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized; } } } return ncclSuccess; } static size_t calcP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { size_t size = std::max(minSize, divUp(totalSize, minChannels)); int nChannels = minChannels; while (size > maxSize && nChannels <= maxChannels/2) { nChannels *= 2; size = divUp(totalSize, nChannels); } return alignUp(size, minSize); } static ncclResult_t scheduleP2pTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget ) { struct ncclTasks* tasks = &comm->tasks; int nRanks = comm->nRanks; struct ncclTasks::Peer* peers = tasks->peers; int const *sendOrder = tasks->p2pSendOrder; int const *recvOrder = tasks->p2pRecvOrder; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS); if (!plan->kernelSpecialized) { plan->kernelFn = ncclKerns[FUNC_INDEX_P2P].kernelFn; plan->kernelSpecialized = ncclKerns[FUNC_INDEX_P2P].specialized; } // Compute how much to split operations // Natural step size matching buffer steps. ssize_t stepSize = comm->p2pChunkSize; // Try to use all channels int nChannelsMax = comm->p2pnChannelsPerPeer; int nChannelsMin = nChannelsMax; if (comm->nNodes == 1) { // Try to use all channels, but one channel per operation. while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; // Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth. while (nChannelsMax*nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2; } bool fuseOk; // We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries. 
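  // calcP2pChunkSize() above picks the per-chunk size by doubling the channel
  // count until the chunk fits under maxSize. Purely as an illustration
  // (assuming a hypothetical stepSize of 512KB, i.e. minSize=64KB and
  // maxSize=16MB in the single-node case handled below):
  //   totalSize=64MB, minChannels=2 : size = max(64KB, divUp(64MB,2)) = 32MB > 16MB
  //   nChannels=4                   : size = divUp(64MB,4) = 16MB, fits
  //   result                        : alignUp(16MB, 64KB) = 16MB per chunk
  // The actual bounds are derived from comm->p2pChunkSize in the loop below.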
while (tasks->nTasksP2p != 0) { for (int i=0; i < tasks->p2pOrderSteps; i++) { int sendPeer = sendOrder[i]; int recvPeer = recvOrder[i]; if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false; struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL; struct ncclTaskP2p* recv = recvPeer != -1 ? ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL; if (sendPeer == comm->rank) { if (recvPeer != comm->rank) { WARN("Sendrecv plan not aligned for self"); return ncclInternalError; } if (send && recv == nullptr) { WARN("Trying to send to self without a matching recv"); return ncclInvalidUsage; } if (send == nullptr && recv) { WARN("Trying to recv to self without a matching send"); return ncclInvalidUsage; } } if (send != nullptr || recv != nullptr) { char* recvPtr = recv ? (char*)recv->buff : nullptr; char* sendPtr = send ? (char*)send->buff : nullptr; ssize_t recvBytes = recv ? recv->bytes : 0; ssize_t sendBytes = send ? send->bytes : 0; ssize_t minSize = stepSize/8; ssize_t maxSize = comm->nNodes > 1 ? stepSize : stepSize*32; ssize_t recvChunkBytesMax = calcP2pChunkSize(recvBytes, nChannelsMin, nChannelsMax, minSize, maxSize); ssize_t sendChunkBytesMax = calcP2pChunkSize(sendBytes, nChannelsMin, nChannelsMax, minSize, maxSize); // Zero size send/recv are syncs, encode here with -1. recvBytes = recv && recvBytes == 0 ? -1 : recvBytes; sendBytes = send && sendBytes == 0 ? -1 : sendBytes; // Advance to current chunk. Syncs will always have chunk=0 so no effect on the -1. if (recv) recvPtr += recv->chunk*recvChunkBytesMax; if (recv) recvBytes -= recv->chunk*recvChunkBytesMax; if (send) sendPtr += send->chunk*sendChunkBytesMax; if (send) sendBytes -= send->chunk*sendChunkBytesMax; do { ssize_t recvChunkBytes = std::min(recvBytes, recvChunkBytesMax); // -1 preserved ssize_t sendChunkBytes = std::min(sendBytes, sendChunkBytesMax); if (recvChunkBytes != 0) { if (recvChunkBytes == -1) recvChunkBytes = 0; if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, fuseOk)); fuseOk = true; recvPtr += recvChunkBytes; recvBytes -= recvChunkBytes; recv->chunk += 1; if (recvBytes <= 0) { recvBytes = 0; // in case still -1 ncclIntruQueueDequeue(&peers[recvPeer].recvQueue); tasks->nTasksP2p -= 1; } } if (sendChunkBytes != 0) { if (sendChunkBytes == -1) sendChunkBytes = 0; if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, fuseOk)); fuseOk = true; sendPtr += sendChunkBytes; sendBytes -= sendChunkBytes; send->chunk += 1; if (sendBytes <= 0) { sendBytes = 0; // in case still -1 ncclIntruQueueDequeue(&peers[sendPeer].sendQueue); tasks->nTasksP2p -= 1; } } } while (sendBytes != 0 || recvBytes != 0); } } } return ncclSuccess; } // Comparison of monotonic rolling counters. static inline bool rollingLess32(uint32_t a, uint32_t b) { constexpr uint32_t PositiveMax = uint32_t(-1)>>1; return a-b > PositiveMax; } static inline uint32_t rollingMin32(uint32_t a, uint32_t b) { constexpr uint32_t PositiveMax = uint32_t(-1)>>1; return (b-a <= PositiveMax) ? a : b; } // Spin until its safe to increase comm->workFifoSent to desiredSent. 
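// The fifo counters above are free-running 32-bit values compared "rolling":
// a is behind b exactly when the unsigned difference a-b lands in the upper
// half of the 32-bit range. A few illustrative cases:
//   rollingLess32(10u, 20u)            -> true  (10 is behind 20)
//   rollingLess32(0xFFFFFFF0u, 0x10u)  -> true  (0x10 is 0x20 ahead, across the wrap)
//   rollingMin32(0xFFFFFFF0u, 0x10u)   -> 0xFFFFFFF0 (the older counter)
// This stays correct as long as sent and acknowledged counters never drift
// more than 2^31 apart, which the bounded fifo depth keeps true.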
static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) { if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) { while (1) { // We have to poll for notifications from device. uint32_t* doneLive = comm->workFifoDone; uint32_t ackd[MAXCHANNELS]; for (int c=0; c < MAXCHANNELS; c++) { ackd[c] = __atomic_load_n(&doneLive[c], __ATOMIC_RELAXED); } // Compiler-only fence to prevent fusion of loops to encourage dense loads. __atomic_signal_fence(__ATOMIC_SEQ_CST); uint32_t ackdAll = comm->workFifoSent; for (int c=0; c < MAXCHANNELS; c++) { // ackdAll is min over all non-quiesced channels if (ackd[c] != comm->channels[c].workFifoSent) ackdAll = rollingMin32(ackdAll, ackd[c]); } // Compiler only fence to prevent fusion of loops to encourage dense stores. __atomic_signal_fence(__ATOMIC_SEQ_CST); for (int c=0; c < MAXCHANNELS; c++) { // Advance counter on quiesced channels so they don't lag behind // too far where they could get lost in 32-bit wraparound. if (ackd[c] == comm->channels[c].workFifoSent) { comm->channels[c].workFifoSent = ackdAll; __atomic_store_n(&doneLive[c], ackdAll, __ATOMIC_RELAXED); } } comm->workFifoAckdMin = ackdAll; // See if that was enough. if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break; sched_yield(); } } } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { bool persistent = plan->persistent; int channelUbound = plan->channelUbound; int nWork = 0; for (int c=0; c < channelUbound; c++) nWork += plan->channels[c].nWork; struct ncclWork* workHeap; if (!persistent) { workHeap = comm->workFifoHeap; } else { workHeap = ncclMemoryStackAlloc(&comm->memScoped, nWork); } uint32_t ixMask = persistent ? ~uint32_t(0) : comm->workFifoDepth-1; uint32_t ixSent; if (persistent) { ixSent = 0; } else { ixSent = comm->workFifoSent; // First work for a channel has to be at workHeap+blockIdx.x which means // we cannot tolerate fifo wraparound. So round up to the wrap boundary // if not doing so would incur crossing it. if (((ixSent + plan->channelCount-1) & ixMask) < (ixSent & ixMask)) { ixSent = (ixSent + ixMask) & ~ixMask; // Need to update workFifoSent so waitWorkFifoAvailable() knows we've // skipped those elements. Consider if all the channels report quiesced, // this way the skipped slots will be considered consumed as well. comm->workFifoSent = ixSent; } waitWorkFifoAvailable(comm, ixSent + nWork); } uint32_t ixHead = ixSent; ixSent += plan->channelCount; int channelsWithWork = 0; // number of channels below `c` with work structs. for (int c=0; c < channelUbound; c++) { struct ncclWorkList* q = ncclIntruQueueHead(&plan->channels[c].workQueue); // Offset of first work equals number of channels below with work. uint32_t ix = ixHead + channelsWithWork; channelsWithWork += q != nullptr ? 1 : 0; while (q != nullptr) { if (q->next != nullptr) { q->work.header.workNext = int32_t(ixSent & ixMask) - int32_t(ixHead & ixMask); } else { q->work.header.inFifo = !persistent ? 1 : 0; // Tell channel to ack us back ix+1 indicating that all slots up to and // including ix have been consumed. 
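        // Illustrative layout (hypothetical numbers): with ixHead=100 and three
        // channels holding 2, 0 and 1 work structs respectively, the slots are
        //   100 -> channel 0, first work   (one leading slot per channel with work)
        //   101 -> channel 2, first work
        //   102 -> channel 0, second work  (appended afterwards via ixSent++)
        // so each channel's last work gets doneAcks = its own slot + 1
        // (103 for channel 0, 102 for channel 2).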
q->work.header.doneAcks = ix+1; comm->channels[c].workFifoSent = ix+1; } workHeap[ix & ixMask] = q->work; // C++ struct assignment q = q->next; if (q != nullptr) ix = ixSent++; } } if (!persistent) { comm->workFifoSent = ixSent; if (comm->workFifoHeapGdrHandle != nullptr) wc_store_fence(); plan->workHead = &comm->devWorkFifoHeap[ixHead & ixMask]; } else { NCCLCHECK(ncclCudaMalloc(&plan->workHead, nWork)); NCCLCHECK(ncclCudaMemcpy(plan->workHead, workHeap, nWork)); } return ncclSuccess; } static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) { uint64_t collOpCount = comm->sharedRes->collOpCount; // Advance comm's collOpCount by number of colls in this plan. comm->sharedRes->collOpCount += plan->collOpCount; for (int c=0; c < plan->channelUbound; c++) { struct ncclProxyOp* q = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); uint64_t p2pOpCount = comm->sharedRes->p2pOpCount[c]; uint64_t nextP2pOpCount = p2pOpCount; while (q != nullptr) { struct ncclProxyOp* qNext = q->enqNext; // Ignoring the bottom tag bit, opCount's are zero-based within plan so // translate them to the tip of the comm's history. if (q->opCount & 1) { // p2p // p2pOpCount is monotonic increasing within a plan's channel so just // remember last value to compute max. nextP2pOpCount = p2pOpCount + (q->opCount>>1); nextP2pOpCount += 1; // +1 to ensure next plan doesn't collide q->opCount = (p2pOpCount<<1) + q->opCount; } else { // coll q->opCount = (collOpCount<<1) + q->opCount; } NCCLCHECK(ncclProxySaveOp(comm, q, nullptr)); // May overwrite enqNext. if (!plan->persistent) { // Non-persistent kernels have their memory reclaimed after upload. ncclMemoryPoolFree(&plan->memPool_ncclProxyOp, q); } q = qNext; } // Advance channel's p2pOpCount by number of p2p's in this plan channel. comm->sharedRes->p2pOpCount[c] = nextP2pOpCount; } return ncclSuccess; } static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) { NCCLCHECK(uploadProxyOps(comm, plan)); NCCLCHECK(ncclProxyStart(comm)); if (!plan->persistent) { // Notify main thread of our reclaiming. This will reclaim plan concurrently. 
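  // uploadProxyOps() above rebased the plan-local opCounts onto the comm's
  // running counters; the low bit still tags the operation kind (even = coll,
  // odd = p2p). For example, a plan-local p2p opCount of (3<<1|1)=7 on a
  // channel whose p2pOpCount is 40 becomes (40<<1)+7 = ((40+3)<<1|1) = 87.
  // The enqueue below posts plan->reclaimer (fn = reclaimPlan, set in
  // ncclLaunchPrepare) to comm->callbackQueue; the owning thread polls that
  // queue via ncclCommPollCallbacks() and returns the plan to its memory pools.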
ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); } return ncclSuccess; } static void CUDART_CB hostStreamPlanCallback(void *plan_) { NVTX3_FUNC_RANGE_IN(nccl_domain); struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; ncclResult_t result = hostStreamPlanTask(plan->comm, plan); if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { comm->persistentRefs -= 1; NCCLCHECK(ncclCudaFree(plan->workHead)); while (!ncclIntruQueueEmpty(&plan->ipcMemQueue)) { struct ncclPointerList* q = ncclIntruQueueDequeue(&plan->ipcMemQueue); CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr)); ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q); } } ncclMemoryPoolTakeAll(&comm->memPool_ncclProxyOp, &plan->memPool_ncclProxyOp); ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); return ncclSuccess; } static void persistentDestructor(void* plans_) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plans_; struct ncclComm* comm = plan->comm; while (plan != nullptr) { struct ncclKernelPlan* next = plan->next; ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); plan = next; } } ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclTasks* tasks = &comm->tasks; bool persistent = ncclCudaGraphValid(tasks->capturingGraph); int nPlans = 0; // Poll for callbacks sent to us from other threads. Typically these free // resources from to our memory pools. NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); // We already have one frame present which holds all of our tasks (which we // are about to schedule). Now push an additional frame for allocating // work structs (see appendWorkElem() variants all use scoped allocation). ncclMemoryStackPush(&comm->memScoped); if (tasks->nTasksColl + tasks->nTasksP2p != 0) { do { struct ncclKernelPlan* plan = ncclMemoryPoolAlloc(&comm->memPool_ncclKernelPlan, &comm->memPermanent); ncclIntruQueueEnqueue(&comm->planQueue, plan); nPlans += 1; plan->comm = comm; plan->reclaimer.fn = reclaimPlan; plan->persistent = persistent; // Non-persistent kernels fill up at most half of our fifo per kernel. int nWorkBudget = plan->persistent ? INT_MAX : comm->workFifoDepth/2; int nWorkBudgetOld = nWorkBudget; // Drain coll tasks first. This is essential since we partition tasks based // on the work budget and p2p work isn't collective. If we were to drain p2p // first, the place where we cut the kernel could vary by rank which would // cause the "shortest channel first" channel picker to have divergent results. if (tasks->nTasksColl != 0) { NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &nWorkBudget), result, failure); } // And only drain p2p tasks once colls are depleted. if (tasks->nTasksColl == 0 && tasks->nTasksP2p != 0) { NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &nWorkBudget), result, failure); } if (nWorkBudget == nWorkBudgetOld) { // We weren't able to fit any tasks into our budget which means now we're // stuck in an infinite loop. We defer this check until here, instead of // doing it in comm init, to permit testing with insanely shallow queues // for cases where that's expected to still work (e.g. few channels). WARN("'NCCL_WORK_FIFO_DEPTH=%d' is too small. 
Minimum value is %d", comm->workFifoDepth, 2*MAXCHANNELS); result = ncclInvalidUsage; goto failure; } finishPlan(plan); } while (tasks->nTasksColl + tasks->nTasksP2p != 0); struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue); comm->unlaunchedPlansHead = planHead; // Semantically we want these dependencies for the kernels launched: // 1. Launch host task on hostStream. // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...} // 3. {deviceStream, userStream[i]...} depend on kernel. // We achieve this by: // 1. userStream[0] waits on deviceStream // 2. deviceStream waits on each of userStream[1...] // 3. host task launch on hostStream // 4. userStream[0] waits on hostStream // 5. kernel launch on userStream[0] // 6. deviceStream waits on userStream[0] // 7. userStream[1...] each waits on deviceStream // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires // at least one of the two streams to be strong-stream. cudaStream_t launchStream = tasks->streams->stream; NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure); // Create dependency for device stream on user streams. First from extra user // streams to deviceStream. Then deviceStream to first user stream. for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) { NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); } NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); } NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); } } if (persistent) { comm->persistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(tasks->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } if (false) { failure: ncclMemoryStackPop(&comm->memScoped); // deallocate ncclWork's } return result; } ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { // This code is called after we've checked in to the intra-process barrier // but before launching the kernel. We are not allowed to call CUDA unless the // kernel launch is captured. 
NCCLCHECK(uploadWork(comm, plan)); return ncclSuccess; } #if CUDART_VERSION >= 12000 // NCCL uses the "Remote" Mem Sync domain by default NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { struct ncclTasks* tasks = &comm->tasks; void *fn = plan->kernelFn; cudaStream_t launchStream = tasks->streams->stream; dim3 grid = {(unsigned)plan->channelCount, 1, 1}; dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; size_t smem = ncclShmemDynamicSize(comm->cudaArch); void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead}; #if CUDART_VERSION >= 11080 int driverVersion; NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); if (driverVersion >= 11080) { int compCap = comm->compCap; unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0; cudaLaunchConfig_t launchConfig = {0}; cudaLaunchAttribute launchAttrs[3]; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we * can group together several blocks within the Grid, called * Thread Block Clusters. * Clusters enable multiple thread blocks running concurrently * across multiple SMs to synchronize and collaboratively fetch * and exchange data. A cluster of blocks are guaranteed to be * concurrently scheduled onto a group of SMs. * The maximum value is 8 and it must be divisible into the grid dimensions */ if (clusterSize) { // Grid dimension must be divisible by clusterSize if (grid.x % clusterSize) clusterSize = 1; launchAttrs[attrs].id = cudaLaunchAttributeClusterDimension; launchAttrs[attrs++].val.clusterDim = {clusterSize, 1, 1}; launchAttrs[attrs].id = cudaLaunchAttributeClusterSchedulingPolicyPreference; launchAttrs[attrs++].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread; } #if CUDART_VERSION >= 12000 if (compCap >= 90 && driverVersion >= 12000) { // Set the NCCL Mem Sync domain on CUDA 12.0 and later (sm90) launchAttrs[attrs].id = cudaLaunchAttributeMemSyncDomain; launchAttrs[attrs++].val.memSyncDomain = (cudaLaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif launchConfig.gridDim = grid; launchConfig.blockDim = block; launchConfig.dynamicSmemBytes = smem; launchConfig.attrs = launchAttrs; launchConfig.numAttrs = attrs; launchConfig.stream = launchStream; CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args)); return ncclSuccess; } #endif // Standard kernel launch CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream)); return ncclSuccess; } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) { // If this isn't being captured and there aren't any CUDA graphs alive // then we don't need to do our proxyOp pushing on the host stream. NCCLCHECK(hostStreamPlanTask(comm, plan)); } return ncclSuccess; } ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclTasks* tasks = &comm->tasks; tasks->collBytesTotal = 0; // Just in case subtraction during scheduleCollTasksToPlan() doesn't get to 0 // Deallocate ncclWork's. This frame exists so long as ncclLaunchPrepare // succeeded, and if it ncclLaunchPrepare didn't succeed we wouldn't be here. 
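  // This pop matches the ncclMemoryStackPush(&comm->memScoped) issued at the
  // top of ncclLaunchPrepare(): it releases in one shot the ncclWork lists
  // built for this round of plans, while the task structs themselves live in
  // the enclosing stack frame.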
ncclMemoryStackPop(&comm->memScoped); if (!ncclIntruQueueEmpty(&comm->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. ncclIntruQueueConstruct(&comm->planQueue); cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch // Create dependency for deviceStream on launchStream. We know that deviceStream // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), // so we can say that launchStream subsumes it. NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); resume1: // Create dependency for other user streams (skip launch stream) on deviceStream. // Again, the user streams haven't been touched since deviceStream waited on them // so we can say they are subsumed by deviceStream. struct ncclCudaStreamList* sl = tasks->streams->next; tasks->streams = nullptr; // Reset comm->tasks.streams to empty. while (sl != nullptr) { NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); resume2: sl = sl->next; } // Release device stream as acquired in ncclLaunchPrepare() NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); resume3:; } return result; } /*****************************************************************************/ /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) { // Translate ncclAvg and PreMulSum ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; *collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype]; return ncclSuccess; } // numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency. static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) { struct ncclComm* comm = info->comm; if (comm->nRanks == 1) { info->algorithm = NCCL_ALGO_RING; info->protocol = NCCL_PROTO_SIMPLE; } else { float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. // Find algorithm / protocol. info->algorithm = -1; info->protocol = -1; int nAlgos = NCCL_NUM_ALGORITHMS; for (int a=0; adatatype, info->opFull.op)) continue; if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue; if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue; for (int p=0; p= 0 && time < minTime) { info->algorithm = a; info->protocol = p; minTime = time; } } } if (info->algorithm == -1 || info->protocol == -1) { WARN("Error : no algorithm/protocol available"); return ncclInternalError; } //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); } int nc = (info->nChannels > 0) ? 
info->nChannels : comm->nChannels; int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // CollNet channel tuning int ncSwitch = 16; bool flag = true; while (ncSwitch >= 1 && flag) { while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) { if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2; nc--; } ncSwitch /= 2; } } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { // NVLS should not need more than 16 channels to get peak BW. nc = comm->nvlsChannels; } else { // Ring/Tree channel tuning while (info->nBytes < nc*nt*threadThreshold) { if (nc >= 2) nc--; else if ((nt % 128) == 0) nt/=2; else break; } } if (info->protocol == NCCL_PROTO_SIMPLE) { if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync // More threads or sync warps needed due to split thread model if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; } nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; info->nChannels = nc; info->nThreads = nt; return ncclSuccess; } static ncclResult_t getPatternInfo(struct ncclInfo* info) { switch (info->coll) { case ncclFuncBroadcast: info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break; case ncclFuncReduce: info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; case ncclFuncReduceScatter: case ncclFuncAllGather: info->pattern = info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : ncclPatternRing; break; case ncclFuncAllReduce: info->pattern = info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls : info->algorithm == NCCL_ALGO_NVLS_TREE ? ncclPatternNvlsTree : info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect : info->algorithm == NCCL_ALGO_COLLNET_CHAIN ? ncclPatternCollnetChain : info->algorithm == NCCL_ALGO_TREE ? 
ncclPatternTreeUpDown : ncclPatternRingTwice; break; default: WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); return ncclInternalError; } return ncclSuccess; } static ncclResult_t getLoopInfo(struct ncclInfo* info) { switch (info->pattern) { case ncclPatternTreeUp: case ncclPatternTreeDown: case ncclPatternTreeUpDown: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: case ncclPatternCollnetChain: info->nstepsPerLoop = info->nchunksPerLoop = 1; break; case ncclPatternNvls: info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break; case ncclPatternCollnetDirect: info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collnetDirect.nHeads; break; case ncclPatternRing: info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; case ncclPatternRingTwice: info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; case ncclPatternNvlsTree: info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].nvls.nHeads; break; default: WARN("Unknown pattern %d", info->pattern); return ncclInternalError; } return ncclSuccess; } static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) { int collNetTypeSupport = 0; // Check whether algo and proto have been preset (as in aggregation case) // If so, skip the calculation if (info->nChannels > 0 && info->nThreads > 0) goto comp_next; NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport)); NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1)); comp_next: // Set nstepsPerLoop and nchunksPerLoop NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); work->sendbuff = info->sendbuff; work->recvbuff = info->recvbuff; work->root = info->root; work->count = info->count; work->nChannels = info->nChannels; work->nWarps = info->nThreads / WARP_SIZE; work->redOpArg = info->opFull.scalarArg; work->redOpArgIsPtr = info->opFull.scalarArgIsPtr; if (info->comm->nRanks == 1) { // one-rank reduce index *workFuncIndex = 1 + int(info->datatype); return ncclSuccess; } *workFuncIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? 
info->sliceSteps : 1; int chunkSize = stepSize*chunkSteps; // Compute lastChunkSize if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) { if (info->pattern == ncclPatternTreeUpDown) { // Optimize chunkSize / nSteps while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; } // Use lastChunkSize as chunkSize work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Optimize chunkSize / nSteps while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*64 && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*info->comm->channels[0].collnetDirect.nHeads*chunkSize) < info->comm->channels[0].collnetDirect.depth*8 && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); // Set direct direction for broadcast-gather (read or write) work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; } else if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; chunkSize = std::min(256*1024, stepSize*chunkSteps); while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*64 && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth*8 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_NVLS) { int maxChunkSize = 131072; if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; // Use uint64_t so that concurrentOps*chunkSize*X does not overflow uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads; if ((info->nBytes < (64 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((info->nBytes < (8 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; if ((info->nBytes < (2 * (concurrentOps*chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { // Use uint64_t so that concurrentOps*chunkSize*X does not overflow uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads; if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144; if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072; if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; if ((info->nBytes < (1 * (concurrentOps*chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if 
(info->protocol == NCCL_PROTO_LL) { const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; work->lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); ALIGN_SIZE(work->lastChunkSize, info->nThreads*sizeof(uint64_t)); work->lastChunkSize /= ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { int nNodes = info->comm->nNodes; float ppn = info->comm->nRanks / (float)nNodes; float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn; while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize work->lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); } // Compute nSteps for proxies int chunkEffectiveSize = chunkSize; if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2; if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize))); proxyOp->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; proxyOp->sliceSteps = sliceSteps; proxyOp->chunkSteps = chunkSteps; proxyOp->chunkSize = chunkSize; proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum info->op; proxyOp->pattern = info->pattern; proxyOp->root = info->root; // This is used by P2P to reduce the receive buffer size. 
We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up proxyOp->nbytes = stepSize*proxyOp->sliceSteps; TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p", proxyOp->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, nLoops, proxyOp->nsteps, chunkSize, info->comm); return ncclSuccess; } static ncclResult_t hostToDevRedOp( ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm ) { union { int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; half f16; #if defined(__CUDA_BF16_TYPES_EXIST__) __nv_bfloat16 bf16; #endif float f32; double f64; void *ptr; }; u64 = 0; opFull->scalarArgIsPtr = false; switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; case ncclMax: opFull->op = ncclDevMax; break; case ncclMin: opFull->op = ncclDevMin; break; case ncclAvg: switch ((int)datatype) { case ncclInt8: case ncclInt32: case ncclInt64: case ncclUint8: case ncclUint32: case ncclUint64: opFull->op = ncclDevSumPostDiv; u64 = comm->nRanks; break; case ncclFloat16: opFull->op = ncclDevPreMulSum; f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x break; #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: opFull->op = ncclDevPreMulSum; bf16 = __float2bfloat16(float(1.0/comm->nRanks)); break; #endif case ncclFloat32: opFull->op = ncclDevPreMulSum; f32 = float(1.0/comm->nRanks); break; case ncclFloat64: opFull->op = ncclDevPreMulSum; f64 = 1.0/comm->nRanks; break; } opFull->scalarArgIsPtr = false; opFull->scalarArg = u64; break; default: // user created int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps); ncclUserRedOp *user = &comm->userRedOps[ix]; if (datatype != user->datatype) { WARN("Data type supplied to user-created ncclRedOp_t does not match type " "given to reduction operation"); return ncclInvalidArgument; } *opFull = user->opFull; break; } return ncclSuccess; } // Converts `info` to a task and adds it to `comm->tasks`. The exception is with // single rank communicators, collectives are issued as `ncclMemcpyAsync`s and // thus don't need a task. static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* info) { ncclTasks *tasks = &comm->tasks; if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { int peer = info->root; ssize_t nBytes = info->count*ncclTypeSize(info->datatype); bool isSendNotRecv = info->coll == ncclFuncSend; // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. ncclGroupCommJoin(info->comm); struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); p2p->buff = (void*)info->recvbuff; p2p->bytes = nBytes; p2p->chunk = 0; ncclIntruQueueEnqueue( isSendNotRecv ? &tasks->peers[peer].sendQueue : &tasks->peers[peer].recvQueue, p2p); tasks->nTasksP2p += 1; // Mark channels that need pre-connect if (comm->rank != peer) { int channelBaseId; NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId)); if (!(isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen)) { (isSendNotRecv ? 
tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen) = true; for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { int channelId; NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId)); if (isSendNotRecv) { if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector comm->connectRecv[peer] |= (1UL<op, info->datatype, comm)); // User-defined reduction ops may need alter the data even for unitary reductions if (comm->nRanks == 1 && opFull.op < ncclDevPreMulSum) { if (info->sendbuff != info->recvbuff) { size_t bytes = info->count*ncclTypeSize(info->datatype); CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, bytes, cudaMemcpyDeviceToDevice, info->stream)); } return ncclSuccess; } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. ncclGroupCommJoin(info->comm); struct ncclTaskColl* t = ncclMemoryStackAlloc(&comm->memScoped); t->func = info->coll; t->sendbuff = info->sendbuff; t->recvbuff = info->recvbuff; t->count = info->count; t->root = info->root; t->datatype = info->datatype; t->op = opFull; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; ncclIntruQueueEnqueue(&tasks->collQueue, t); tasks->collBytesTotal += info->nBytes; tasks->nTasksColl += 1; } } if (info->stream != tasks->streamRecent || tasks->streams == nullptr) { tasks->streamRecent = info->stream; struct ncclCudaStreamList* l = tasks->streams; while (true) { if (l == nullptr) { // Got to the end, this must be a new stream. struct ncclCudaGraph graph; NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)) if (tasks->streams != nullptr && !ncclCudaGraphSame(tasks->capturingGraph, graph)) { WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); return ncclInvalidUsage; } tasks->capturingGraph = graph; // C++ struct assignment // Add stream to list l = ncclMemoryStackAlloc(&comm->memScoped); l->stream = info->stream; l->next = tasks->streams; tasks->streams = l; break; } if (l->stream == info->stream) break; // Already seen stream. 
l = l->next; } } return ncclSuccess; } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECK(ncclGroupStartInternal()); ncclResult_t ret = ncclSuccess; int devOld = -1; NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, fail); // Check whether communicator is ready to communicate NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail); if (info->comm->checkPointers) { CUDACHECKGOTO(cudaGetDevice(&devOld), ret, fail); CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, fail); } NCCLCHECKGOTO(ArgsCheck(info), ret, fail); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast(info->sendbuff), reinterpret_cast(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream); NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail); exit: if (devOld != -1) CUDACHECK(cudaSetDevice(devOld)); ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change * so we have to check state here. */ if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) }; return ret; fail: if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret); goto exit; } NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm")); /* join init thread before creating PreMulSum op. 
*/ NCCLCHECK(ncclCommEnsureReady(comm)); if (comm->userRedOpFreeHead == comm->userRedOpCapacity) { // double capacity and resize int cap = 2*comm->userRedOpCapacity; if (cap < 4) cap = 4; ncclUserRedOp *ops = new ncclUserRedOp[cap]; std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp)); for(int ix=comm->userRedOpCapacity; ix < cap; ix++) ops[ix].freeNext = ix + 1; delete[] comm->userRedOps; comm->userRedOps = ops; comm->userRedOpCapacity = cap; } // pop from free list int ix = comm->userRedOpFreeHead; ncclUserRedOp *user = &comm->userRedOps[ix]; comm->userRedOpFreeHead = user->freeNext; user->freeNext = -1; // allocated user->datatype = datatype; user->opFull.op = ncclDevPreMulSum; if (residence == ncclScalarHostImmediate) { user->opFull.scalarArgIsPtr = false; std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype)); } else { user->opFull.scalarArgIsPtr = true; user->opFull.scalarArg = reinterpret_cast(scalar); } *op = ncclRedOp_t(int(ncclNumOps) + ix); *op = ncclUserRedOpMangle(comm, *op); TRACE_CALL("ncclRedOpCreatePreMulSum(%d,%p,%d,%d,%p)", *op, scalar, datatype, residence, comm); return ncclSuccess; } NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm); ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { if (0 <= int(op) && int(op) < int(ncclNumOps)) { WARN("ncclRedOpDestroy : operator is a NCCL builtin."); return ncclInvalidArgument; } if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) { WARN("ncclRedOpDestroy : operator is garbage."); return ncclInvalidArgument; } if (comm == NULL) { WARN("ncclRedOpDestroy : invalid communicator passed."); return ncclInvalidArgument; } int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps); if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) { WARN("ncclRedOpDestroy : operator unknown to this communicator."); return ncclInvalidArgument; } // push to free list comm->userRedOps[ix].freeNext = comm->userRedOpFreeHead; comm->userRedOpFreeHead = ix; TRACE_CALL("ncclRedOpDestroy(%d,%p)", op, comm); return ncclSuccess; } nccl-2.18.3-1/src/graph/000077500000000000000000000000001444201535000145505ustar00rootroot00000000000000nccl-2.18.3-1/src/graph/connect.cc000066400000000000000000000441211444201535000165120ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "graph.h" #include "trees.h" #include "rings.h" #include "topo.h" /******************************************************************/ /********************* Internode connection ***********************/ /******************************************************************/ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->topo->nodes[GPU].count; int nChannels = comm->nChannels; for (int c=0; cchannels+c; channel->ring.prev = channel->ring.next = -1; channel->tree.up = -1; channel->collnetChain.up = -1; for (int i=0; itree.down[i] = -1; for (int i=0; icollnetChain.down[i] = -1; channel->collnetDirect.out = -1; channel->collnetDirect.headRank = -1; channel->collnetDirect.nHeads = 0; channel->collnetDirect.shift = 0; for (int i=0; icollnetDirect.up[i] = -1; for (int i=0; icollnetDirect.down[i] = -1; int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks; int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks; int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks; int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks; for (int i=0; iringRecv[c] = ringIntra[0]; topoRanks->ringSend[c] = ringIntra[localRanks-1]; channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1]; channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1]; } if (treeIntra[i] == rank) { int parentIndex = 0; int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0; topoRanks->treeToParent[c] = treeIntra[parentIndex]; topoRanks->treeToChild0[c] = treeIntra[child0Index]; topoRanks->treeToChild1[c] = treeIntra[child1Index]; channel->tree.up = i == 0 ? -1 : treeIntra[i-1]; channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; } if (collNetIntra[i] == rank) { channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1]; channel->collnetChain.down[0] = i == localRanks-1 ? 
-1 : collNetIntra[i+1]; } } topoRanks->ringPrev[c] = channel->ring.prev; topoRanks->ringNext[c] = channel->ring.next; topoRanks->nvlsHeads[c] = nvlsIntra[0]; } // Duplicate channels rings/trees struct ncclChannel* channel0 = comm->channels; struct ncclChannel* channel1 = channel0+nChannels; memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); return ncclSuccess; } static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) { int nChannels = comm->nChannels; int nNodes = comm->nNodes; for (int c=0; cnNodes; int* send = ringSend+c*comm->nNodes; int* prev = ringPrev+c*comm->nRanks; int* next = ringNext+c*comm->nRanks; struct ncclChannel* channel0 = comm->channels+c; struct ncclChannel* channel1 = channel0+nChannels; for (int n=0; nrank == recvRank) { channel0->ring.prev = prevSendRank; channel1->ring.prev = prevSendRank; } int sendRank = send[n]; int nextRecvRank = recv[(n+1)%nNodes]; next[sendRank] = nextRecvRank; if (comm->rank == sendRank) { channel0->ring.next = nextRecvRank; channel1->ring.next = nextRecvRank; } } TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next); TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next); } return ncclSuccess; } static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) { for (int n=0; nup = indexes[u]; return ncclSuccess; } static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { if (d == -1) return ncclSuccess; int x = 0; while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++; if (x == NCCL_MAX_TREE_ARITY) { WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]); return ncclInternalError; } tree->down[x] = indexes[d]; return ncclSuccess; } static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node; // Compute tree depth. Not an exact value but a good approximation in most // cases int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType; int* ttp, *ttc0, *ttc1; NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType)); for (int c=0; cchannels+c; struct ncclChannel* channel1 = channel0+nChannels; ttp = treeToParent+c*comm->nNodes; ttc0 = treeToChild0+c*comm->nNodes; ttc1 = treeToChild1+c*comm->nNodes; if (comm->rank == ttp[node]) { NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u)); NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? 
ttc0 : ttc1, t1u)); } if (comm->rank == ttc0[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0)); NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0)); } if (comm->rank == ttc1[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1)); NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1)); } if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) { INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); } channel0->tree.depth = channel1->tree.depth = depth; } return ncclSuccess; } static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) { int rank = comm->rank; int localRanks = comm->localRanks; int nHeads = 0; int *heads; NCCLCHECK(ncclCalloc(&heads, localRanks)); // Find all head ranks // Head index is always 0 for (int c=0; cnChannels; c++) { int* collNetIntra = collNetGraph->intra+c*localRanks; int head = collNetIntra[0]; for (int h=0; hnChannels; c++) { struct ncclChannel* channel = comm->channels+c; char line[1024]; sprintf(line, "CollNet channel %d rank %d ", c, rank); int nDown = 0; for (int i=0; icollnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks int* collNetIntra = collNetGraph->intra+i*localRanks; sprintf(line+strlen(line), "down "); for (int r=0; rcollnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers sprintf(line+strlen(line), " %d ", collNetIntra[r]); } sprintf(line+strlen(line), "nDown %d ", nDown); break; } } // Connect to all heads int nUp = 0; sprintf(line+strlen(line), "up "); for (int h=0; hcollnetDirect.up[nUp++] = heads[h]; sprintf(line+strlen(line), " %d ", heads[h]); } channel->collnetDirect.nHeads = nHeads; channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2; sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift); INFO(NCCL_GRAPH, "%s", line); channel->collnetChain.depth = comm->nRanks/comm->nNodes; } for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } free(heads); return ncclSuccess; } static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) { int nHeads = nvlsGraph->nChannels; int headRank = -1; for (int h=0; hintra[h*comm->localRanks] == comm->rank) headRank = h; } if (nHeads == 0) { comm->nvlsChannels = 0; return ncclSuccess; } for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.nHeads = nHeads; for (int h=0; hnvls.up[h] = comm->nRanks+1+h; for (int h=nHeads; hnvls.up[h] = -1; channel->nvls.down = comm->nRanks+1+headRank; channel->nvls.out = -1; // NVLS+SHARP not yet implemented. 
channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; channel->nvls.node = comm->node; channel->nvls.nNodes = comm->nNodes; } if (comm->nNodes == 1) return ncclSuccess; // Connect Trees int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1; int pc0, pc1; // ignored NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0, &tree1Parent, &tree1Child0, &tree1Child1, &pc1)); int* heads = NULL; int treeUp[2] = { -1, -1 }; int treeDown0[2] = { -1, -1 }; int treeDown1[2] = { -1, -1 }; if (comm->node == 0) { for (int h=0; hnNodes; for (int n=0; nnNodes && n<20; n++) { sprintf(line+strlen(line), " %2d", heads[n]); } INFO(NCCL_INIT, "%s", line); } } // Find the heads where I'm the head rank and retain tree up/down for (int h=0; hnNodes; if (heads[comm->node] == comm->rank) { treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent]; treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0]; treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1]; treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent]; treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0]; treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1]; break; } } // Set prev/next in all channels (NVLS compute channels work // orthogonally to NVLS search channels). for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.treeUp = treeUp[c%2]; channel->nvls.treeDown[0] = channel->nvls.down; int ix = 1; if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2]; if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2]; } struct ncclNvls* nvls0 = &comm->channels[0].nvls; struct ncclNvls* nvls1 = &comm->channels[1].nvls; INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d", nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp, nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp); return ncclSuccess; } // Legacy naming NCCL_PARAM(MinNrings, "MIN_NRINGS", -2); NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2); // New naming NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2); NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2); int ncclMinNchannels() { int minNchannels = 0; if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); if (minNchannels > MAXCHANNELS) { WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); minNchannels = MAXCHANNELS; } if (minNchannels < 0) minNchannels = 0; return minNchannels; } int ncclMaxNchannels() { int maxNchannels = MAXCHANNELS; if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings(); if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); maxNchannels = 1; } return maxNchannels; } static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) { int nranks = comm->nRanks; int c; for (c=start; cchannels+c, comm->channels+c-start, sizeof(struct ncclChannel)); } return c; } ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs) { // Gather data from all ranks int *ringRecv, *ringSend, *ringPrev, *ringNext, 
*treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads; int nranks = comm->nRanks; int nNodes = comm->nNodes; int nChannels = comm->nChannels; NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS)); for (int c=0; cringRecv[c]; ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c]; treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c]; treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c]; treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c]; nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c]; } for (int r=0; rringPrev[c]; ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c]; } } // Connect rings and trees. This should also duplicate the channels. NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext)); NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns)); NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS])); // Duplicate ringPrev/ringNext for ncclBuildRing memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); // Duplication should be complete now nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); // Setup CollNet if (comm->collNetSupport == 1) { struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } NCCLCHECK(connectCollNet(comm, collNetGraph)); } // Use 4 compute channels per search channel to reach peak BW on <8 PPN if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= MAXCHANNELS) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // We permit combining max, then min, to only use the first channels, then duplicate them. if (comm->sharedRes->owner != comm) { /* child comm #channels cannot exceed top parent #channels. 
*/ nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext); } else { nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext); } // Create rings array and check all is fine NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); free(ringRecv); free(ringSend); free(ringPrev); free(ringNext); free(treeToParent); free(treeToChild0); free(treeToChild1); free(nvlsHeads); return ncclSuccess; } nccl-2.18.3-1/src/graph/paths.cc000066400000000000000000000705511444201535000162060ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "graph.h" #include "topo.h" #include "comm.h" #include "net.h" #include "channel.h" // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths struct ncclTopoNodeList { struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES]; int count; }; static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) { for (int i=0; inodes[t].count; i++) { if (system->nodes[t].nodes[i].id == id) { *path = node->paths[t]+i; return ncclSuccess; } } WARN("Could not find node of type %d id %lx", t, id); return ncclInternalError; } NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0); static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) { if (baseNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); } // breadth-first search to set all paths to that node in the system struct ncclTopoNodeList nodeList; struct ncclTopoNodeList nextNodeList; nodeList.count = 1; nodeList.list[0] = baseNode; nextNodeList.count = 0; struct ncclTopoLinkList* basePath; NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; basePath->bw = LOC_BW; basePath->type = PATH_LOC; while (nodeList.count) { nextNodeList.count = 0; for (int n=0; ntype, baseNode->id, &path)); for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; struct ncclTopoNode* remNode = link->remNode; if (remNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count)); } struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); float bw = std::min(path->bw, link->bw); // allow routing through a GPU only as 1 hop if (node != baseNode && node->type == GPU && (ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue; if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) { // Find reverse link for (int l=0; lnlinks; l++) { if (remNode->links[l].remNode == node) { remPath->list[0] = remNode->links+l; break; } } if (remPath->list[0] == NULL) { WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx", 
remNode->type, remNode->id, remNode->nlinks, node->type, node->id); return ncclInternalError; } // Copy the rest of the path for (int i=0; icount; i++) remPath->list[i+1] = path->list[i]; remPath->count = path->count + 1; remPath->bw = bw; // Start with path type = link type. PATH and LINK types are supposed to match. // Don't consider LINK_NET as we only care about the NIC->GPU path. int type = link->type == LINK_NET ? LINK_LOC : link->type; // Differentiate between one and multiple PCI switches if (node->type == PCI && remNode->type == PCI) type = PATH_PXB; // Consider a path going through the CPU as PATH_PHB if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB; // Set 1 hop NVLink as NVB if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB; remPath->type = std::max(path->type, type); // Add to the list for the next iteration if not already in the list int i; for (i=0; itype], node->id); #else sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id); int offset = strlen(line); #endif for (int t=0; tpaths[t] == NULL) continue; for (int n = 0; nnodes[t].count; n++) { #ifdef ENABLE_TRACE line[0] = 0; int offset = 0; for (int i=0; ipaths[t][n].count; i++) { struct ncclTopoLink* link = node->paths[t][n].list[i]; struct ncclTopoNode* remNode = link->remNode; sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); offset = strlen(line); } INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw); #else sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]); offset = strlen(line); #endif } } #ifndef ENABLE_TRACE INFO(NCCL_GRAPH, "%s", line); #endif } ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) { for (int i=0; inodes[GPU].count; i++) { printNodePaths(system, system->nodes[GPU].nodes+i); } for (int i=0; inodes[NET].count; i++) { printNodePaths(system, system->nodes[NET].nodes+i); } return ncclSuccess; } static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) { // Find the closest CPU to a GPU int minHops = 0; int localCpu = -1; struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU]; for (int c=0; cnodes[CPU].count; c++) { int hops = paths[c].count; if (minHops == 0 || hops < minHops) { localCpu = c; minHops = hops; } } if (localCpu == -1) { WARN("Error : could not find CPU close to GPU %d", gpu); return ncclInternalError; } *retCpu = localCpu; return ncclSuccess; } static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; int l=0; // Node 1 -> CPU for (int i=0; ipaths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i]; // CPU -> Node 2 for (int i=0; ipaths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i]; // Update path characteristics srcNode->paths[t2][i2].count = l; srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; } // Remove/free paths for a given type static void ncclTopoRemovePathType(struct 
ncclTopoSystem* system, int nodeType) { for (int t=0; tnodes[t].count; n++) { struct ncclTopoNode* node = system->nodes[t].nodes+n; free(node->paths[nodeType]); node->paths[nodeType] = NULL; } // Remove links _from_ the given type for (int n=0; nnodes[nodeType].count; n++) { struct ncclTopoNode* node = system->nodes[nodeType].nodes+n; free(node->paths[t]); node->paths[t] = NULL; } } } static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS }; ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) { if (*level == -1) { int l = -1; if (disableEnv) { char* str = getenv(disableEnv); if (str) { int disable = strtol(str, NULL, 0); if (disable == 1) l = 0; } } if (l == -1) { char* str = getenv(levelEnv); if (str) { for (int i=0; i<=PATH_SYS; i++) { if (strcmp(str, topoPathTypeStr[i]) == 0) { l = i; break; } } // Old style numbering // levelsOldToNew to is an array with each index corresponding to the // "old level" int, and each value mapping to the correct value defined in topo.h // maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew) if (l == -1 && str[0] >= '0' && str[0] <= '9') { int oldLevel = strtol(str, NULL, 0); const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1; if (oldLevel > maxOldLevel) oldLevel = maxOldLevel; l = levelsOldToNew[oldLevel]; } } } if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]); *level = l >= 0 ? l : -2; } return ncclSuccess; } NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); int ncclTopoUserP2pLevel = -1; ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) { *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; // Get GPUs from topology int g1, g2; NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1)); struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) { // GPU not found, we can't use p2p. return ncclSuccess; } int intermediateIndex = -1; // Set intermediate GPU rank, if routing through an intermediate GPU. struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2; if (path->count == 2) { struct ncclTopoNode* intermediateNode = path->list[0]->remNode; if (intermediateNode->type == GPU) { intermediateIndex = intermediateNode - system->nodes[GPU].nodes; if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank; } } // In general, use P2P whenever we can. int p2pLevel = PATH_SYS; // User override if (ncclTopoUserP2pLevel == -1) NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); if (ncclTopoUserP2pLevel != -2) { p2pLevel = ncclTopoUserP2pLevel; goto compare; } // Don't use P2P through ARM CPUs int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { p2pLevel = PATH_PXB; } if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { p2pLevel = PATH_PXB; } compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; if (*p2p == 1) { // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to // validate against NVML at all since they are pretending to be on other hw. 
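    // The loop below walks consecutive device pairs along the chosen path (including an
    // intermediate GPU, if any) and checks with NVML that P2P reads and writes are actually
    // enabled; the topology alone is not sufficient when P2P has been disabled on the system.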
if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) { int indexes[3] = {-1,-1,-1}; int verticeN = 0; NCCLCHECK(ncclNvmlEnsureInitialized()); indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev; if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev; indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev; for (int i=1; i < verticeN; i++) { nvmlGpuP2PStatus_t status; status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead; bool good = status == NVML_P2P_STATUS_OK; status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite; good &= status == NVML_P2P_STATUS_OK; if (!good) { if (ncclParamIgnoreDisabledP2p()) { *p2p = 0; } else if (path->type <= PATH_NVB) { WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); return ncclUnhandledCudaError; } else if (path->type < PATH_SYS) { INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); } } } } } if (path->type == PATH_NVL) { struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; // Enable P2P Read for Ampere/NVLink only if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1; } return ncclSuccess; } NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) { *useGdr = 0; // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); struct ncclTopoNode* net = system->nodes[NET].nodes+n; NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Check that both the NIC and GPUs support it if (net->net.gdrSupport == 0) return ncclSuccess; if (gpu->gpu.gdrSupport == 0) return ncclSuccess; if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess; if (gdrReadParam < 0) { int nvlink = 0; // Since we don't know whether there are other communicators, // it's better to keep things local if we have a single GPU. 
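      // With the default NCCL_NET_GDR_READ (-2), the loop below therefore keeps GDR for
      // sends only when this GPU has an NVLink path to some other GPU (a single-GPU
      // topology counts as well); otherwise we return with *useGdr still 0.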
if (system->nodes[GPU].count == 1) nvlink = 1; for (int i=0; inodes[GPU].count; i++) { if (i == g) continue; if (gpu->paths[GPU][i].type == PATH_NVL) { nvlink = 1; break; } } if (!nvlink) return ncclSuccess; } } // Check if we are close enough that it makes sense to enable GDR int netGdrLevel = PATH_PXB; NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL")); if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel; int distance = gpu->paths[NET][n].type; if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead int proxyRank, g; NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; distance = proxyGpu->paths[NET][n].type; } if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel); return ncclSuccess; } *useGdr = 1; INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read); return ncclSuccess; } // Set to 0 to disable the flush on Hopper when using GDR NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1); // Determine whether we need to flush the GDR recv buffers ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) { int g; NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier *flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush(); return ncclSuccess; } NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0); // Check whether going through the network would be faster than going through P2P/SHM. ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) { if (ncclParamNetDisableIntra() == 1) { *net = 0; return ncclSuccess; } *net = 1; // First check the current GPU-to-GPU speed. 
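  // The comparison below keeps *net = 1 only if both GPUs can reach some NIC through a
  // PATH_PXB-or-better path whose bandwidth is strictly higher than the direct GPU-to-GPU
  // bandwidth; otherwise P2P/SHM is considered at least as fast and *net is reset to 0.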
int g1, g2; if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess || ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) { return ncclSuccess; } struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; float speed = gpu1->paths[GPU][g2].bw; // Now check the speed each GPU can access the network through PXB or better float netSpeed1 = 0, netSpeed2 = 0; for (int n=0; nnodes[NET].count; n++) { struct ncclTopoLinkList* path = gpu1->paths[NET]+n; if (path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw; path = gpu2->paths[NET]+n; if (path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw; } if (netSpeed1 > speed && netSpeed2 > speed) return ncclSuccess; *net = 0; return ncclSuccess; } ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) { // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* path = gpu->paths[NET]+n; if (path->type == PATH_PXN) { struct ncclTopoNode* node; int type = NVS; for (int i=0; icount && type == NVS; i++) { node = path->list[i]->remNode; type = node->type; } if (type != GPU) { WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev); return ncclInternalError; } *intermediateRank = node->gpu.rank; } else { *intermediateRank = rank; } return ncclSuccess; } NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); // Net v4 plugins don't have non-blocking connect/accept. We can't therefore use // remote proxies without risking deadlocks int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { if (comm && ncclNetVersion(comm) == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { pxnDisable = ncclParamPxnDisable(); } } return pxnDisable; } ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) { struct ncclTopoSystem* system = comm->topo; *nranks = 0; *intermediateRanks = NULL; if (system->nodes[NET].count == 0) return ncclSuccess; int nr = 0; int* ranks = NULL; for (int rank=0; ranknRanks; rank++) { int netDev, proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank)); if (proxyRank == comm->rank) continue; int useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr)); if (useGdr == 0) continue; int found = 0; for (int r=0; rnodes[CPU].count; c++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); } // Set direct paths to GPUs. for (int g=0; gnodes[GPU].count; g++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); } // Set direct paths to NICs. for (int n=0; nnodes[NET].count; n++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system)); } // Set direct paths to NVSwitches. 
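  // Each ncclTopoSetPaths call runs a breadth-first search from its base node, so once the
  // four loops here complete, every reachable node has a path entry towards every CPU, GPU,
  // NIC and NVSwitch; the P2P/GDR/PXN adjustments below then rewrite some of those paths.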
for (int n=0; nnodes[NVS].count; n++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[NVS].nodes+n, system)); } // Update path for GPUs when we don't want to / can't use GPU Direct P2P for (int g=0; gnodes[GPU].count; g++) { for (int p=0; pnodes[GPU].count; p++) { int p2p; NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; NCCLCHECK(getLocalCpu(system, g, &cpu)); NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g)); } } if (comm == NULL) continue; // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank; int p2p; NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo)); if (p2p == 0) { int shm; NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo)); if (shm == 0) { // Mark this peer as inaccessible. We'll trim it later. system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET; } } } } // Update paths for NICs (no GPU Direct, PXN, ...) for (int n=0; nnodes[NET].count; n++) { struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; for (int g=0; gnodes[GPU].count; g++) { // Check whether we can access the NIC through another NVLink-connected GPU (PXN) struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; if (ncclPxnDisable(comm) != 1) { int localGpuIndex; NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex)); if (localGpuIndex != g && localGpuIndex != -1) { // PXN = PCI + NVLink. struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU // We can use that GPU as relay to communicate with that NIC. // Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n)); } } // Update path when we dont want to / can't use GPU Direct RDMA. 
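      // If ncclTopoCheckGdr below reports that GDR cannot be used for this GPU/NIC pair,
      // both directions (NIC->GPU and GPU->NIC) are rerouted through the CPU closest to the
      // GPU via addInterStep, so network traffic is staged through host memory instead.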
int gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; NCCLCHECK(getLocalCpu(system, g, &localCpu)); NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g)); NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n)); } } } return ncclSuccess; } ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) { int *domains; int64_t *ids; NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count)); NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count)); int myDomain = 0; for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; domains[g] = g; ids[g] = gpu->id; for (int p=0; ppaths[GPU][p].type < PATH_NET) { domains[g] = std::min(domains[g], domains[p]); } } if (gpu->gpu.rank == comm->rank) myDomain = domains[g]; } int ngpus = system->nodes[GPU].count; for (int i=0; inodes[GPU].count /* This one varies over the loops */; g++) { gpu = system->nodes[GPU].nodes+g; if (gpu->id == ids[i]) break; else gpu=NULL; } if (gpu == NULL) { WARN("Could not find id %lx", ids[i]); free(domains); free(ids); return ncclInternalError; } NCCLCHECK(ncclTopoRemoveNode(system, GPU, g)); } if (system->nodes[GPU].count == comm->nRanks) { for (int n=system->nodes[NET].count-1; n>=0; n--) NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } free(domains); free(ids); return ncclSuccess; } void ncclTopoFree(struct ncclTopoSystem* system) { for (int t=0; tnodes[GPU].nodes[peer].paths[GPU]+g; if (path->type == PATH_NVL) { float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap); *nChannels = 2*std::max(1, (int)(path->bw / nvlBw)); } else { *nChannels = 2; } } else { // Remote rank, use network *nChannels = ncclParamNChannelsPerNetPeer(); } return ncclSuccess; } NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1); NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS); static int nextPow2(int v) { int pow2 = 1; while (pow2 < v) pow2 <<= 1; return pow2; } ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { /* here we already honor comm->max/minCTAs for p2pnChannels. */ if (comm->sharedRes->owner != comm) { comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()), comm->sharedRes->tpP2pNChannels); } else { comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); } int minChannels = comm->p2pnChannels; // We need to loop through all local GPUs to have a global picture for (int g=0; gtopo->nodes[GPU].count; g++) { for (int r=0; rnRanks; r++) { int nChannels; NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels)); if (nChannels >= 0) minChannels = std::min(minChannels, nChannels); } } // Round to next pow2 nChannelsPerPeer and nChannels comm->p2pnChannelsPerPeer = nextPow2(minChannels); comm->p2pnChannels = nextPow2(comm->p2pnChannels); // Init channels that weren't used so far for (int c=comm->nChannels; cp2pnChannels; c++) NCCLCHECK(initChannel(comm, c)); // We want to spread channels used when there aren't many and progressively // fill the whole space of nChannels. To do so we mirror the bits in the // nChannels space. 
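  // Illustrative mapping of the bit-mirroring below, assuming p2pnChannels == 8 (3 bits):
  // c = 0,1,2,3,4,5,6,7 maps to p2pChannels[c] = 0,4,2,6,1,5,3,7, so the first few channels
  // actually used are spread across the whole [0, p2pnChannels) range rather than
  // clustering at the low indices.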
for (int c=0; cp2pnChannels; c++) { int mirror = 0; for (int b=1, mb=(comm->p2pnChannels>>1); bp2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; comm->p2pChannels[c] = mirror; } return ncclSuccess; } ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) { int ngpus = system->nodes[GPU].count; NCCLCHECK(ncclCalloc(ranks, ngpus)); int nvbGpus = 0; for (int g=0; gnodes[GPU].nodes+g; if (gpu->gpu.rank != rank) continue; for (int p=0; ppaths[GPU][p].type == PATH_NVB) { (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank; } } } *nranks = nvbGpus; return ncclSuccess; } int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) { int minPath = PATH_DIS; for (int i=0; inodes[GPU].count; i++) { struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU]; for (int j=0; jnodes[GPU].count; j++) { if (i == j) continue; minPath = std::min(minPath, paths[j].type); } } return minPath >= PATH_PIX ? 0 : 1; } nccl-2.18.3-1/src/graph/rings.cc000066400000000000000000000035141444201535000162040ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #define MAXWIDTH 20 #define PREFIXLEN 15 #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) void dumpLine(int* values, int nranks, const char* prefix) { int prefixlen = strlen(prefix); char line[STRLENGTH+1]; line[STRLENGTH] = '\0'; memset(line, ' ', STRLENGTH); strncpy(line, prefix, PREFIXLEN); for (int i=0; i NCCL_PARAM(CrossNic, "CROSS_NIC", 2); // Initialize system->maxBw. This is the per-channel (i.e. per-SM) // max bw. static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { float maxBw = 0.0; for (int i=0; inodes[type].count; i++) { struct ncclTopoLinkList* path = gpu->paths[type]+i; float bw = path->bw; if (path->count == 0) continue; maxBw = std::max(maxBw, bw); } return maxBw; } static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { float nvlinkBw = 0.0, pciBw = 0.0; for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = gpu->links+l; if (link->type == LINK_NVL) nvlinkBw += link->bw; if (link->type == LINK_PCI) pciBw = link->bw; } return std::max(pciBw, nvlinkBw); } ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) { system->maxBw = 0.0; system->totalBw = 0.0; int inter = system->nodes[NET].count; if (inter == 0 && system->nodes[GPU].count == 1) { system->maxBw = LOC_BW; return ncclSuccess; } for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU)); system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu)); } return ncclSuccess; } static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node2->links+l; if (link->remNode == node1) { *revLink = link; return ncclSuccess; } } WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id); return ncclInternalError; } // This is unfortunately needed since manipulating floats often results in rounding errors. 
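// SUB_ROUND keeps the remaining bandwidth rounded to three decimal places after every
// subtraction, so that when a path is later rewound (the same amount is added back) the
// link bandwidth returns to its previous value instead of accumulating drift.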
#define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000) static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float bw, int* steps) { float pciBw = bw; for (int step=0; stepcount; step++) { struct ncclTopoNode* node = path->list[step]->remNode; if (node->type == CPU) { // Account for P2P inefficiency through Intel CPU RC if (path->type == PATH_PHB && start->type == GPU && node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { pciBw = INTEL_P2P_OVERHEAD(bw); } } } struct ncclTopoNode* node = start; for (int step=0; steplist[step]; struct ncclTopoLink* revLink = NULL; float fwBw = link->type == LINK_PCI ? pciBw : bw; float revBw = 0; if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); revBw += fwBw/8; } if (link->remNode->type == CPU && link->type == LINK_NVL) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); revBw += fwBw; } if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; } SUB_ROUND(link->bw, fwBw); if (revBw) SUB_ROUND(revLink->bw, revBw); node = link->remNode; } *steps = maxSteps; return ncclSuccess; } // Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1). static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) { // First handle easy cases *node = system->nodes[type2].nodes+index2; if (type1 == -1) return ncclSuccess; struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1; struct ncclTopoLinkList* path = node1->paths[type2]+index2; struct ncclTopoNode* node2 = system->nodes[type2].nodes+index2; struct ncclTopoLinkList* revPath = node2->paths[type1]+index1; if (path == NULL) { WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2); return ncclInternalError; } if (path->count == 0 ) return ncclSuccess; // Now check link type *node = NULL; int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS); float bw = intra ? graph->bwIntra : graph->bwInter; int type = intra ? graph->typeIntra : graph->typeInter; if (mult == 1 && (path->type > type)) return ncclSuccess; if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == NCCL_TOPO_PATTERN_TREE || graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) && (revPath->type > type)) return ncclSuccess; bw *= mult; // Check there is enough bandwidth on paths. int step = 0; NCCLCHECK(followPath(path, node1, path->count, bw, &step)); if (step < path->count) goto rewind; // Enough bandwidth : return destination node. graph->nHops += mult*path->count; *node = system->nodes[type2].nodes+index2; return ncclSuccess; rewind: // Not enough bandwidth : rewind and exit. NCCLCHECK(followPath(path, node1, step, -bw, &step)); return ncclSuccess; } static int gpuPciBw(struct ncclTopoNode* gpu) { for (int l=0; lnlinks; l++) { struct ncclTopoLink* gpuLink = gpu->links+l; if (gpuLink->type != LINK_PCI) continue; struct ncclTopoNode* pci = gpuLink->remNode; for (int l=0; lnlinks; l++) { struct ncclTopoLink* pciLink = pci->links+l; if (pciLink->remNode != gpu) continue; return std::min(gpuLink->bw, pciLink->bw); } } return -1; } /* Choose the order in which we try next GPUs. 
This is critical for the search to quickly converge to the best solution even if it eventually times out. */ struct ncclGpuScore { int g; // Retain the index int startIndex; // Least important int intraNhops; int intraBw; int interNhops; int interPciBw; int interBw; // Most important }; static int cmpScore(const void * g1, const void * g2) { struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1; struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2; int d; if ((d = (s2->interBw - s1->interBw))) return d; if ((d = (s2->interPciBw - s1->interPciBw))) return d; if ((d = (s1->interNhops - s2->interNhops))) return d; if ((d = (s2->intraBw - s1->intraBw))) return d; if ((d = (s1->intraNhops - s2->intraNhops))) return d; return s1->startIndex - s2->startIndex; } static int cmpIntraScores(struct ncclGpuScore* scores, int count) { int intraBw = scores[0].intraBw; int intraNhops = scores[0].intraNhops; for (int i=1; inodes[GPU].count; g++) { if (system->nodes[GPU].nodes[g].gpu.rank == rank) { *index = g; return ncclSuccess; } } WARN("Could not find gpu rank %d", rank); return ncclInternalError; } static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int* index) { for (int n=0; nnodes[NET].count; n++) { if (system->nodes[NET].nodes[n].id == id) { *index = n; return ncclSuccess; } } WARN("Could not find net id %lx", id); return ncclInternalError; } static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) { int netId = graph->inter[graph->nChannels*2]; int n; NCCLCHECK(getNetIndex(system, netId, &n)); *netPaths=system->nodes[NET].nodes[n].paths[GPU]; return ncclSuccess; } ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) { const uint64_t flag = 1ULL<<(graph->nChannels); int ngpus = system->nodes[GPU].count; struct ncclTopoLinkList* paths = gpu->paths[GPU]; struct ncclTopoLinkList* netPaths = NULL; if (sortNet) NCCLCHECK(getNetPaths(system, graph, &netPaths)); struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES]; memset(scores, 0, ngpus*sizeof(struct ncclGpuScore)); int start = gpu-system->nodes[GPU].nodes; int count = 0; for (int i=1; inodes[GPU].nodes[g].used & flag) continue; scores[count].g = g; scores[count].startIndex = i; scores[count].intraNhops = paths[g].count; scores[count].intraBw = paths[g].bw; if (netPaths) { scores[count].interNhops = netPaths[g].count; scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes+g); scores[count].interBw = netPaths[g].bw; } count++; } // Sort GPUs qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore); // Check if all have the same intra-node score in which case we go reverse for sortNet = -1 if (sortNet == -1 && cmpIntraScores(scores, count) == 0) { for (int i=0; inChannels == 0) return ncclInternalError; int ngpus = system->nodes[GPU].count; int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1]; for (int i=0; inodes[GPU].nodes[i].gpu.rank == nextRank) { *g = i; return ncclSuccess; } if (*g == -1) return ncclInternalError; return ncclSuccess; } ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time); ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int 
forcedOrder, int *time, int type, int index, int g) { const uint64_t flag = 1ULL<<(graph->nChannels); struct ncclTopoNode* gpu; NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu)); if (gpu) { gpu->used ^= flag; NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time)); gpu->used ^= flag; NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu)); } return ncclSuccess; } ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) { struct ncclTopoNode* nvs; struct ncclTopoNode* gpu; int d0=0; // See if there is enough bandwidth for NVS->GPU traffic do { NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu)); d0++; } while (gpu && d0 < system->nodes[GPU].count); if (gpu == NULL) { d0--; } else { int d1=0; // See if there is enough bandwidth for GPU->NVS traffic do { NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs)); d1++; } while (nvs && d1 < system->nodes[GPU].count); if (nvs == NULL) { d1--; } else { // Both directions worked. Move on to the next path. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time)); } while (d1) { d1--; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs)); } } while (d0) { d0--; NCCLCHECK(ncclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu)); } return ncclSuccess; } ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) { // 1. Try to get the same nChannels between Rings and Trees if (graph->nChannels < graph->minChannels) return ncclSuccess; if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better. if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1; return ncclSuccess; } // 2. Try to get better bandwidth // Give a 15% perf bonus to paths not crossing nics float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15; if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) { *copy = 1; return ncclSuccess; } if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess; // 3. Less hops if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1; return ncclSuccess; } // Build a list of the best NETs to try. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu // index when trying to get back to the NIC. // // The list is built the following way: // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list // based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which // might have been choosen by GPU 0 (case with multiple independent communicators per node) // 3. Then add the NETs to the final list if they were not already added by another closer GPU. 
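//
// Illustrative example of step 2, assuming two NICs {0,1} at equal distance from two GPUs:
// the GPU with NVML dev 0 keeps the local order {0,1} while NVML dev 1 rotates it to {1,0},
// so independent communicators pinned to different GPUs don't all pick NIC 0 first.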
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { int netCount = 0; int localNetCount; int* localNets; NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count)); for (int t=0; t <= typeInter; t++) { for (int g=0; gnodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; localNetCount = 0; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* paths = gpu->paths[NET]; for (int n=0; nnodes[NET].count; n++) { if (paths[n].type == t) localNets[localNetCount++] = n; } if (localNetCount == 0) continue; // Shuffle by gpu NVML device number so that GPUs on the same PCI switch // with multiple NICs don't use the same one as first choice. for (int r=0; rnodes[GPU].nodes[g].gpu.dev % localNetCount; r++) { int net0 = localNets[0]; for (int i=0; inodes[GPU].count; if (step == ngpus) { // Determine whether we found a better solution or not int copy = 0; graph->nChannels++; NCCLCHECK(ncclTopoCompareGraphs(system, graph, saveGraph, ©)); if (copy) { memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph)); if (graph->nChannels == graph->maxChannels) *time = -1; } if (graph->nChannels < graph->maxChannels) { NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time)); } graph->nChannels--; return ncclSuccess; } graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; if (step == backToNet) { // first get back to NIC if (system->nodes[NET].count) { int startNetIndex; NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; int netcount; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount)); for (int i=0; inodes[NET].nodes+n; if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; float bwInterSave = graph->bwInter; if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if (step == 0) nextBackToNet = 1; else if (net->id != graph->inter[graph->nChannels*2+1]) continue; graph->bwInter /= 2; } NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } free(nets); } } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time)); } else if (step < system->nodes[GPU].count-1) { // Go to next GPU int next[NCCL_TOPO_MAX_NODES]; int count; if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order next[0] = step+1; count = 1; } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next)); count = 1; } else { // Normal search NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 
1 : -1 )); } for (int i=0; iintra[graph->nChannels*ngpus], &p)); struct ncclTopoNode* firstGpu; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu)); if (firstGpu) { NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, time)); NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu)); } } else { // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } return ncclSuccess; } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { const int bw = graph->bwInter; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netcount; NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount)); for (int i=0; inodes[NET].nodes+n; struct ncclTopoNode* gpu; if (graph->collNet && net->net.collSupport == 0) continue; if (net->net.bw < bw) continue; graph->inter[graph->nChannels*2] = net->id; graph->latencyInter = net->net.latency; for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { system->nodes[NET].nodes[i].net.bw -= bw; } } // NVLS needs to balance on all NICs if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels])); } else { if (graph->nChannels > 0) { // Try to replay the last channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if (t == -1) *time = -1; } // Then try the most local GPUs float maxBw = 0; int minHops = 0xfffffff; struct ncclTopoLinkList* paths = net->paths[GPU]; for (int g=0; gnodes[GPU].count; g++) { if (paths[g].bw > maxBw) { maxBw = paths[g].bw; minHops = paths[g].count; } else if (paths[g].bw == maxBw && paths[g].count < minHops) { minHops = paths[g].count; } } if (maxBw >= bw) { // In the first loop, avoid using GPUs in both directions between channels (one channel // sending from that GPU and one channel receiving to that GPU), since that usually leads // to lower BW. for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { for (int g=0; gnodes[GPU].count; g++) { if (paths[g].bw == maxBw && paths[g].count == minHops) { gpu = system->nodes[GPU].nodes+g; int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1; if (tryGpuBidir == gpuUsed) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } } } } } for (int i=0; inodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { system->nodes[NET].nodes[i].net.bw += bw; } } } free(nets); return ncclSuccess; } /* Search Patterns * * Intra-node * Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a * (=Split Tree Loop) * Tree : GPU a -> GPU b -> .. 
-> GPU x * (=Split Tree) * * Inter-node * Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic) * Tree : NET n -> GPU a -> GPU b -> .. -> GPU x * `--> NET n (or m if crossNic) * Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x * `--> NET n (or m if crossNic) * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a * `--> NET n (or m if crossNic) */ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) { if (system->nodes[NET].count) { if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1; else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1; else *backToNet = 0; *backToFirstRank = -1; } else { *backToNet = -1; if (pattern == NCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count-1; else *backToFirstRank = -1; } return ncclSuccess; } ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) { int backToNet, backToFirstRank; NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); if (system->nodes[NET].count) { // Start from NET ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { // Intra-node only. if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels)); return ncclSuccess; } else if (graph->nChannels == 0) { // Try PCI order first NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0)); } else { // Also try to replay previous channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g)); } if (graph->sameChannels == 0 || graph->nChannels == 0) { // Finally, try all other possibilities unless we are forced to use the same channels for (int g=0; gnodes[GPU].count; g++) { NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g)); } } } return ncclSuccess; } /************************************/ /* User defined graph from XML file */ /************************************/ struct kvDict kvDictLinkType[] = { { "LOC", PATH_LOC }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } }; ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int* inter = graph->inter+2*c; int* intra = graph->intra+ngpus*c; int n=0, g=0; for (int s=0; snSubs; s++) { struct ncclXmlNode* sub = xmlChannel->subs[s]; int dev; NCCLCHECK(xmlGetAttrInt(sub, "dev", &dev)); if (strcmp(sub->name, "net") == 0) { inter[n++] = dev; } else if (strcmp(sub->name, "gpu") == 0) { int rank = -1; for (int g=0; gnodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank; } if (rank == -1) { WARN("XML Import Channel : dev %d not found.", dev); return ncclSystemError; } intra[g++] = rank; } } return ncclSuccess; } ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) { int id; NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id)); if (graph->id != id) return 
ncclSuccess; int crossNic; NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic)); if (ncclParamCrossNic() == 0 && crossNic == 1) return ncclSuccess; graph->crossNic = crossNic; NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern)); NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType)); NCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels)); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph)); } *nChannels = xmlGraph->nSubs; return ncclSuccess; } ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) { for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels)); } return ncclSuccess; } /* And the reverse : graph->xml */ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) { struct ncclXmlNode* xmlChannel; int ngpus = system->nodes[GPU].count; int* inter = graph->inter+2*c; int* intra = graph->intra+ngpus*c; NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel)); struct ncclXmlNode* node; if (system->nodes[NET].count) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrInt(node, "dev", inter[0])); } for (int g=0; gnodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev; } if (dev == -1) { WARN("XML Export Channel : rank %d not found.", intra[g]); return ncclInternalError; } NCCLCHECK(xmlSetAttrInt(node, "dev", dev)); } if (system->nodes[NET].count) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrInt(node, "dev", inter[1])); } return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) { struct ncclXmlNode* xmlGraph; NCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter)); const char* str; NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str)); NCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType)); NCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str)); NCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels)); for (int c=0; cnChannels; c++) { NCCLCHECK(ncclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph)); } return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, 
struct ncclXml *xml) { xml->maxIndex = 0; struct ncclXmlNode* xmlGraphs; NCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs)); NCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", NCCL_GRAPH_XML_VERSION)); for (int g=0; gnodes[GPU].count; graph->crossNic = ncclParamCrossNic(); int crossNic = (system->nodes[NET].count > 1) && graph->crossNic && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; graph->typeInter = PATH_PIX; graph->nChannels = 0; int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1; graph->sameChannels = trySameChannels; char* str = getenv("NCCL_GRAPH_FILE"); if (str) { INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str); struct ncclXml* xml; NCCLCHECK(ncclCalloc(&xml, 1)); NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml)); int nChannels; NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels)); INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels); free(xml); if (graph->nChannels > 0) return ncclSuccess; } int ccMin; NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. graph->minChannels = graph->maxChannels = system->nodes[GPU].count; } struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); // First try crossnic, then decrease bw and finally increase bwIntra. int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA; speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra; } else { nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER; speedArray = ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter; } int pass = 1; int speedIndex = 0; float maxBw = system->maxBw; float totalBw = system->totalBw; if (ngpus == 1 || graph->pattern != NCCL_TOPO_PATTERN_RING) totalBw *= ngpus*1.0/(ngpus-1); while ((speedArray[speedIndex] > maxBw || speedArray[speedIndex]*graph->minChannels > totalBw) && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; search: int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS : tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT; tmpGraph.nChannels = 0; globalTimeout -= time; NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time)); #if 0 printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? 
"PERFECT" : ""); for (int c=0; cnChannels; c++) { printf("%2d : ", c); for (int g=0; gintra[c*ngpus+g]); } printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]); printf("\n"); } #endif // Optimal solution, stop here if (time == -1) goto done; if (graph->nChannels*graph->bwInter >= system->totalBw) goto done; if (pass == 1) { // First pass, we don't have a solution yet ; try other options // Try having different channels if (tmpGraph.sameChannels == 1) { tmpGraph.sameChannels = 0; goto search; } tmpGraph.sameChannels = trySameChannels; if (time != -1) globalTimeout += time; else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT; if (globalTimeout < 0 && graph->nChannels) goto done; // Try a simpler tree if (ccMin >= 90 && tmpGraph.pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; goto search; } tmpGraph.pattern = graph->pattern; int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS; if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; goto search; } tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } tmpGraph.typeInter = PATH_PIX; if (crossNic && tmpGraph.crossNic == 0) { // Try again with crossNic if permitted tmpGraph.crossNic = crossNic; goto search; } tmpGraph.crossNic = 0; // Decrease bw until we find a solution if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) { tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex]; goto search; } speedIndex = 0; while (speedArray[speedIndex] > maxBw && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; } done: // We have a solution. Start from that solution and move to pass 2. if (pass == 1) { time = -1; memcpy(&tmpGraph, graph, sizeof(tmpGraph)); speedIndex = 0; while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++; tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex]; tmpGraph.minChannels = graph->nChannels; pass = 2; } // 3. 
See if we can increase bwIntra for trees (2 nodes or collnet) if (pass == 2) { if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 && speedIndex > 0) { tmpGraph.bwIntra = speedArray[--speedIndex]; goto search; } time = -1; memcpy(&tmpGraph, graph, sizeof(tmpGraph)); } if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) { WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; graph->bwIntra = graph->bwInter = 0.1; graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } if (graph->nChannels == 0) return ncclSuccess; if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess; if (graph->bwIntra < 25.0) return ncclSuccess; if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess; int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); graph->bwIntra /= DIVUP(dupChannels, graph->nChannels); graph->bwInter /= DIVUP(dupChannels, graph->nChannels); graph->nChannels = dupChannels; return ncclSuccess; } ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->bwIntra, graph->bwInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels); int ngpus = system->nodes[GPU].count; char line[1024]; for (int c=0; cnChannels; c++) { sprintf(line, "%2d :", c); int offset = strlen(line); if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]); offset = strlen(line); } for (int i=0; iintra[ngpus*c+i]); offset = strlen(line); } if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]); offset = strlen(line); } INFO(NCCL_GRAPH, "%s", line); } return ncclSuccess; } ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) { char* str = getenv("NCCL_GRAPH_DUMP_FILE"); if (str) { INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str); struct ncclXml* xml; NCCLCHECK(ncclCalloc(&xml, 1)); NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml)); NCCLCHECK(ncclTopoDumpXmlToFile(str, xml)); free(xml); } return ncclSuccess; } #include "comm.h" // NVLS channels aren't compute channels. 
Find which NIC corresponds to our rank being the head ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) { int localRanks = comm->topo->nodes[GPU].count; for (int c=0; cnChannels; c++) { if (graph->intra[c*localRanks] == comm->rank) { *dev = graph->inter[c*2]; return ncclSuccess; } } WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank); return ncclInternalError; } // 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2); ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) { if (graph) { // Honor the net device in the graph int channel = channelId%graph->nChannels; int ngpus = comm->topo->nodes[GPU].count; int index = graph->intra[channel*ngpus] == rank ? 0 : 1; if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) { *dev = graph->inter[channel*2+index]; } else { NCCLCHECK(getNvlsNetDev(comm, graph, dev)); } NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); } else if (peerRank == -1) { return ncclInternalError; } else { // Start with our local NIC and local Rank NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, channelId, dev)); *proxyRank = rank; int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel(); // See whether we can use the remote rank preferred device. if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) { // Find local NIC number close to local nvmlDev int nvmlDev = comm->peerInfo[peerRank].nvmlDev; int localRank; if (ncclTopoDevToRank(comm->topo, nvmlDev, &localRank) != ncclSuccess) return ncclSuccess; int netDev; NCCLCHECK(ncclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev)); int n; // Check that device exists on our node if (ncclParamCrossNic() == 0) { if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) { WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank); return ncclInvalidUsage; } *dev = netDev; } if (pxnLevel == 1) { int g, n; NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g)); NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g; if (gpu->paths[NET][n].type <= PATH_PXN) { *dev = netDev; NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); } } else if (pxnLevel == 2) { // Check which local GPU corresponds to that NIC and see if we can use PXN. int n, g1, g2; NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netDev, &g2)); if (g2 != -1) { struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { *proxyRank = peerGpu->gpu.rank; *dev = netDev; return ncclSuccess; } } } } } return ncclSuccess; } nccl-2.18.3-1/src/graph/topo.cc000066400000000000000000001037761444201535000160560ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
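// --- Editor's sketch (not part of NCCL): the PXN decision made by ncclTopoGetNetDev() above. ---
// NCCL_P2P_PXN_LEVEL selects whether a send may be routed through another local GPU
// ("PXN") so that the NIC preferred by the remote peer can be used: 0 disables it,
// 1 uses it only when our own GPU already reaches that NIC through a PXN-or-better path,
// and 2 (the default) relays through the GPU attached to that NIC whenever that GPU is
// NVLink-reachable and itself close (PXB or better) to the NIC. The enum and struct below
// are hypothetical simplifications; only the decision order is taken from the code above.
namespace pxn_sketch {
enum PathType { P_LOC, P_NVL, P_NVB, P_PIX, P_PXB, P_PXN, P_PHB, P_SYS, P_DIS };

struct Paths {              // assumed inputs, normally answered by the topology graph
  PathType gpuToNic;        // our GPU -> peer-preferred NIC
  PathType gpuToPeerGpu;    // our GPU -> the GPU local to that NIC
  PathType peerGpuToNic;    // that GPU -> the NIC
};

// Returns true if the peer-preferred NIC should be used; *viaProxyGpu says whether traffic
// must be relayed by the GPU local to that NIC (the "proxy rank" in the real code).
inline bool usePeerNic(int pxnLevel, const Paths& p, bool* viaProxyGpu) {
  *viaProxyGpu = false;
  if (pxnLevel == 1) return p.gpuToNic <= P_PXN;                        // use PXN only if needed
  if (pxnLevel == 2 && p.gpuToPeerGpu <= P_NVL && p.peerGpuToNic <= P_PXB) {
    *viaProxyGpu = true;                                                // maximize rail-local aggregation
    return true;
  }
  return false;                                                         // level 0: keep the local NIC
}
} // namespace pxn_sketch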
* * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "graph.h" #include "topo.h" #include "comm.h" #include "nvmlwrap.h" #include "net.h" #include "coll_net.h" #include #include #include "xml.h" #include "cpuset.h" #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ /******************************************************************/ // Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000. ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { char* str = path+offset; // Remove trailing "/" if (*str == '/') str--; // Find next / while (*str != '/') str--; str++; int64_t numid; NCCLCHECK(busIdToInt64(str, &numid)); // Ignore subdevice because those should use the same PCI link so we want to merge nodes. numid -= numid & 0xf; *id = numid; return ncclSuccess; } static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { *cpu = NULL; if (node->type == CPU) { *cpu = node; return ncclSuccess; } for (int l=0; lnlinks; l++) { if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); if (*cpu != NULL) return ncclSuccess; } return ncclSuccess; } int interCpuBw = 0; int cpuPciBw = 0; static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { *bw = LOC_BW; if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) { *bw = P9_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) { *bw = ARM_BW; return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? 
YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } enum ncclNvLinkDeviceType { ncclNvLinkDeviceUnknown, ncclNvLinkDeviceGpu, ncclNvLinkDeviceSwitch, ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { for (int i=0; inodes[type].count; i++) { if (system->nodes[type].nodes[i].id == id) { *node = system->nodes[type].nodes+i; return ncclSuccess; } } return ncclSuccess; } ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { WARN("Error : tried to create too many nodes of type %d", type); return ncclInternalError; } struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; system->nodes[type].count++; n->type = type; n->id = id; if (type == GPU) { // Create link to itself (used in some corner cases) n->nlinks=1; n->links[0].type = LINK_LOC; n->links[0].remNode = n; n->links[0].bw = LOC_BW; n->gpu.dev = NCCL_TOPO_UNDEF; n->gpu.rank = NCCL_TOPO_UNDEF; n->gpu.cudaCompCap = NCCL_TOPO_UNDEF; } else if (type == CPU) { n->cpu.arch = NCCL_TOPO_UNDEF; n->cpu.vendor = NCCL_TOPO_UNDEF; n->cpu.model = NCCL_TOPO_UNDEF; } else if (type == NET) { n->net.asic = 0ULL; n->net.port = NCCL_TOPO_UNDEF; n->net.bw = 0.0; n->net.latency = 0.0; } *node = n; return ncclSuccess; } ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) { struct ncclTopoNode* delNode = system->nodes[type].nodes+index; for (int t=0; tpaths[t]); for (int n=0; nnodes[t].count; n++) { struct ncclTopoNode* node = system->nodes[t].nodes+n; if (node == delNode) continue; for (int l=0; lnlinks; l++) { while (lnlinks && node->links[l].remNode == delNode) { memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink)); node->nlinks--; } if (lnlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) { node->links[l].remNode--; } } } } memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode)); system->nodes[type].count--; return ncclSuccess; } ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) { // Aggregate links into higher bw for NVLink struct ncclTopoLink* link; for (link = node->links; link->remNode; link++) { if (link->remNode == remNode && link->type == type) break; } if (link->remNode == NULL) node->nlinks++; link->type = type; link->remNode = remNode; link->bw += bw; // Sort links in BW descending order struct ncclTopoLink linkSave; memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); while (link != node->links) { if ((link-1)->bw >= linkSave.bw) break; memcpy(link, link-1, sizeof(struct ncclTopoLink)); link--; } memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); return ncclSuccess; } // BCM Gen4 Switches present themselves as a two-level hierarchical switch // even though they're supposed to sustain full BW across all ports. // Flatten the switch as this extra level can break the search and make // NCCL take wrong topology decisions. ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { for (int s=0; snodes[PCI].count; s++) { struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s; uint64_t device = pciSwitch->pci.device; // Only flatten PEX Gen 4 switches in base mode if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) { // Find sub switches with the same device ID. 
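// --- Editor's sketch (not part of NCCL): behavior of ncclTopoConnectNodes() defined above. ---
// Two properties are worth calling out: (1) connecting the same pair of nodes twice with the
// same link type adds the bandwidths together (this is how several parallel links between two
// nodes become one fat link), and (2) the link array is kept sorted by descending bandwidth.
// The struct below is a hypothetical stand-in for ncclTopoLink; this is an illustration, not
// the real insertion-sort implementation.
#include <algorithm>
#include <vector>

namespace link_sketch {
struct Link { int type; int remNode; float bw; };

inline void connect(std::vector<Link>& links, int type, int remNode, float bw) {
  bool merged = false;
  for (auto& l : links) {
    if (l.type == type && l.remNode == remNode) { l.bw += bw; merged = true; break; }  // aggregate
  }
  if (!merged) links.push_back({type, remNode, bw});
  std::stable_sort(links.begin(), links.end(),
                   [](const Link& a, const Link& b) { return a.bw > b.bw; });          // descending bw
}
} // namespace link_sketch
// e.g. calling connect() twice for the same remote node and link type yields a single entry
// whose bandwidth is the sum of both calls, mirroring how NCCL merges parallel links.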
int64_t* subSwIds; NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks)); int subs = 0; for (int l=0; lnlinks; l++) { struct ncclTopoNode* sub = pciSwitch->links[l].remNode; // Only fuse sub switches with the same device ID. if (sub->type != PCI || sub->pci.device != device) continue; // Save sub switch for later subSwIds[subs++] = sub->id; // Remove link to that sub switch memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink))); pciSwitch->nlinks--; // Don't increase l for the next iteration as we just shifted all links by one. l--; } for (int s=0; snodes[PCI].nodes is changing every time we remove a node) int index; NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index)); struct ncclTopoNode* sub = system->nodes[PCI].nodes+index; // Connect all sub PCI devices to the parent switch for (int l=0; lnlinks; l++) { struct ncclTopoNode* remNode = sub->links[l].remNode; if (remNode == pciSwitch) continue; // Add link from parent PCI switch -> PCI device memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink)); pciSwitch->nlinks++; // Update link from PCI device -> parent PCI switch for (int rl=0; rlnlinks; rl++) { if (remNode->links[rl].remNode == sub) { remNode->links[rl].remNode = pciSwitch; break; } } } NCCLCHECK(ncclTopoRemoveNode(system, PCI, index)); } // Set subdevice to 0x0000 to make sure we don't merge this switch again. pciSwitch->pci.device = 0x1000c01010000000; free(subSwIds); // Restart, as system->nodes[PCI].nodes has changed. s = 0; } } return ncclSuccess; } ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { // And connect all CPU nodes together for (int n=0; nnodes[CPU].count; n++) { for (int p=0; pnodes[CPU].count; p++) { if (n == p) continue; float bw; NCCLCHECK(ncclTopoGetInterCpuBw(system->nodes[CPU].nodes+n, &bw)); NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, bw)); } } return ncclSuccess; } static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) { if (node->type == GPU) { sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank); } else if (node->type == CPU) { sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model); } else if (node->type == PCI) { sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device); } else { sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id); } INFO(NCCL_GRAPH, "%s", line); for (int i=0; inlinks; l++) { struct ncclTopoLink* link = node->links+l; if (link->type == LINK_LOC) continue; if (link->type != LINK_PCI || link->remNode != prevNode) { sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); int nextOffset = strlen(line); if (link->type == LINK_PCI) { NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); } INFO(NCCL_GRAPH, "%s", line); } } } return ncclSuccess; } ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { INFO(NCCL_GRAPH, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw); char line[1024]; for 
(int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); INFO(NCCL_GRAPH, "=========================================="); NCCLCHECK(ncclTopoPrintPaths(s)); return ncclSuccess; } static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) { // Shift all links to have upLink as last link if (upNode) { int l=0; while (node->links[l].remNode != upNode) l++; struct ncclTopoLink upLink; memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink)); while (node->links[l+1].remNode) { memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink)); l++; } memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink)); } // Recursively sort the PCI tree for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node)); } return ncclSuccess; } // We want the graph to be organized to ease/accelerate traversal : // 1. NVLinks (already the case) // 2. PCI down // 3. PCI up // 4. SYS (already the case) ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) { for (int n=0; nnodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL)); return ncclSuccess; } ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { int dev; NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); struct ncclTopoNode* net; NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev)); const char* str; NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); if (str) sscanf(str, "0x%lx", &net->net.asic); else net->net.asic = dev; ncclDebugNoWarn = NCCL_GRAPH; int mbps; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.bw = mbps / 8000.0; if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); ncclDebugNoWarn = 0; NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); return ncclSuccess; } ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { for (int s=0; snSubs; s++) { struct ncclXmlNode* xmlNet = xmlNic->subs[s]; if (strcmp(xmlNet->name, "net") != 0) continue; int index; NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index)); if (index == -1) continue; NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic)); } return ncclSuccess; } ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev)); NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport)); // Do not go any further, nvlinks will be added in a second pass return ncclSuccess; } struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 
GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) { const char* str; int type; NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str)); NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass)); int64_t busId; NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str)); NCCLCHECK(busIdToInt64(str, &busId)); struct ncclTopoNode* node = NULL; struct ncclXmlNode* xmlGpu = NULL; NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu)); if (xmlGpu != NULL) { type = GPU; int index; NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index)); if (index == -1) return ncclSuccess; NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId)); NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node)); } struct ncclXmlNode* xmlNic = NULL; NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic)); if (xmlNic != NULL) { type = NIC; // Ignore sub device ID and merge multi-port NICs into one PCI device. busId &= 0xfffffffffffffff0; struct ncclTopoNode* nicNode = NULL; NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId)); if (nicNode == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId)); node = nicNode; // Connect it to parent later on } NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode)); } else if (type == PCI) { NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId)); NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str)); if (str) node->pci.device += strtol(str, NULL, 0) << 48; NCCLCHECK(xmlGetAttr(xmlPci, "device", &str)); if (str) node->pci.device += strtol(str, NULL, 0) << 32; NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str)); if (str) node->pci.device += strtol(str, NULL, 0) << 16; NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str)); if (str) node->pci.device += strtol(str, NULL, 0); for (int s=0; snSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node)); } } if (node) { int width, speed; NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width)); NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str)); // Manage cases where speed was not indicated in /sys if (width == 0) width = 16; NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end) NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0)); NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0)); } return ncclSuccess; } struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } }; struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } }; ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) { int numaId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId)); struct ncclTopoNode* cpu; NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId)); const char* str; NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str)); if (str != NULL) { NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity)); } NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str)); NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch)); if (cpu->cpu.arch == 
NCCL_TOPO_CPU_ARCH_X86) { NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str)); NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor)); if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG; } } for (int s=0; snSubs; s++) { struct ncclXmlNode* node = xmlCpu->subs[s]; if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu)); if (strcmp(node->name, "nic") == 0) { struct ncclTopoNode* nic = NULL; NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); if (nic == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0)); NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW)); NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW)); } NCCLCHECK(ncclTopoAddNic(node, system, nic)); } } return ncclSuccess; } ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) { if (strcmp(node->name, "nvlink") == 0) { struct ncclTopoNode* gpu = NULL; int64_t pBusId; NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); if (gpu == NULL) { WARN("Add NVLink error : could not find GPU %lx", pBusId); return ncclInternalError; } int count; NCCLCHECK(xmlGetAttrInt(node, "count", &count)); const char* targetClass; NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass)); int targetType; NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass)); struct ncclTopoNode* remote = NULL; if (targetType == GPU) { // NVL P2P connection to another GPU const char* target; NCCLCHECK(xmlGetAttrStr(node, "target", &target)); int64_t busId; NCCLCHECK(busIdToInt64(target, &busId)); NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId)); } else if (targetType == CPU) { // NVL connection to the local CPU NCCLCHECK(findLocalCpu(gpu, &remote)); } else { if (system->nodes[NVS].count == 0) { NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); } else { remote = system->nodes[NVS].nodes; } } if (remote) { float nvlBw = ncclTopoNVLinkBw(gpu->gpu.cudaCompCap); NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlBw)); if (remote->type != GPU) { NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlBw)); } } } else { const char* busId; NCCLCHECK(xmlGetAttr(node, "busid", &busId)); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? 
busId : parentBusId)); } } return ncclSuccess; } ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) { NCCLCHECK(ncclCalloc(topoSystem, 1)); struct ncclXmlNode* topNode; NCCLCHECK(xmlFindTag(xml, "system", &topNode)); for (int s=0; snSubs; s++) { struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL)); NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem)); NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); NCCLCHECK(ncclTopoSortSystem(*topoSystem)); return ncclSuccess; } NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0); // Only set values if not already set static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); } return ncclSuccess; } static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value); } return ncclSuccess; } static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value); } return ncclSuccess; } ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { struct ncclXml* xml; NCCLCHECK(ncclCalloc(&xml, 1)); char* xmlTopoFile = getenv("NCCL_TOPO_FILE"); if (xmlTopoFile) { INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1)); } else { // Try default XML topology location NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0)); } if (xml->maxIndex == 0) { // Create top tag struct ncclXmlNode* top; NCCLCHECK(xmlAddNode(xml, NULL, "system", &top)); NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION)); } // Auto-detect GPUs if needed for (int r=0; rnRanks; r++) { if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); struct ncclXmlNode* node; NCCLCHECK(ncclTopoFillGpu(xml, busId, &node)); if (node == NULL) continue; NCCLCHECK(xmlSetAttrInt(node, "keep", 1)); NCCLCHECK(xmlSetAttrInt(node, "rank", r)); NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); } } // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. int netDevCount = 0; if (collNetSupport(comm)) { NCCLCHECK(collNetDevices(comm, &netDevCount)); for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? 
"Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1)); } } if (netDevCount == 0) { NCCLCHECK(comm->ncclNet->devices(&netDevCount)); } for (int n=0; nncclNet->getProperties(n, &props)); struct ncclXmlNode* netNode; NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode)); NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECK(ncclTopoTrimXml(xml)); xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE"); if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml)); } NCCLCHECK(ncclTopoGetSystemFromXml(xml, system)); free(xml); return ncclSuccess; } static ncclResult_t getLocalNetMask(struct ncclTopoSystem* system, int g, uint64_t* localNetMask, int* type) { int minType = PATH_DIS; float maxBw = 0; int count = 0; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); for (int n=0; nnodes[NET].count; n++) { struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g; if (path->bw > maxBw || (path->bw == maxBw && path->type < minType)) { maxBw = path->bw; minType = path->type; if (type) *type = minType; count = 0; } if (path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } *localNetMask = 0ULL; for (int n=0; n= 64) return ncclInternalError; *localNetMask |= 1ULL<nodes[GPU].count; NCCLCHECK(ncclCalloc(&localNetMasks, ngpus)); // Fill localNetMasks for all GPUs. for (int g=0; gnodes[GPU].count; int* gpus; NCCLCHECK(ncclCalloc(&gpus, ngpus)); // Find localNetMask which includes net with the most local GPUs. 
int netLocalGpus = 0, minType = PATH_DIS; uint64_t localNetMask = 0ULL; for (int g=0; gnodes[CPU].nodes[0].cpu.arch; *vendor = system->nodes[CPU].nodes[0].cpu.vendor; *model = system->nodes[CPU].nodes[0].cpu.model; return ncclSuccess; } NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) { struct ncclTopoNode* cpu = NULL, *gpu = NULL; for (int g=0; gnodes[GPU].count; g++) { if (system->nodes[GPU].nodes[g].gpu.rank == rank) { gpu = system->nodes[GPU].nodes+g; // Find closer CPU int cpuIndex = -1, minHops = 0; for (int c=0; cnodes[CPU].count; c++) { int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count; if (cpuIndex == -1 || nHops < minHops) { cpuIndex = c; minHops = nHops; } } cpu = system->nodes[CPU].nodes+cpuIndex; } } if (cpu == NULL) { WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank); return ncclInternalError; } // Query the CPU affinity set we were provided cpu_set_t mask; SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); } #endif // Get the affinity of the CPU close to our GPU. cpu_set_t cpuMask = cpu->cpu.affinity; #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr)); TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); } #endif cpu_set_t finalMask; if (ncclParamIgnoreCpuAffinity()) // Ignore the CPU affinity set and use the GPU one instead finalMask = cpuMask; else // Use a subset of the GPU affinity set CPU_AND(&finalMask, &mask, &cpuMask); memcpy(affinity, &finalMask, sizeof(cpu_set_t)); // If there is a non empty set, use it to set affinity if (CPU_COUNT(&finalMask)) { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr); } return ncclSuccess; } ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count) { *count = system->nodes[GPU].count; return ncclSuccess; } ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count) { *count = system->nodes[NET].count; return ncclSuccess; } ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count) { *count = system->nodes[NVS].count; return ncclSuccess; } ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax) { if (system->nodes[GPU].count == 0) return ncclInternalError; int min, max; min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap; for (int g=1; gnodes[GPU].count; g++) { min = std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap); max = std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap); } if (ccMin) *ccMin = min; if (ccMax) *ccMax = max; return ncclSuccess; } ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) { for (int g=0; gnodes[GPU].count; g++) { if (system->nodes[GPU].nodes[g].gpu.rank == rank) { *localRank = g; return ncclSuccess; } } WARN("Could not find local GPU with rank %d", rank); return ncclInternalError; } nccl-2.18.3-1/src/graph/topo.h000066400000000000000000000135251444201535000157100ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. 
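// --- Editor's sketch (not part of NCCL): the affinity intersection done by ncclTopoGetCpuAffinity() above. ---
// The real function finds the CPU (NUMA) node closest to the GPU, reads the process's current
// affinity with sched_getaffinity(), and intersects the two with CPU_AND unless
// NCCL_IGNORE_CPU_AFFINITY is set, in which case the NUMA-local set is used as-is. Finding the
// closest NUMA node is assumed to have happened already; the caller only applies the returned
// mask when it is non-empty, as the real code does.
#define _GNU_SOURCE 1
#include <sched.h>

namespace affinity_sketch {
inline cpu_set_t chooseAffinity(cpu_set_t numaLocal, bool ignoreProcessAffinity) {
  cpu_set_t current, result;
  CPU_ZERO(&current);
  (void)sched_getaffinity(0, sizeof(cpu_set_t), &current);  // current process mask
  if (ignoreProcessAffinity) {
    result = numaLocal;                                      // NCCL_IGNORE_CPU_AFFINITY=1
  } else {
    CPU_AND(&result, &current, &numaLocal);                  // subset of both masks
  }
  return result;                                             // may be empty; caller checks CPU_COUNT()
}
} // namespace affinity_sketch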
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TOPO_H_ #define NCCL_TOPO_H_ #include "graph.h" #include "core.h" #define LOC_BW 5000.0 #define SM60_NVLINK_BW 18.0 #define SM70_NVLINK_BW 20.0 #define SM80_NVLINK_BW 20.0 #define SM90_NVLINK_BW 20.0 #define SM86_NVLINK_BW 12.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 #define SKL_QPI_BW 10.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 #define ARM_BW 6.0 #define NET_BW 12.0 // 100Gbit // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth. #define INTEL_P2P_OVERHEAD(bw) (bw*6/5) #define NCCL_TOPO_NODE_TYPES 7 #define GPU 0 #define PCI 1 #define NVS 2 #define CPU 3 // Actually NUMA domains #define NIC 4 #define NET 5 extern const char* topoNodeTypeStr[]; // We want link types and path types to match as much as possible #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB #define LINK_PCI 3 // Skipping 4 for PATH_PXB // Skipping 5 for PATH_PXN // Skipping 6 for PATH_PHB #define LINK_SYS 7 #define LINK_NET 8 extern const char* topoLinkTypeStr[]; // Local (myself) #define PATH_LOC 0 // Connection traversing NVLink #define PATH_NVL 1 // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 // Connection traversing at most a single PCIe bridge #define PATH_PIX 3 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) #define PATH_PXB 4 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. #define PATH_PXN 5 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) #define PATH_PHB 6 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) #define PATH_SYS 7 // Connection through the network #define PATH_NET 8 // Disconnected #define PATH_DIS 9 extern const char* topoPathTypeStr[]; struct ncclTopoNode; struct ncclTopoLink { int type; float bw; struct ncclTopoNode* remNode; }; #define NCCL_TOPO_MAX_LINKS 32 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) struct ncclTopoLinkList { struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; int count; float bw; int type; }; #define NCCL_TOPO_CPU_INTEL_BDW 1 #define NCCL_TOPO_CPU_INTEL_SKL 2 #define NCCL_TOPO_UNDEF (-1) struct ncclTopoNode { int type; int64_t id; // Type specific data union { struct { int dev; // NVML dev number int rank; int cudaCompCap; int gdrSupport; }gpu; struct { uint64_t asic; int port; float bw; float latency; int gdrSupport; int collSupport; int maxChannels; }net; struct { int arch; int vendor; int model; cpu_set_t affinity; }cpu; struct { uint64_t device; }pci; }; int nlinks; struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // Pre-computed paths to GPUs and NICs struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES]; // Used during search uint64_t used; }; struct ncclTopoNodeSet { int count; struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES]; }; struct ncclTopoSystem { struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; float maxBw; float totalBw; }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id); ncclResult_t 
ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw); ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank); ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem); ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels); ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml); ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* ccMax); static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) { *index = -1; for (int i=0; inodes[type].count; i++) { if (system->nodes[type].nodes[i].id == id) { *index = i; return ncclSuccess; } } return ncclInternalError; } static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) { *index = -1; for (int i=0; inodes[GPU].count; i++) { if (system->nodes[GPU].nodes[i].gpu.rank == rank) { *index = i; return ncclSuccess; } } return ncclInternalError; } static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) { *rank = -1; for (int i=0; inodes[GPU].count; i++) { if (system->nodes[GPU].nodes[i].gpu.dev == dev) { *rank = system->nodes[GPU].nodes[i].gpu.rank; return ncclSuccess; } } return ncclInternalError; } // Returns NVLink bw in GB/s static float ncclTopoNVLinkBw(int cudaCompCap) { return cudaCompCap >= 90 ? SM90_NVLINK_BW : cudaCompCap == 86 ? SM86_NVLINK_BW : cudaCompCap >= 80 ? SM80_NVLINK_BW : cudaCompCap >= 70 ? SM70_NVLINK_BW : cudaCompCap >= 60 ? SM60_NVLINK_BW : SM80_NVLINK_BW; } #endif nccl-2.18.3-1/src/graph/trees.cc000066400000000000000000000074171444201535000162120ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank) /* Btree which alternates leaves and nodes. * Assumes root is 0, which conveniently builds a tree on powers of two, * (because we have pow2-1 ranks) which lets us manipulate bits. * Find first non-zero bit, then : * Find the parent : * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) * xx11[0] -> xx10[0] (3,7,11 below) * Find the children : * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) * * Illustration : * 0---------------8 * ______/ \______ * 4 12 * / \ / \ * 2 6 10 \ * / \ / \ / \ \ * 1 3 5 7 9 11 13 */ ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) { int up, down0, down1; int bit; for (bit=1; bit 0 so it has to be our child 1, not 0. *d1 = nranks > 1 ? bit >> 1 : -1; return ncclSuccess; } up = (rank ^ bit) | (bit << 1); // if smaller than the parent, we are his first child, otherwise we're his second if (up >= nranks) up = (rank ^ bit); *parentChildType = (rank < up) ? 
0 : 1; *u = up; int lowbit = bit >> 1; // down0 is always within bounds down0 = lowbit == 0 ? -1 : rank-lowbit; down1 = lowbit == 0 ? -1 : rank+lowbit; // Make sure down1 is within bounds while (down1 >= nranks) { down1 = lowbit == 0 ? -1 : rank+lowbit; lowbit >>= 1; } *d0 = down0; *d1 = down1; return ncclSuccess; } /* Build a double binary tree. Take the previous tree for the first tree. * For the second tree, we use a mirror tree (if nranks is even) * * 0---------------8 3----------------11 * ______/ \ / \______ * 4 \ / 7 * / \ \ / / \ * 2 6 10 1 5 9 * / \ / \ / \ / \ / \ / \ * 1 3 5 7 9 11 0 2 4 6 8 10 * * or shift it by one rank (if nranks is odd). * * 0---------------8 1---------------9 * ______/ \______ ______/ \______ * 4 12 5 0 * / \ / / \ / * 2 6 10 3 7 11 * / \ / \ / \ / \ / \ / \ * 1 3 5 7 9 11 2 4 6 8 10 12 */ ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) { // First tree ... use a btree ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0); // Second tree ... mirror or shift if (nranks % 2 == 1) { // shift int shiftrank = (rank-1+nranks) % nranks; int u, d0, d1; ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1); *s1 = u == -1 ? -1 : (u+1) % nranks; *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks; *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks; } else { // mirror int u, d0, d1; ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1); *s1 = u == -1 ? -1 : nranks-1-u; *d1_0 = d0 == -1 ? -1 : nranks-1-d0; *d1_1 = d1 == -1 ? -1 : nranks-1-d1; } return ncclSuccess; } nccl-2.18.3-1/src/graph/tuning.cc000066400000000000000000000460411444201535000163700ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "devcomm.h" #include "comm.h" #include "topo.h" NCCL_PARAM(Nthreads, "NTHREADS", -2); NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2); static int getNthreads(const char* name, int env, int min, int max, int def) { int nt = env; if (nt > 0) { if (nt % WARP_SIZE != 0) { WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); nt = max; } else if (nt > max) { WARN("Invalid %s %d (maximum %d).", name, nt, max); nt = max; } else if (nt < min) { WARN("Invalid %s %d (minimum %d).", name, nt, min); nt = min; } } else { nt = def; } return nt; } ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) { int def, set; if (str[0] == '^') { def = 1; set = 0; str++; } else { def = 0; set = 1; } for (int i=0; itopo, &cpuArch, &cpuVendor, &cpuModel)); if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; else return 1.0; } ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 
256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads); comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); int nNodes = comm->nNodes; int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; int cpuArch, cpuVendor, cpuModel; NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0; double llMaxBw = llMaxBws[index1][index2]; double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; a 1 ? 2*nNodes :0) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 : nNodes; for (int a=0; abwIntra : graphs[a]->bwInter; float busBw = graphs[a]->nChannels * bw; // Various model refinements if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); } if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw); if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { // Collnet+Direct requires all GPUs to have a local NIC to work at full speed float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio factor -= (factor-1)/2; busBw /= factor; } if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85; // Convert bus BW to algorithm BW float ratio; if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps; else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0; else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1)); else ratio = .5; comm->bandwidths[coll][a][p] = busBw * ratio; comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; float interLat = hwLat[NCCL_HW_NET][a][p] + graphs[a]->latencyInter; // Also add the flush extra latency if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter; if (a == NCCL_ALGO_RING) { float lat = hwLat[hw[a]][a][p]; if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { if (graphs[a]->sameChannels) { comm->latencies[coll][a][p] += lat; } else { if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling comm->latencies[coll][a][p] += nsteps*lat; } } else { // Inter-node rings still have to launch nsteps * net overhead. float netOverhead = 0.0; if (nNodes > 1) { netOverhead = getNetOverhead(comm); if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3; } intraLat = std::max(intraLat, netOverhead); comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat; } } else if (a == NCCL_ALGO_TREE) { comm->latencies[coll][a][p] += 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); } else if (a == NCCL_ALGO_COLLNET_DIRECT) { comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat; // Add 0.5 arity serialization latency } else if (a == NCCL_ALGO_COLLNET_CHAIN) { comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat; } else if (a == NCCL_ALGO_NVLS) { if (nNodes > 1) comm->latencies[coll][a][p] += hwLat[NCCL_HW_NET][a][p]; } else if (a == NCCL_ALGO_NVLS_TREE) { comm->latencies[coll][a][p] += 2*(nNodes-1)*hwLat[NCCL_HW_NET][a][p]; } } } } // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. 
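// --- Editor's sketch (not part of NCCL): the bus-BW -> algorithm-BW conversion in the loop above. ---
// For ring allreduce the tuning model uses nsteps = 2*(nRanks-1) and ratio = nRanks/nsteps,
// i.e. the classical n/(2*(n-1)) factor between bus bandwidth and algorithm bandwidth.
// The function and numbers below are illustrative only, not measurements.
namespace bw_sketch {
inline float ringAllReduceAlgoBw(float busBwPerChannel, int nChannels, int nRanks) {
  float busBw = busBwPerChannel * nChannels;        // aggregate over channels
  int nsteps  = 2 * (nRanks - 1);                   // reduce-scatter + all-gather steps
  float ratio = (1.0f * nRanks) / nsteps;           // e.g. 8 ranks -> 8/14 ~= 0.57
  return busBw * ratio;                             // what ends up in comm->bandwidths[][][]
}
} // namespace bw_sketch
// Example: 12 channels at 20 GB/s each on 8 ranks -> 240 * 8/14 ~= 137 GB/s algorithm bandwidth.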
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 }; const char *protoStr = getenv("NCCL_PROTO"); if (protoStr) { INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr); NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); } const char *algoStr = getenv("NCCL_ALGO"); if (algoStr) { INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr); NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); } if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0; // Disable CollNet if it is not supported if (comm->collNetSupport == 0) { algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0; // If user has hard set NCCL_ALGO=COLLNET, ignore it if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) { algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1; if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET"); } } else { // Disable CollNet+Direct if not on an NVSwitch system int nvsCount = 0; NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount)); if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; } for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); pEnable &= (graphs[a]->typeIntra <= PATH_NVB); pEnable &= (minCompCap == maxCompCap); switch (minCompCap) { case 70: pEnable &= 1; break; case 80: pEnable &= 1; break; case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; default: pEnable &= 0; break; } } if (pEnable == 0) comm->bandwidths[c][a][p] = 0; // Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE. 
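// --- Editor's sketch (not part of NCCL): the NCCL_ALGO / NCCL_PROTO list semantics used above. ---
// parseList() (defined earlier in tuning.cc) starts from "all enabled" when the string begins
// with '^' (the listed entries are then disabled) and from "all disabled" otherwise (the listed
// entries are enabled). This stand-alone re-implementation is a simplified stand-in: it assumes
// comma-separated, exact-match names.
#include <sstream>
#include <string>
#include <vector>

namespace list_sketch {
inline void parseEnableList(const char* str, const std::vector<std::string>& names, int* enable) {
  int def, set;
  if (str[0] == '^') { def = 1; set = 0; str++; }   // "^LL128"    -> everything except LL128
  else               { def = 0; set = 1; }          // "RING,TREE" -> only RING and TREE
  for (size_t i = 0; i < names.size(); i++) enable[i] = def;
  std::stringstream ss(str);
  std::string tok;
  while (std::getline(ss, tok, ',')) {
    for (size_t i = 0; i < names.size(); i++)
      if (tok == names[i]) enable[i] = set;
  }
}
} // namespace list_sketch
// e.g. NCCL_PROTO="^LL128" keeps LL and Simple enabled; NCCL_ALGO="RING" disables all other algorithms.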
if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue; if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; } if (comm->rank == 0) { char line[1024]; for (int block=0; block<2; block++) { sprintf(line, " Algorithm |"); for (int ba=0; bamaxThreads[a][p]); } } INFO(NCCL_TUNING, "%s", line); for (int c=0; clatencies[c][a][p], comm->bandwidths[c][a][p]); } } INFO(NCCL_TUNING, "%s", line); } } } // Set per-thread amount of work before we increase nThreads and nChannels for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD; comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD; } comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks; comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 512; comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512; // Override defaults with user env char* str = getenv("NCCL_THREAD_THRESHOLDS"); if (str) { INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }}; sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); for (int a=0; a<2; a++) { for (int p=0; p= 0) comm->threadThresholds[a][p] = t[a][p]; } } } INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld", comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128], comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE], comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]); return ncclSuccess; } // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction // factor is not ideal but works quite well. Powers of two, 64 B to 256MB. static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0, 1.0 }, { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .6, .6, .6, .8, .9, .9, .9, .9, 1.0, 1.0 }, { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) { float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; float lat = info->comm->latencies[info->coll][algorithm][protocol]; if (bw == 0) { *time = -1.0; return ncclSuccess; } int logSize = log2i(info->nBytes>>6); if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 && info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) { lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring } // Tree pipelining saves latency in aggregation cases int latCount = algorithm == NCCL_ALGO_RING ? 
numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS); *time = lat * latCount + (info->nBytes) / (1000 * bw); return ncclSuccess; } nccl-2.18.3-1/src/graph/xml.cc000066400000000000000000000717301444201535000156670ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include #include #include #include #include #include "core.h" #include "nvmlwrap.h" #include "xml.h" /*******************/ /* XML File Parser */ /*******************/ ncclResult_t xmlGetChar(FILE* file, char* c) { if (fread(c, 1, 1, file) == 0) { WARN("XML Parse : Unexpected EOF"); return ncclInternalError; } return ncclSuccess; } ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { char c; NCCLCHECK(xmlGetChar(file, &c)); if (c != '"' && c != '\'') { #if INT_OK int o = 0; do { value[o++] = c; NCCLCHECK(xmlGetChar(file, &c)); } while (c >= '0' && c <= '9'); value[o] = '\0'; *last = c; return ncclSuccess; #else WARN("XML Parse : Expected (double) quote."); return ncclInternalError; #endif } int o = 0; do { NCCLCHECK(xmlGetChar(file, &c)); value[o++] = c; } while (c != '"'); value[o-1] = '\0'; NCCLCHECK(xmlGetChar(file, last)); return ncclSuccess; } ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) { char c; char* ptr = name; int o = 0; do { NCCLCHECK(xmlGetChar(file, &c)); if (c == '=') { ptr[o] = '\0'; if (value == NULL) { WARN("XML Parse : Unexpected value with name %s", ptr); return ncclInternalError; } return xmlGetValue(file, value, last); } ptr[o] = c; if (o == MAX_STR_LEN-1) { ptr[o] = '\0'; WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN); return ncclInternalError; } o++; } while (c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r'); ptr[o-1] = '\0'; *last = c; return ncclSuccess; } // Shift the 3-chars string by one char and append c at the end #define SHIFT_APPEND(s, c) do { s[0]=s[1]; s[1]=s[2]; s[2]=c; } while(0) ncclResult_t xmlSkipComment(FILE* file, char* start, char next) { // Start from something neutral with \0 at the end. char end[4] = "..."; // Inject all trailing chars from previous reads. We don't need // to check for --> here because there cannot be a > in the name. 
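// Example of what this handles (illustrative): for "<!-- machine generated -->", xmlGetNode
// reads "!--" as the element name and hands the rest to this routine, which simply consumes
// characters until the three-character window below matches "-->".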
for (int i=0; i" while (strcmp(end, "-->") != 0) { int c; if (fread(&c, 1, 1, file) != 1) { WARN("XML Parse error : unterminated comment"); return ncclInternalError; } SHIFT_APPEND(end, c); } return ncclSuccess; } ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) { node->type = NODE_TYPE_NONE; char c = ' '; while (c == ' ' || c == '\n' || c == '\r') { if (fread(&c, 1, 1, file) == 0) return ncclSuccess; } if (c != '<') { WARN("XML Parse error : expecting '<', got '%c'", c); return ncclInternalError; } // Read XML element name NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); // Check for comments if (strncmp(node->name, "!--", 3) == 0) { NCCLCHECK(xmlSkipComment(file, node->name+3, c)); return xmlGetNode(file, node); } // Check for closing tag if (node->name[0] == '\0' && c == '/') { node->type = NODE_TYPE_CLOSE; // Re-read the name, we got '/' in the first call NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); if (c != '>') { WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name); return ncclInternalError; } return ncclSuccess; } node->type = NODE_TYPE_OPEN; // Get Attributes int a = 0; while (c == ' ') { NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c)); if (a == MAX_ATTR_COUNT) { INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT); // Actually we need to still consume the extra attributes so we have an extra one. } else a++; } node->nAttrs = a; if (c == '/') { node->type = NODE_TYPE_SINGLE; char str[MAX_STR_LEN]; NCCLCHECK(xmlGetToken(file, str, NULL, &c)); } if (c != '>') { WARN("XML Parse : expected >, got '%c'", c); return ncclInternalError; } return ncclSuccess; } typedef ncclResult_t (*xmlHandlerFunc_t)(FILE*, struct ncclXml*, struct ncclXmlNode*); struct xmlHandler { const char * name; xmlHandlerFunc_t func; }; ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) { if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess; while (1) { if (xml->maxIndex == MAX_NODES) { WARN("Error : XML parser is limited to 1024 nodes"); return ncclInternalError; } struct ncclXmlNode* node = xml->nodes+xml->maxIndex; memset(node, 0, sizeof(struct ncclXmlNode)); NCCLCHECK(xmlGetNode(file, node)); if (node->type == NODE_TYPE_NONE) { if (head) { WARN("XML Parse : unterminated %s", head->name); return ncclInternalError; } else { // All done return ncclSuccess; } } if (head && node->type == NODE_TYPE_CLOSE) { if (strcmp(node->name, head->name) != 0) { WARN("XML Mismatch : %s / %s", head->name, node->name); return ncclInternalError; } return ncclSuccess; } int found = 0; for (int h=0; hname, handlers[h].name) == 0) { if (head) head->subs[head->nSubs++] = node; node->parent = head; node->nSubs = 0; xml->maxIndex++; NCCLCHECK(handlers[h].func(file, xml, node)); found = 1; break; } } if (!found) { if (nHandlers) INFO(NCCL_GRAPH, "Ignoring element %s", node->name); NCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0)); } } } /**************/ /* XML Writer */ /**************/ ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) { for (int i=0; iname); for (int a=0; anAttrs; a++) { fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value); } if (node->nSubs == 0) { fprintf(file, "/>\n"); } else { fprintf(file, ">\n"); for (int s=0; snSubs; s++) { NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[s])); } for (int i=0; i\n", node->name); } return ncclSuccess; } ncclResult_t 
ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) { FILE* file = fopen(xmlTopoFile, "w"); if (file == NULL) { WARN("Unable to open %s, not dumping topology.", xmlTopoFile); return ncclSuccess; } NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes)); fclose(file); return ncclSuccess; } /****************************************/ /* Parser rules for our specific format */ /****************************************/ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "net", ncclTopoXmlLoadNet } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "nic", ncclTopoXmlLoadNic } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); return ncclSuccess; } ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { int version; NCCLCHECK(xmlGetAttrInt(head, "version", &version)); if (version != NCCL_TOPO_XML_VERSION) { WARN("XML Topology has wrong version %d, %d needed", version, NCCL_TOPO_XML_VERSION); return ncclInvalidUsage; } const char* name; NCCLCHECK(xmlGetAttr(head, "name", &name)); if (name != NULL) INFO(NCCL_GRAPH, "Loading topology %s", name); else INFO(NCCL_GRAPH, "Loading unnamed topology"); struct xmlHandler handlers[] = { { "cpu", ncclTopoXmlLoadCpu } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml, int warn) { FILE* file = fopen(xmlTopoFile, "r"); if (file == NULL) { if (warn) { WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); } return ncclSuccess; } INFO(NCCL_GRAPH, "Loading topology file %s", xmlTopoFile); struct xmlHandler handlers[] = { { "system", ncclTopoXmlLoadSystem } }; xml->maxIndex = 0; NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); fclose(file); return ncclSuccess; } /**********************/ /* XML creation */ /* from autodetection */ /**********************/ #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) static void memcpylower(char* dst, const char* src, const size_t size) { for (int i=0; i %s=%s", path, fileName, attrName, strValue); return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { int index; NCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index)); if (index == -1) { const char* numaId; NCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId)); if (numaId == NULL) { 
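// Without a numaid we cannot build the /sys/devices/system/node/node<N>/cpumap path used
// just below to derive the CPU affinity mask, so give up here.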
WARN("GetXmlFromCpu : could not find CPU numa ID."); return ncclInternalError; } // Set affinity char cpumaskPath[] = "/sys/devices/system/node/node0000"; sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId); NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity")); } NCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index)); if (index == -1) { // Fill CPU type / vendor / model #if defined(__PPC__) NCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64")); #elif defined(__aarch64__) NCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64")); #elif defined(__x86_64__) NCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64")); #endif } #if defined(__x86_64__) NCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index)); if (index == -1) { union { struct { // CPUID 0 String register order uint32_t ebx; uint32_t edx; uint32_t ecx; }; char vendor[12]; } cpuid0; asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0) : "memory"); char vendor[13]; strncpy(vendor, cpuid0.vendor, 12); vendor[12] = '\0'; NCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor)); } NCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index)); if (index == -1) { union { struct { unsigned steppingId:4; unsigned modelId:4; unsigned familyId:4; unsigned processorType:2; unsigned resv0:2; unsigned extModelId:4; unsigned extFamilyId:8; unsigned resv1:4; }; uint32_t val; } cpuid1; asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1) : "memory"); int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4); int modelId = cpuid1.modelId + (cpuid1.extModelId << 4); NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId)); NCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId)); } #endif return ncclSuccess; } ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) { NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId)); if (*pciNode == NULL) { NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode)); NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId)); } return ncclSuccess; } // Check whether a string is in BDF format or not. // BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits. // There can be trailing chars. 
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); } int checkBDFFormat(char* bdf) { if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0; if (isHex(bdf[0]) == 0 || isHex(bdf[1] == 0) || isHex(bdf[2] == 0) || isHex(bdf[3] == 0) || isHex(bdf[5] == 0) || isHex(bdf[6] == 0) || isHex(bdf[8] == 0) || isHex(bdf[9] == 0) || isHex(bdf[11] == 0)) return 0; return 1; } ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) { // Fill info, then parent const char* busId; NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); char* path = NULL; ncclDebugNoWarn = NCCL_GRAPH; getPciPath(busId, &path); ncclDebugNoWarn = 0; if (path) { NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } int index; ncclDebugNoWarn = NCCL_GRAPH; NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"); } NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index)); if (index == -1) { if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"); } ncclDebugNoWarn = 0; NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); if (index == -1) { if (path) { char deviceSpeedStr[MAX_STR_LEN]; float deviceSpeed; NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr)); sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed); char portSpeedStr[MAX_STR_LEN]; float portSpeed; NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr)); sscanf(portSpeedStr, "%f GT/s", &portSpeed); NCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? portSpeedStr : deviceSpeedStr)); } else { NCCLCHECK(xmlSetAttr(pciNode, "link_speed", "")); } } NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index)); if (index == -1) { if (path) { char strValue[MAX_STR_LEN]; NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue)); int deviceWidth = strtol(strValue, NULL, 0); NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue)); int portWidth = strtol(strValue, NULL, 0); NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth))); } else { NCCLCHECK(xmlSetAttr(pciNode, "link_width", "")); } } struct ncclXmlNode* parent = pciNode->parent; if (parent == NULL) { if (path) { // Save that for later in case next step is a CPU char numaIdStr[MAX_STR_LEN]; NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr)); // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI // switch, or stop if we reach a CPU root complex. int slashCount = 0; int parentOffset; for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) { if (path[parentOffset] == '/') { slashCount++; path[parentOffset] = '\0'; int start = parentOffset - 1; while (start>0 && path[start] != '/') start--; // Check whether the parent path looks like "BBBB:BB:DD.F" or not. if (checkBDFFormat(path+start+1) == 0) { // This a CPU root complex. Create a CPU tag and stop there. 
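// Illustrative example (exact sysfs layout varies by system): for a device path such as
// /sys/devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0, components that match the BDF pattern
// ("0000:3a:00.0") are followed as parent PCI bridges/switches, while "pci0000:3a" fails
// checkBDFFormat() and is therefore handled here as the CPU root complex.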
struct ncclXmlNode* topNode; NCCLCHECK(xmlFindTag(xml, "system", &topNode)); NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr)); if (parent == NULL) { NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent)); NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr)); } } else if (slashCount == 2) { // Continue on the upper PCI switch for (int i = strlen(path)-1; i>0; i--) { if (path[i] == '/') { NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1)); if (parent == NULL) { NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent)); NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1)); } break; } } } } if (parent) break; } } else { // No information on /sys, attach GPU to unknown CPU NCCLCHECK(xmlFindTagKv(xml, "cpu", &parent, "numaid", "-1")); if (parent == NULL) { struct ncclXmlNode* topNode; NCCLCHECK(xmlFindTag(xml, "system", &topNode)); NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent)); NCCLCHECK(xmlSetAttr(parent, "numaid", "-1")); NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml)); } } pciNode->parent = parent; parent->subs[parent->nSubs++] = pciNode; } if (strcmp(parent->name, "pci") == 0) { NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); } else if (strcmp(parent->name, "cpu") == 0) { NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml)); } free(path); return ncclSuccess; } ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) { struct ncclXmlNode* gpuNode = NULL; NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode)); if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode)); int index = -1; int dev = -1; NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index)); if (index == -1) { NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev)); } NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev)); if (dev == -1) { *gpuNodeRet = NULL; return ncclSuccess; } NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index)); if (index == -1) { int cudaMajor, cudaMinor; if (nvmlDev == NULL) { cudaDeviceProp devProp; CUDACHECK(cudaGetDeviceProperties(&devProp, dev)); cudaMajor = devProp.major; cudaMinor = devProp.minor; } else { NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); } NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor)); } int sm; NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm)); struct ncclXmlNode* nvlNode = NULL; NCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode)); if (nvlNode == NULL) { // NVML NVLink detection int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18; if (maxNvLinks > 0 && nvmlDev == NULL) { WARN("No NVML device handle. 
Skipping nvlink detection."); maxNvLinks = 0; } for (int l=0; l= 11080 if (sm >= 90) { nvmlFieldValue_t fv; fv.fieldId = NVML_FI_DEV_NVLINK_GET_STATE; fv.scopeId = l; // fv.value will contain NV_FEATURE_ENABLED or NV_FEATURE_DISABLED if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS)) isActive = (nvmlEnableState_t) fv.value.uiVal; } else /* FALLTHRU to GetNvLinkState if before SM90 */ #endif { (void) ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive); } if (isActive != NVML_FEATURE_ENABLED) continue; // Try to figure out what's on the other side of the NVLink nvmlPciInfo_t remoteProc; if (ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; // Make a lower case copy of the bus ID for calling ncclDeviceType // PCI system path is in lower case char* p = remoteProc.busId; char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; for (int c=0; cnSubs; s++) { struct ncclXmlNode* sub = gpuNode->subs[s]; if (strcmp(sub->name, "nvlink") != 0) continue; int index; NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index)); if (index == -1) { const char* busId; NCCLCHECK(xmlGetAttr(sub, "target", &busId)); char* path; ncclDebugNoWarn = NCCL_GRAPH; getPciPath(busId, &path); ncclDebugNoWarn = 0; if (path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) { // Remote NVLink device is not visible inside this VM. Assume NVSwitch. NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000")); } else { NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); free(path); } } } *gpuNodeRet = gpuNode; return ncclSuccess; } ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) { struct ncclXmlNode* node; NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node)); NCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03")); NCCLCHECK(ncclTopoGetXmlFromSys(node, xml)); nvmlDevice_t nvmlDev; NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev)); NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode)); return ncclSuccess; } // Returns the subsystem name of a path, i.e. the end of the path // where sysPath/subsystem points to. ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { char subSysPath[PATH_MAX]; sprintf(subSysPath, "%s/subsystem", sysPath); char* path = realpath(subSysPath, NULL); if (path == NULL) { subSys[0] = '\0'; } else { int offset; for (offset = strlen(path); offset > 0 && path[offset] != '/'; offset--); strcpy(subSys, path+offset+1); free(path); } return ncclSuccess; } ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) { NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName)); if (*netNode != NULL) return ncclSuccess; const char* pciSysPath = pciPath; if (pciSysPath) { char subSystem[PATH_MAX]; NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); // This is not a PCI device (virtual, usb, ...). if (strcmp(subSystem, "pci") != 0) { INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). 
Attaching to first CPU", pciSysPath, subSystem); pciSysPath = NULL; } } struct ncclXmlNode* parent = NULL; if (pciSysPath) { int offset; for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; strcpy(busId, pciSysPath+offset+1); NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent)); NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02")); NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); } else { // Virtual NIC, no PCI device, attach to first CPU NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); } struct ncclXmlNode* nicNode = NULL; NCCLCHECK(xmlGetSub(parent, "nic", &nicNode)); if (nicNode == NULL) { NCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode)); } // We know that this net does not exist yet (we searched for it at the // beginning of this function), so we can add it. NCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode)); NCCLCHECK(xmlSetAttr(*netNode, "name", netName)); return ncclSuccess; } ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) { const char* str; NCCLCHECK(xmlGetAttr(node, "keep", &str)); if (str && strcmp(str, "1") == 0) { NCCLCHECK(xmlUnsetAttr(node, "keep")); } else { // Copy nSubs and subs as they could change as we trim recursively. struct ncclXmlNode* subs[MAX_SUBS]; int nSubs = node->nSubs; memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*)); for (int s=0; snSubs == 0) NCCLCHECK(xmlRemoveNode(node)); } return ncclSuccess; } ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) { NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes)); return ncclSuccess; } /**************************************************/ /* Parser rules for the user-defined graph search */ /**************************************************/ ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "net", ncclTopoXmlGraphLoadNet }, { "gpu", ncclTopoXmlGraphLoadGpu } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { struct xmlHandler handlers[] = { { "channel", ncclTopoXmlGraphLoadChannel } }; NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) { int version; NCCLCHECK(xmlGetAttrInt(head, "version", &version)); if (version != NCCL_GRAPH_XML_VERSION) { WARN("XML Graph has wrong version %d, %d needed", version, NCCL_GRAPH_XML_VERSION); return ncclInvalidUsage; } const char* name; NCCLCHECK(xmlGetAttr(head, "name", &name)); if (name != NULL) INFO(NCCL_GRAPH, "Loading graphs for topology %s", name); else INFO(NCCL_GRAPH, "Loading graphs"); struct xmlHandler handlers[] = { { "graph", ncclTopoXmlGraphLoadGraph } }; NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1)); return ncclSuccess; } ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml) { FILE* file = fopen(xmlGraphFile, "r"); if (file == NULL) { WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno)); return ncclSystemError; } struct xmlHandler handlers[] = { { "graphs", 
ncclTopoXmlGraphLoadGraphs } }; xml->maxIndex = 0; NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); fclose(file); return ncclSuccess; } nccl-2.18.3-1/src/graph/xml.h000066400000000000000000000223351444201535000155260ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef XML_H_ #define XML_H_ #include "nccl.h" #include "debug.h" #include "checks.h" #include // A few constraints to make the implementation easy #define MAX_STR_LEN 255 #define MAX_ATTR_COUNT 16 #define MAX_SUBS 32 #define MAX_NODES 1024 #define NODE_TYPE_NONE 0 #define NODE_TYPE_OPEN 1 #define NODE_TYPE_CLOSE 2 #define NODE_TYPE_SINGLE 3 struct ncclXmlNode { char name[MAX_STR_LEN+1]; struct { char key[MAX_STR_LEN+1]; char value[MAX_STR_LEN+1]; } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params int nAttrs; int type; struct ncclXmlNode* parent; struct ncclXmlNode* subs[MAX_SUBS]; int nSubs; }; struct ncclXml { struct ncclXmlNode nodes[MAX_NODES]; int maxIndex; }; /* File functions */ #define NCCL_TOPO_XML_VERSION 1 ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml, int warn); ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml); #define NCCL_GRAPH_XML_VERSION 1 ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml); /* Auto-detect functions */ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode); ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode); /* Remove unneeded parts */ ncclResult_t ncclTopoTrimXml(struct ncclXml* xml); /**************/ /* XML Struct */ /* Functions */ /**************/ static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) { *index = -1; const int nAttrs = node->nAttrs; for (int a=0; aattrs[a].key, attrName, MAX_STR_LEN) == 0) { *index = a; return ncclSuccess; } } return ncclSuccess; } static ncclResult_t xmlGetAttr(struct ncclXmlNode* node, const char* attrName, const char** value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); *value = index == -1 ? NULL : node->attrs[index].value; return ncclSuccess; } static ncclResult_t xmlGetAttrStr(struct ncclXmlNode* node, const char* attrName, const char** value) { NCCLCHECK(xmlGetAttr(node, attrName, value)); if (*value == NULL) { WARN("Attribute %s of node %s not found", attrName, node->name); return ncclInternalError; } return ncclSuccess; } static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName, int* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); *value = strtol(str, NULL, 0); return ncclSuccess; } static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* attrName, int* value, int defaultValue) { const char* str; NCCLCHECK(xmlGetAttr(node, attrName, &str)); *value = str ? 
strtol(str, NULL, 0) : defaultValue; return ncclSuccess; } static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); *value = strtof(str, NULL); return ncclSuccess; } static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) { *node = NULL; for (int i=0; imaxIndex; i++) { struct ncclXmlNode* n = xml->nodes+i; if (strcmp(n->name, tagName) == 0) { *node = n; return ncclSuccess; } } return ncclSuccess; } static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) { *node = NULL; for (int i=0; imaxIndex; i++) { struct ncclXmlNode* n = xml->nodes+i; if (strcmp(n->name, tagName) == 0) { const char* value; NCCLCHECK(xmlGetAttr(n, attrName, &value)); if (value && strcmp(value, attrValue) == 0) { *node = n; return ncclSuccess; } } } return ncclSuccess; } static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; } strncpy(node->attrs[index].value, value, MAX_STR_LEN); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlSetAttrIfUnset(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index != -1) return ncclSuccess; index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; strncpy(node->attrs[index].value, value, MAX_STR_LEN); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value); node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* attrName) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); if (index == -1) return ncclSuccess; for (int i=index+1; inAttrs; i++) { strcpy(node->attrs[i-1].key, node->attrs[i].key); strcpy(node->attrs[i-1].value, node->attrs[i].value); } node->nAttrs--; return ncclSuccess; } static ncclResult_t xmlGetSub(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub) { *sub = NULL; for (int s=0; snSubs; s++) { if (strcmp(node->subs[s]->name, subName) == 0) { *sub = node->subs[s]; return ncclSuccess; } } return ncclSuccess; } static ncclResult_t xmlGetSubKv(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const char* attrValue) { *sub = NULL; for 
(int s=0; snSubs; s++) { struct ncclXmlNode* subNode = node->subs[s]; if (strcmp(subNode->name, subName) == 0) { const char* value; NCCLCHECK(xmlGetAttr(subNode, attrName, &value)); if (value && strcmp(value, attrValue) == 0) { *sub = node->subs[s]; return ncclSuccess; } } } return ncclSuccess; } static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const int attrValue) { char strValue[10]; snprintf(strValue, 10, "%d", attrValue); NCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue)); return ncclSuccess; } static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) { if (xml->maxIndex == MAX_NODES) { WARN("Error : too many XML nodes (max %d)", MAX_NODES); return ncclInternalError; } struct ncclXmlNode* s = xml->nodes+xml->maxIndex++; s->nSubs = 0; s->nAttrs = 0; *sub = s; s->parent = parent; if (parent) parent->subs[parent->nSubs++] = s; strncpy(s->name, subName, MAX_STR_LEN); s->name[MAX_STR_LEN] = '\0'; return ncclSuccess; } static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) { node->type = NODE_TYPE_NONE; struct ncclXmlNode* parent = node->parent; if (parent == NULL) return ncclSuccess; int shift = 0; for (int s=0; snSubs; s++) { if (parent->subs[s] == node) shift = 1; else if (shift) parent->subs[s-1] = parent->subs[s]; } parent->nSubs--; return ncclSuccess; } // Dictionary for STR -> INT conversions. No dictionary size information, // there needs to be a last element with str == NULL. struct kvDict { const char* str; int value; }; static ncclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) { struct kvDict* d = dict; while (d->str) { if (strncmp(str, d->str, strlen(d->str)) == 0) { *value = d->value; return ncclSuccess; } d++; } INFO(NCCL_GRAPH, "KV Convert to int : could not find value of '%s' in dictionary, falling back to %d", str, d->value); *value = d->value; return ncclSuccess; } static ncclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) { struct kvDict* d = dict; while (d->str) { if (value == d->value) { *str = d->str; return ncclSuccess; } d++; } WARN("KV Convert to str : could not find value %d in dictionary", value); return ncclInternalError; } #endif nccl-2.18.3-1/src/group.cc000066400000000000000000000362431444201535000151220ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "group.h" #include "debug.h" #include "enqueue.h" #include "transport.h" #include "channel.h" #include __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; __thread struct ncclComm* ncclGroupCommHead = nullptr; __thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; __thread struct ncclIntruQueue ncclAsyncJobs; __thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL; __thread struct ncclGroupJob ncclGroupJobMain; __thread int ncclGroupBlocking = -1; /* default mode */ __thread bool ncclGroupJobAbortFlag = false; void* ncclAsyncJobMain(void* arg); static ncclResult_t groupJobComplete(struct ncclGroupJob *job); ncclResult_t ncclAsyncLaunch( struct ncclAsyncJob* job, ncclResult_t(*func)(struct ncclAsyncJob*), void(*undo)(struct ncclAsyncJob*), void(*destructor)(void*), ncclComm_t comm ) { ncclResult_t ret = ncclSuccess; if (ncclGroupDepth == 0) { ret = func(job); if (ret != ncclSuccess && undo) undo(job); if (destructor) destructor(job); } else { job->func = func; job->undo = undo; job->destructor = destructor; job->abortFlag = comm->abortFlag; job->childAbortFlag = comm->childAbortFlag; job->state = ncclGroupJobRunning; job->comm = comm; /* check if there are blocking and nonblocking comms at the same time in group. */ if (ncclGroupBlocking == -1) { /* first met communicator */ ncclGroupBlocking = comm->config.blocking; } else if (ncclGroupBlocking != comm->config.blocking) { WARN("Blocking and nonblocking communicators are not allowed in the same group."); ret = ncclInvalidArgument; } ncclIntruQueueEnqueue(&ncclAsyncJobs, job); } return ret; } void* ncclAsyncJobMain(void* arg) { struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg; job->result = job->func(job); if (job->result != ncclSuccess) { INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result); } __atomic_store_n(&job->state, ncclGroupJobDone, __ATOMIC_RELEASE); return arg; } ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { ncclResult_t ret; SYSCHECK(pthread_join(job->thread, NULL), "pthread_join"); if (job->result != ncclSuccess) { WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result); } ret = job->result; if (job->destructor) job->destructor((void*)job); return ret; } NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(ncclGroupStartInternal()); TRACE_CALL("ncclGroupStart()"); return ret; } NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { ncclResult_t ret = ncclSuccess; NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit); TRACE_CALL("ncclGroupEnd()"); exit: return ret; } struct ncclPreconnectJob { struct ncclAsyncJob base; struct ncclComm* comm; }; ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) { struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; struct ncclComm* comm = job->comm; CUDACHECK(cudaSetDevice(comm->cudaDev)); if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); return ncclSuccess; } static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; struct ncclComm* cliqueComm0 = head->intraComm0; struct ncclComm* cliqueHead = head; struct ncclComm* cliqueNextHead; bool useBarrier = 
ncclParamLaunchMode == ncclLaunchModeGroup; // This outer loop iterates over cliques of comms which are siblings of the // same global entity. We calculate a clique as all comms which have the same // `intraComm0` value. do { struct ncclComm* comm = cliqueHead; bool capturingYes = false, capturingNo = false; do { (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); comm = comm->groupNext; } while (comm != nullptr && comm->intraComm0 == cliqueComm0); cliqueNextHead = comm; if (capturingYes && capturingNo) { // We have entered barriers but are aborting without leaving them. Thus // these comms are permanently trashed. We need a good mechanism for // tracking and reporting that. WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured."); result = ncclInvalidUsage; goto failure; } while (true) { // Iterate rounds of launches for clique. bool moreRounds; comm = cliqueHead; do { // Iterate clique members. struct ncclComm* next = comm->groupNext; if (useBarrier) { // Barrier reduction result tells us if this was the final round. moreRounds = 0 != ncclCommIntraBarrierOut(comm); } else { moreRounds = comm->unlaunchedPlansHead != nullptr; } if (moreRounds) { // Pop next unlaunched kernel struct ncclKernelPlan* plan = comm->unlaunchedPlansHead; if (plan != nullptr) { comm->unlaunchedPlansHead = plan->next; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); } // Barrier reduction input indicates if we require further rounds. if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0); if (plan != nullptr) { NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure); } } else { // Final round. CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure); } comm = next; } while (comm != cliqueNextHead); if (!moreRounds) break; } cliqueHead = cliqueNextHead; } while (cliqueHead != nullptr); failure: return result; } static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) { struct ncclComm* comm = *groupCommHeadPtr; while (comm != nullptr) { struct ncclComm* next = comm->groupNext; (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext // We don't know if preconnect succeeded or happened at all, so clear // the flags that let `taskAppend()` skip over checking if preconnect // is needed. comm->preconnectNext = reinterpret_cast(0x1); for (int i = 0; i < comm->nRanks; i++) { comm->tasks.peers[i].sendSeen = false; comm->tasks.peers[i].recvSeen = false; comm->connectSend[i] = 0UL; comm->connectRecv[i] = 0UL; } comm->unlaunchedPlansHead = nullptr; // Reclaim abandoned kernel plan memory. Note ncclWork structs were already // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. while (!ncclIntruQueueEmpty(&comm->planQueue)) { struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue); // Persistent plans will be reclaimed via the callbackQueue when the // graph drops its UserObject reference. 
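// Non-persistent plans, by contrast, are torn down right away below: their queued proxy ops
// and the plan structs themselves are returned to the communicator's memory pools.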
if (!plan->persistent) { for (int c = 0; c < MAXCHANNELS; c++) { while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) { struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue); ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); } } ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } } // Reset comm->tasks to empty. comm->tasks.nTasksColl = 0; comm->tasks.nTasksP2p = 0; comm->tasks.streams = nullptr; ncclIntruQueueConstruct(&comm->tasks.collQueue); comm->tasks.collBytesTotal = 0; for (int i = 0; i < comm->nRanks; i++) { ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue); ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue); } if (!comm->config.blocking) (void) ncclCommSetAsyncError(comm, error); comm = next; } /* reset everything */ while (!ncclIntruQueueEmpty(asyncJobsPtr)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); *job->abortFlag = 1; if (job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, error); if (job->undo) job->undo(job); if (job->destructor) job->destructor((void*)job); } *groupErrorPtr = ncclSuccess; *groupCommHeadPtr = nullptr; *groupCommPreconnectHeadPtr = nullptr; return; } static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) { int savedDev; ncclResult_t ret = ncclSuccess; bool jobsDone = false; bool errorJobAbortFlag = false; struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr; struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; volatile bool *groupAbortFlag = gjob->abortFlagPtr; CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); if (groupCommPreconnectHeadMain != nullptr) { struct ncclComm* comm = groupCommPreconnectHeadMain; do { struct ncclPreconnectJob* job; NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->base.func = ncclPreconnectFunc; job->base.undo = nullptr; job->base.destructor = free; job->base.state = ncclGroupJobRunning; job->base.abortFlag = comm->abortFlag; job->comm = comm; ncclIntruQueueEnqueue(asyncJobsMain, &job->base); struct ncclComm* next = comm->preconnectNext; comm->preconnectNext = reinterpret_cast(0x1); comm = next; } while (comm != nullptr); } if (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain); do { SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail); job = job->next; } while (job != nullptr); do { jobsDone = true; job = ncclIntruQueueHead(asyncJobsMain); do { ncclGroupJobState_t state = __atomic_load_n(&job->state, __ATOMIC_ACQUIRE); if (state == ncclGroupJobRunning) { jobsDone = false; } else if (state == ncclGroupJobDone) { if (pthread_join(job->thread, nullptr) != 0) { WARN("Error waiting for pthread_join : %s", strerror(errno)); ret = ncclSystemError; } job->state = ncclGroupJobJoined; if (job->result != ncclSuccess && ret == ncclSuccess) { ret = job->result; errorJobAbortFlag = true; } } else { /* safety check */ assert(state == ncclGroupJobJoined); } if (*groupAbortFlag == true || errorJobAbortFlag == true) { *job->abortFlag = 1; if (job->childAbortFlag) *job->childAbortFlag = 1; } job = job->next; } while (job != nullptr); // Let preconnect threads progress. 
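// The parent thread just polls the job states (sleeping briefly below) until every job has
// been joined; an error or an external abort sets the per-job abort flags above so the
// remaining worker threads can bail out early.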
if (jobsDone == false) usleep(1); } while (jobsDone == false); if (ret != ncclSuccess) goto fail; } if (groupCommHeadMain != nullptr) { NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); } /* this atomic must happen before cleanup and setting state of communicators */ __atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE); while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); if (job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } while (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; (void) ncclGroupCommLeave(comm); if (!comm->config.blocking) { (void) ncclCommSetAsyncError(comm, ret); } groupCommHeadMain = next; } *gjob->groupErrorPtr = ncclSuccess; *gjob->groupCommHeadPtr = nullptr; *gjob->groupCommPreconnectHeadPtr = nullptr; CUDACHECK(cudaSetDevice(savedDev)); exit: return ret; fail: groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret); goto exit; } ncclResult_t ncclGroupEndInternal() { ncclResult_t ret = ncclSuccess; if (ncclGroupDepth == 0) { WARN("ncclGroupEnd: not in a group call."); ret = ncclInvalidUsage; goto exit; } if ((--ncclGroupDepth) > 0) goto exit; if ((ret = ncclGroupError) != ncclSuccess) goto fail; if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; ncclGroupJobMain.groupErrorPtr = &ncclGroupError; ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs; ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag; ncclGroupJobMain.doneFlag = false; ncclGroupJobMainPtr = &ncclGroupJobMain; /* make sure ncclGroupBlocking has been set. */ assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) { /* nonblocking group */ if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); do { NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail); job = job->next; } while (job); } if (ncclGroupCommHead) { ncclComm_t comm = ncclGroupCommHead; do { NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); comm = comm->groupNext; } while (comm); } ncclGroupJobMainPtr->base.func = groupLaunch; SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail); ret = ncclInProgress; } else { /* blocking group */ NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail); groupResetJobState(); } } exit: return ret; fail: groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret); groupResetJobState(); goto exit; } void ncclGroupJobAbort() { ncclGroupJobAbortFlag = true; (void) groupJobComplete(ncclGroupJobMainPtr); /* reset group abort flag */ ncclGroupJobAbortFlag = false; } nccl-2.18.3-1/src/include/000077500000000000000000000000001444201535000150725ustar00rootroot00000000000000nccl-2.18.3-1/src/include/align.h000066400000000000000000000022331444201535000163350ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_ALIGN_H_ #define NCCL_ALIGN_H_ #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) #define ROUNDUP(x, y) \ (DIVUP((x), (y))*(y)) #define ALIGN_POWER(x, y) \ ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x)))) #define ALIGN_SIZE(size, align) \ size = ((size + (align) - 1) / (align)) * (align); #if !__CUDA_ARCH__ #ifndef __host__ #define __host__ #endif #ifndef __device__ #define __device__ #endif #endif template __host__ __device__ constexpr Z divUp(X x, Y y) { return (x+y-1)/y; } template __host__ __device__ constexpr Z roundUp(X x, Y y) { return (x+y-1) - (x+y-1)%y; } // assumes second argument is a power of 2 template __host__ __device__ constexpr Z alignUp(X x, int a) { return (x+a-1) & Z(-a); } #endif nccl-2.18.3-1/src/include/alloc.h000066400000000000000000000246271444201535000163500ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_ALLOC_H_ #define NCCL_ALLOC_H_ #include "nccl.h" #include "checks.h" #include "align.h" #include "utils.h" #include "p2p.h" #include #include #include #include uint64_t clockNano(); // from utils.h with which we have a circular dependency template ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); memset(*ptr, 0, nelem*sizeof(T)); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) inline ncclResult_t ncclCudaHostFree(void* ptr) { CUDACHECK(cudaFreeHost(ptr)); return ncclSuccess; } template ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { void* p = malloc(nelem*sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); return ncclSystemError; } //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); memset(p, 0, nelem*sizeof(T)); *ptr = (T*)p; return ncclSuccess; } #define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { if (nelem < oldNelem) return ncclInternalError; if (nelem == oldNelem) return ncclSuccess; T* oldp = *ptr; T* p = (T*)malloc(nelem*sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); return ncclSystemError; } memcpy(p, oldp, oldNelem*sizeof(T)); free(oldp); memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); *ptr = (T*)p; INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); return ncclSuccess; } #if CUDART_VERSION >= 11030 #include #include "cudawrap.h" static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { ncclResult_t result = ncclSuccess; size_t granularity = 0; CUdevice currentDev; CUmemAllocationProp prop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; int flag = 0; CUDACHECK(cudaGetDevice(&cudaDev)); CUCHECK(cuDeviceGet(¤tDev, cudaDev)); prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported prop.location.id = currentDev; // Query device to see if RDMA support is available CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); ALIGN_SIZE(size, granularity); /* Allocate the physical memory on the device */ CUCHECK(cuMemCreate(&handle, size, &prop, 0)); /* Reserve a virtual address range */ CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); /* Map the virtual address range to the physical allocation */ CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = currentDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); if (handlep) *handlep = handle; TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); return result; } static inline ncclResult_t ncclCuMemFree(void *ptr) { if (ptr == NULL) return ncclSuccess; ncclResult_t result = ncclSuccess; CUmemGenericAllocationHandle handle; size_t size = 0; CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); CUCHECK(cuMemRelease(handle)); CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); return result; } #else extern int ncclCuMemEnable(); static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } static inline ncclResult_t ncclCuMemFree(void *ptr) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } #endif template ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), 
result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. cudaStream_t stream; CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); } CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); } CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); return result; } #define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) template ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Need a side stream so as not to interfere with graph capture. 
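// The exchange to cudaStreamCaptureModeRelaxed above (with the matching exchange back
// under the finish: label) lets this helper issue legacy-stream work without invalidating
// a CUDA graph capture that may be in progress elsewhere in the process; the side stream
// created below keeps the copy itself off any captured stream.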
cudaStream_t stream; CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; } template ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; } template ncclResult_t ncclCudaFree(T* ptr) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish); } else { CUDACHECKGOTO(cudaFree(ptr), result, finish); } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return result; } // Allocate memory to be potentially ibv_reg_mr'd. This needs to be // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { size_t page_size = sysconf(_SC_PAGESIZE); void* p; int size_aligned = ROUNDUP(size, page_size); int ret = posix_memalign(&p, page_size, size_aligned); if (ret != 0) return ncclSystemError; memset(p, 0, size); *ptr = p; INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); return ncclSuccess; } #define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) #endif nccl-2.18.3-1/src/include/argcheck.h000066400000000000000000000007371444201535000170210ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_ARGCHECK_H_ #define NCCL_ARGCHECK_H_ #include "core.h" #include "info.h" ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); ncclResult_t ArgsCheck(struct ncclInfo* info); #endif nccl-2.18.3-1/src/include/bootstrap.h000066400000000000000000000032311444201535000172570ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_BOOTSTRAP_H_ #define NCCL_BOOTSTRAP_H_ #include "nccl.h" #include "comm.h" struct ncclBootstrapHandle { uint64_t magic; union ncclSocketAddress addr; }; static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); ncclResult_t bootstrapNetInit(); ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); ncclResult_t bootstrapClose(void* commState); ncclResult_t bootstrapAbort(void* commState); #endif nccl-2.18.3-1/src/include/channel.h000066400000000000000000000041011444201535000166470ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CHANNEL_H_ #define NCCL_CHANNEL_H_ #include "comm.h" ncclResult_t initChannel(struct ncclComm* comm, int channelid); ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; int peerNode = comm->rankToNode[peer]; int peerIndex = comm->rankToLocalRank[peer]; int nsteps = comm->maxLocalRanks; int rankIndex = comm->rankToLocalRank[comm->rank]; int step, delta; if (coll == ncclFuncSend) { step = (nsteps + peerIndex - rankIndex)%nsteps; delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; } else if (coll == ncclFuncRecv) { step = (nsteps + rankIndex - peerIndex)%nsteps; delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; } else { return ncclInternalError; } *channelBase = comm->nNodes > 1 ? 
delta+(step/p2pGroupSize) : step; return ncclSuccess; } static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; return ncclSuccess; } static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { int base; NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); return ncclSuccess; } #endif nccl-2.18.3-1/src/include/checks.h000066400000000000000000000130541444201535000165060ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CHECKS_H_ #define NCCL_CHECKS_H_ #include "debug.h" // Check CUDA RT calls #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ return ncclUnhandledCudaError; \ } \ } while(false) #define CUDACHECKGOTO(cmd, RES, label) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ RES = ncclUnhandledCudaError; \ goto label; \ } \ } while(false) // Report failure but clear error and continue #define CUDACHECKIGNORE(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ (void) cudaGetLastError(); \ } \ } while(false) #include // Check system calls #define SYSCHECK(call, name) do { \ int retval; \ SYSCHECKVAL(call, name, retval); \ } while (false) #define SYSCHECKVAL(call, name, retval) do { \ SYSCHECKSYNC(call, name, retval); \ if (retval == -1) { \ WARN("Call to " name " failed : %s", strerror(errno)); \ return ncclSystemError; \ } \ } while (false) #define SYSCHECKSYNC(call, name, retval) do { \ retval = call; \ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ } else { \ break; \ } \ } while(true) #define SYSCHECKGOTO(statement, RES, label) do { \ if ((statement) == -1) { \ /* Print the back trace*/ \ RES = ncclSystemError; \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ } while (0); #define NEQCHECK(statement, value) do { \ if ((statement) != value) { \ /* Print the back trace*/ \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ } \ } while (0); #define NEQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) != value) { \ /* Print the back trace*/ \ RES = ncclSystemError; \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ goto label; \ } \ } while (0); #define EQCHECK(statement, value) do { \ if ((statement) == value) { \ /* Print the back trace*/ \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ return ncclSystemError; \ } \ } while (0); #define EQCHECKGOTO(statement, value, RES, label) do { \ if ((statement) == value) { \ /* Print the back trace*/ \ RES = ncclSystemError; \ INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, 
strerror(errno)); \ goto label; \ } \ } while (0); // Propagate errors up #define NCCLCHECK(call) do { \ ncclResult_t RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return RES; \ } \ } while (0); #define NCCLCHECKGOTO(call, RES, label) do { \ RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ /* Print the back trace*/ \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ } while (0); #define NCCLWAIT(call, cond, abortFlagPtr) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ return ncclInternalError; \ } \ if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ } while (!(cond)); #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ RES = call; \ if (RES != ncclSuccess && RES != ncclInProgress) { \ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ goto label; \ } \ if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ } while (!(cond)); #define NCCLCHECKTHREAD(a, args) do { \ if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ return args; \ } \ } while(0) #define CUDACHECKTHREAD(a) do { \ if ((a) != cudaSuccess) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ args->ret = ncclUnhandledCudaError; \ return args; \ } \ } while(0) #endif nccl-2.18.3-1/src/include/coll_net.h000066400000000000000000000064441444201535000170520ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef COLL_NET_H_ #define COLL_NET_H_ #include "nccl.h" #include "nccl_net.h" typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; // Translation to external API static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } /* DMA-BUF support */ static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } #endif nccl-2.18.3-1/src/include/collectives.h000066400000000000000000000131621444201535000175620ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ enum ncclDevRedOp_t { ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin, ncclDevPreMulSum, ncclDevSumPostDiv, ncclNumDevRedOps }; struct ncclDevRedOpFull { ncclDevRedOp_t op; bool scalarArgIsPtr; uint64_t scalarArg; }; #define FUNC_INDEX_P2P 0 #define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) #define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \ ncclFunction_##func##_##algo##_##proto##_##devredop##_##type #define NCCL_ONERANK_REDUCE_NAME(devredop, type) \ ncclFunction_OneRankReduce_##devredop##_##type #define NCCL_KERN_NAME(func, algo, proto, devredop, type) \ ncclKernel_##func##_##algo##_##proto##_##devredop##_##type #define NCCL_IMPL_NAME(func, algo, proto) \ nccl##func##algo##proto /* Declare all collective operations */ #define DECL5(func, algo, proto, devredop, type) \ extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ #define SINGLE_ARG(...) __VA_ARGS__ #define CONCAT(a,b) a##b #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f)) #define MACRO_IF_0(t, f) f #define MACRO_IF_1(t, f) t #define DECL4(func, algo, devredop, type, undef) \ MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \ MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \ MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type)) #define DECL3(func, devredop, type, undef) \ DECL4(func, RING, devredop, type, undef) \ DECL4(func, TREE, devredop, type, undef) \ DECL4(func, COLLNET_DIRECT, devredop, type, undef) \ DECL4(func, COLLNET_CHAIN, devredop, type, undef) \ DECL4(func, NVLS, devredop, type, undef) \ DECL4(func, NVLS_TREE, devredop, type, undef) #if defined(__CUDA_BF16_TYPES_EXIST__) #define DECL2(func, devredop, undefForFloat) \ DECL3(func, devredop, int8_t, /*undef=*/0) \ DECL3(func, devredop, uint8_t, /*undef=*/0) \ DECL3(func, devredop, int32_t, /*undef=*/0) \ DECL3(func, devredop, uint32_t, /*undef=*/0) \ DECL3(func, devredop, int64_t, /*undef=*/0) \ DECL3(func, devredop, uint64_t, /*undef=*/0) \ DECL3(func, devredop, half, /*undef=*/undefForFloat) \ DECL3(func, devredop, float, /*undef=*/undefForFloat) \ DECL3(func, devredop, double, /*undef=*/undefForFloat) \ DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat) #else #define DECL2(func, devredop, undefForFloat) \ DECL3(func, devredop, int8_t, /*undef=*/0) \ DECL3(func, devredop, uint8_t, /*undef=*/0) \ DECL3(func, devredop, int32_t, /*undef=*/0) \ DECL3(func, devredop, uint32_t, /*undef=*/0) \ DECL3(func, devredop, int64_t, /*undef=*/0) \ DECL3(func, devredop, uint64_t, /*undef=*/0) \ DECL3(func, devredop, half, /*undef=*/undefForFloat) \ DECL3(func, devredop, float, /*undef=*/undefForFloat) \ DECL3(func, devredop, double, /*undef=*/undefForFloat) #endif #define DECL(func) \ DECL2(func, Sum, /*undefForFloat=*/0) \ DECL2(func, Prod, /*undefForFloat=*/0) \ DECL2(func, Min, /*undefForFloat=*/0) \ DECL2(func, Max, /*undefForFloat=*/0) \ DECL2(func, PreMulSum, /*undefForFloat=*/0) \ DECL2(func, SumPostDiv, /*undefForFloat=*/1) DECL2(Broadcast, Sum, /*undefForFloat=*/0) DECL(Reduce) 
DECL2(AllGather, Sum, /*undefForFloat=*/0) DECL(ReduceScatter) DECL(AllReduce) DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)(); extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)(); extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)(); extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)(); extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)(); extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)(); extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)(); #if defined(__CUDA_BF16_TYPES_EXIST__) extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)(); #endif extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)(); extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)(); // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) #define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) #define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) #define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) #define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) #define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) #define BROADCAST_SLICESTEPS 1 #define BROADCAST_CHUNKSTEPS 1 #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above // We can't use the enum identifiers like ncclSum, ncclFloat, etc since this // macro will be used in preprocessor conditionals where enums have no meaning. #define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \ (((type==2 || type==3) && (red==0 || red==2 || red==3)) || \ ((type==4 || type==5) && (red==0 || red==2 || red==3)) || \ ((type==6 || type==9) && (red==0 || red==2 || red==3)) || \ (type==7 && red==0) || \ (type==8 && red==0)) #endif nccl-2.18.3-1/src/include/comm.h000066400000000000000000000326401444201535000162030ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_COMM_H_ #define NCCL_COMM_H_ #include "transport.h" #include "p2p.h" #include "collectives.h" #include "proxy.h" #include "strongstream.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { void *func; dim3 gridDim; dim3 blockDim; void **args; size_t sharedMem; cudaStream_t stream; }; #endif #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 #define CUDA_IPC_MIN 2097152UL // Channels / LL tuning #define NCCL_LL_THREAD_THRESHOLD 8 #define NCCL_LL128_THREAD_THRESHOLD 8 #define NCCL_SIMPLE_THREAD_THRESHOLD 64 struct ncclSendMem { union { struct { uint64_t head; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; void* ptrExchange; uint64_t redOpArgExchange[2]; char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; int offsFifo[NCCL_STEPS]; }; char pad3[MEM_ALIGN]; }; }; struct ncclRecvMem { union { struct { uint64_t tail; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; int sizesFifo[NCCL_STEPS]; int offsFifo[NCCL_STEPS]; int flush; // For GDRCopy-based flush }; char pad4[MEM_ALIGN]; }; }; enum helperThreadState {ThreadStart, ThreadStop}; #define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) struct ncclGraphHelperResources { ncclComm* comm; pthread_mutex_t threadLock; pthread_cond_t threadCond; enum helperThreadState threadState; void* ipcBases[NCCL_IPC_POOL_SIZE]; int ipcTail; int ipcHead; }; struct ncclUserRedOp { int freeNext; // -1=allocated, otherwise index of next free entry in array ncclDataType_t datatype; ncclDevRedOpFull opFull; }; struct ncclNodeRanks { int localRanks; int* localRankToRank; }; struct ncclDestructor { struct ncclDestructor* next; void* obj; ncclResult_t(*fn)(struct ncclDestructor* me); }; struct ncclCommCallback { struct ncclCommCallback* next; ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); }; struct ncclSharedResources { int refCount; struct ncclComm* owner; /* comm which creates this shared res. */ struct ncclChannelPeer* peers[MAXCHANNELS]; struct ncclDevChannelPeer* devPeers[MAXCHANNELS]; /* P2P operation counter, one per channel */ uint64_t p2pOpCount[MAXCHANNELS]; /* Collective operation counter */ uint64_t collOpCount; int tpNRanks; int tpNLocalRanks; int tpNChannels; int tpP2pNChannels; int tpP2pChunkSize; uint64_t magic; // top parent rank to localRank translation table int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; /* proxy related shared res */ struct ncclProxyState* proxyState; }; struct ncclChannel { struct ncclChannelPeer** peers; struct ncclDevChannelPeer** devPeers; struct ncclRing ring; int* devRingUserRanks; struct ncclTree tree; struct ncclTree collnetChain; struct ncclDirect collnetDirect; struct ncclNvls nvls; int id; // index of this channel uint32_t workFifoSent; // last used work index+1 /* comm split sharable resources */ struct ncclChannelPeer* collnetPeers; struct ncclDevChannelPeer* collnetDevPeers; struct ncclChannelPeer* nvlsPeers; struct ncclDevChannelPeer* nvlsDevPeers; }; struct ncclWorkList { struct ncclWorkList* next; struct ncclWork work; }; struct ncclPointerList { struct ncclPointerList* next; void *ptr; }; struct ncclKernelPlan { // A kernel plan is also a callback that reclaims itself. Hence this must // be the first member. 
struct ncclCommCallback reclaimer; struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup struct ncclComm* comm; struct ncclKernelPlan* next; bool persistent; // aka captured in a graph bool kernelSpecialized; void *kernelFn; int channelUbound; // only channels c < channelUbound are present int channelCount; // number of channels present uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) bool hasProxyOps; // does any channel have a non-empty proxyOpQueue int threadPerBlock; // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() struct ncclWork* workHead; int collOpCount; // zero based for this plan struct ncclIntruQueue ipcMemQueue; struct Channel { int nWork; union { int nWorkElem; // used for coll and reg coll int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 }; size_t collBytes; struct ncclIntruQueue workQueue; struct ncclIntruQueue proxyOpQueue; } channels[MAXCHANNELS]; }; struct ncclComm { struct ncclMemoryStack memPermanent, memScoped; // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; int* topParentLocalRanks; struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; ncclNet_t* ncclNet; ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup uint64_t* connectSend; uint64_t* connectRecv; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. uint64_t commHash; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index int nvmlDev; // my nvml device index int compCap; // compute capability of the GPU int minCompCap, maxCompCap; // min/max compute capability in the communicator int64_t busId; // my PCI bus ID in int format cpu_set_t cpuAffinity; // CPU affinity of the GPU int cudaArch; // matches __CUDA_ARCH__ of device int node; int nNodes; int localRank; int localRanks; int maxLocalRanks; int* rankToNode; int* rankToLocalRank; int* localRankToRank; // localRanks and localRanktoRank for all nodes struct ncclNodeRanks* nodeRanks; bool checkPointers; bool dmaBufSupport; // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; // Channels for collectives int nChannels; int nvlsChannels; int collNetChannels; // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; int p2pChannels[MAXCHANNELS]; // Should this comm allocate LL buffers for network P2P connections? bool allocP2pNetLLBuffers; // Buffer sizes int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; /* This attribute can indicate the states of communicators and return code of * asynchronous NCCL operations. */ ncclResult_t asyncResult; // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; volatile uint32_t *childAbortFlag; uint32_t *abortFlagRefCount; // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm // Operation pool. 
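// The fields below form the host->device work queue: workFifoHeap is a ring of
// ncclWork slots (workFifoDepth entries, a power of two), devWorkFifoHeap is the
// device-visible view of that ring, workFifoSent counts slots handed to the device,
// and workFifoAckdMin advances as channels acknowledge processed slots through the
// per-channel workFifoDone counters.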
int workFifoDepth; // size of workFifoHeap[], power of 2 struct ncclWork* workFifoHeap; struct ncclWork* devWorkFifoHeap; void* workFifoHeapGdrHandle; // Work completion notificaion uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. // Intra-process sync struct ncclComm* intraComm0; // leader of intra-process comms (self possible) struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head int intraRank; int intraRanks; uint32_t intraBarrierPhase; char intraPad1[64 - sizeof(uint64_t)]; uint64_t intraBarrierCounter; // only used if this is intraComm0 char intraPad2[64 - sizeof(uint64_t)]; uint64_t intraBarrierGate; // only used if this is intraComm0 struct ncclProxyState* proxyState; int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet int collNetSupport; uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; int intraHighestTransportType; int* collNetHeads; int collNetHeadsNum; /* sharable collNet proxy progress resource. */ struct ncclCollNetSharedRes* collNetSharedRes; // NVLink SHARP (NVLS) support int nvlsSupport; /* sharable NVLS resource. */ struct ncclNvlsSharedRes* nvlsResources; size_t channelSize; // User requested work size (bytes) for channel partitions // pools backed by comm->memPermanent struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; struct ncclMemoryPool memPool_ncclPointerList; // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; int persistentRefs; // number of persistent plan-lists capturing this comm struct ncclTasks tasks; // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; ncclUserRedOp *userRedOps; // Queue of things for the main thread to do struct ncclIntruQueueMpsc callbackQueue; // List of kernel plans built form tasks. struct ncclIntruQueue planQueue; // First of the unlaunched kernels in `planQueue` struct ncclKernelPlan* unlaunchedPlansHead; ncclConfig_t config; // initState is to more conveniently reclaim resources when errors happen. 
ncclResult_t initState; // flag to indicate if ncclCommFinalize() is called bool finalizeCalled; // shared structures for finalization int finalizeRankCnt; }; enum ncclLaunchMode { ncclLaunchModeInvalid=0, ncclLaunchModeParallel, ncclLaunchModeGroup }; extern enum ncclLaunchMode ncclParamLaunchMode; void ncclCommPushFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { ncclResult_t result = ncclSuccess; struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); while (cb != nullptr) { struct ncclCommCallback* next = cb->next; ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb if (res1 != ncclSuccess) result = res1; cb = next; } NCCLCHECK(result); return ncclSuccess; } inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { int phase = comm->intraBarrierPhase; if (comm->intraRanks == 1) { // Release everyone (just me). comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); } else { struct ncclComm* comm0 = comm->intraComm0; uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); if (uint32_t(count) == uint32_t(comm->intraRanks)) { // Reset. __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); // Release everyone. __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); } } } // returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { struct ncclComm* comm0 = comm->intraComm0; comm->intraBarrierPhase ^= 1; uint32_t phase = comm->intraBarrierPhase; uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); if ((gate & 1) != phase) { uint64_t t0 = clockNano(); do { // Spin vigorously for first 5us. if (clockNano()-t0 >= 5*1000) sched_yield(); gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); } while ((gate & 1) != phase); } if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); return gate>>32; } // Scrambles the bits of non-builtin values of ncclRedOp_t according to the // communicator memory address. Used to catch bugs so that integer handles // associated with this communicator won't collide with handles of other // communicatrs. This function is its own inverse. static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { // Preserve the built-in values. if(int(op) < int(ncclNumOps)) return op; uint64_t h = reinterpret_cast(comm); h ^= h >> 32; h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant h >>= 32; // h is now an excellent 32-bit hash of the comm pointer h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 int op1 = int(h) ^ int(op); // Since builtin values are preserved, we also have to preserve their preimage. return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1); } ncclResult_t ncclCommEnsureReady(ncclComm_t comm); ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); #endif nccl-2.18.3-1/src/include/core.h000066400000000000000000000030731444201535000161760ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CORE_H_ #define NCCL_CORE_H_ #include #include #include #include #include // For std::min/std::max #include "nccl.h" #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ __attribute__ ((visibility("default"))) \ __attribute__ ((alias(#func))) \ ret p##func (args); \ extern "C" \ __attribute__ ((visibility("default"))) \ __attribute__ ((weak)) \ ret func(args) #else #define NCCL_API(ret, func, args...) \ extern "C" \ __attribute__ ((visibility("default"))) \ ret func(args) #endif // end PROFAPI static __inline__ int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: case ncclUint8: return 1; case ncclFloat16: #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: #endif return 2; case ncclInt32: case ncclUint32: case ncclFloat32: return 4; case ncclInt64: case ncclUint64: case ncclFloat64: return 8; default: return -1; } } #include "debug.h" #include "checks.h" #include "cudawrap.h" #include "alloc.h" #include "utils.h" #include "param.h" #include "nvtx.h" #endif // end include guard nccl-2.18.3-1/src/include/cpuset.h000066400000000000000000000027441444201535000165550ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CPUSET_H_ #define NCCL_CPUSET_H_ // Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t static int hexToInt(char c) { int v = c - '0'; if (v < 0) return -1; if (v > 9) v = 10 + c - 'a'; if ((v < 0) || (v > 15)) return -1; return v; } #define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { uint32_t cpumasks[CPU_SET_N_U32]; int m = CPU_SET_N_U32-1; cpumasks[m] = 0; for (int o=0; o=0; o--) { if (c == 0 && m8[o] == 0) continue; sprintf(str+c, "%02x", m8[o]); c+=2; if (o && o%4 == 0) { sprintf(str+c, ","); c++; } } str[c] = '\0'; return ncclSuccess; } #endif nccl-2.18.3-1/src/include/cudawrap.h000066400000000000000000000111451444201535000170530ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CUDAWRAP_H_ #define NCCL_CUDAWRAP_H_ #include #include #include "checks.h" // Is cuMem API usage enabled extern int ncclCuMemEnable(); #if CUDART_VERSION >= 11030 #include #else typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); #endif #define CUPFN(symbol) pfn_##symbol // Check CUDA PFN driver calls #define CUCHECK(cmd) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ WARN("Cuda failure '%s'", errStr); \ return ncclUnhandledCudaError; \ } \ } while(false) #define CUCHECKGOTO(cmd, res, label) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ WARN("Cuda failure '%s'", errStr); \ res = ncclUnhandledCudaError; \ goto label; \ } \ } while(false) // Report failure but clear error and continue #define CUCHECKIGNORE(cmd) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ const char *errStr; \ (void) pfn_cuGetErrorString(err, &errStr); \ INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \ } \ } while(false) #define CUCHECKTHREAD(cmd, args) do { \ CUresult err = pfn_##cmd; \ if (err != CUDA_SUCCESS) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ args->ret = ncclUnhandledCudaError; \ return args; \ } \ } while(0) #define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); // cuMem API support DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); #if CUDA_VERSION >= 11070 DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); #endif #endif /* CUDA Driver functions loaded with dlsym() */ 
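/* The three entries below are resolved with dlsym() because they are needed before
   cuGetProcAddress() itself is available; the versioned symbols above are then fetched
   through cuGetProcAddress(). Call sites go through the pfn_ pointers via the CUPFN()/
   CUCHECK() macros. Sketch of a hypothetical call site, assuming ncclCudaLibraryInit()
   has already loaded the symbols:

     int driverVersion;
     CUCHECK(cuDriverGetVersion(&driverVersion));  // expands to pfn_cuDriverGetVersion(&driverVersion)
*/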
DECLARE_CUDA_PFN_EXTERN(cuInit, 2000); DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020); DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030); ncclResult_t ncclCudaLibraryInit(void); extern int ncclCudaDriverVersionCache; extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() inline ncclResult_t ncclCudaDriverVersion(int* driver) { int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); if (version == -1) { CUDACHECK(cudaDriverGetVersion(&version)); __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); } *driver = version; return ncclSuccess; } #endif nccl-2.18.3-1/src/include/debug.h000066400000000000000000000030441444201535000163320ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ #include "nccl_net.h" #include #include #include #include #include #include // Conform to pthread and NVTX standard #define NCCL_THREAD_NAMELEN 16 extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); // Let code temporarily downgrade WARN into INFO extern thread_local int ncclDebugNoWarn; extern char ncclLastError[]; #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) #ifdef ENABLE_TRACE #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) extern std::chrono::steady_clock::time_point ncclEpoch; #else #define TRACE(...) #endif void ncclSetThreadName(pthread_t thread, const char *fmt, ...); #endif nccl-2.18.3-1/src/include/devcomm.h000066400000000000000000000277131444201535000167070ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_DEVICE_H_ #define NCCL_DEVICE_H_ #include "nccl.h" #include "align.h" #include #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 #define NCCL_ALGO_COLLNET_DIRECT 2 #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_LL 0 #define NCCL_PROTO_LL128 1 #define NCCL_PROTO_SIMPLE 2 extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 union ncclLLFifoLine { /* Flags have to be *after* data, because otherwise, an incomplete receive from the network may receive the flag but not the data. Note this is assuming that either we receive contiguous chunks of data (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ struct { uint32_t data1; uint32_t flag1; uint32_t data2; uint32_t flag2; }; uint64_t v[2]; int4 i4; }; #define WARP_SIZE 32 #define MAXCHANNELS 32 #define NCCL_MAX_NTHREADS 640 #define NCCL_SIMPLE_MAX_NTHREADS 512 #define NCCL_LL_MAX_NTHREADS 512 #define NCCL_LL_LINES_PER_THREAD 8 #ifdef TEST_LL_CLEANUP #define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup #define NCCL_LL_FLAG_MAX 0x100 #define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) #else #define NCCL_LL_CLEAN_MASK 0x7ffffff8 #define NCCL_LL_FLAG(a) ((uint32_t)(a)) #endif // Make sure the clean mask will last for at least NCCL_NSTEPS static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); #define NCCL_LL128_LINESIZE 128 #define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) #define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) #define NCCL_LL128_MAX_NTHREADS 640 #define NCCL_LL128_ELEMS_PER_THREAD 120 #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) #define NCCL_DIRECT_WRITE 0x01 #define NCCL_DIRECT_READ 0x02 #define NCCL_DIRECT_NIC 0x04 #define NCCL_IPC_WRITE 0x08 #define NCCL_IPC_READ 0x10 #define NCCL_NVLS_MIN_POLL 0x20 struct ncclConnInfo { // Regular comm mechanism char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv int flags; // Direct communication / other flags int shared; // Buffers are shared void **ptrExchange; // Pointer exchange for direct communication uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case int *sizesFifo; // Sizes fifo from GPU to proxy int *offsFifo; // Buffer fifo from proxy to GPU uint64_t step; // Keep where we are uint64_t llLastCleaning; }; struct ncclProxyConnector { int tpRank; int tpLocalRank; int sameProcess; struct ncclProxyConnection* connection; }; struct ncclConnector { int connected; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; }; struct ncclRing { // Shortcuts for userRanks[1] and userRanks[n-1] int prev; int next; // Maps an internal 
nccl index to user-specified rank order. This is necessary // since we need to know how the user expects data to be ordered across // devices. Ordered from current device. int* userRanks; int index; // This rank's index in the ring }; // The root of each tree only has one node down (+1 intra-node). #define NCCL_MAX_TREE_ARITY_TOP 2 // Nodes inside the binary tree can have to two nodes down (+1 intra-node). #define NCCL_MAX_TREE_ARITY 3 struct ncclTree { int depth; int up; int down[NCCL_MAX_TREE_ARITY]; }; #define NCCL_MAX_DIRECT_ARITY 7 struct ncclDirect { int depth; int out; int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads int up[NCCL_MAX_DIRECT_ARITY]; int down[NCCL_MAX_DIRECT_ARITY]; }; #define NCCL_MAX_NVLS_ARITY 8 #define NCCL_MAX_NVLS_TREE_ARITY 3 struct ncclNvls { int out; int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) int up[NCCL_MAX_NVLS_ARITY]; int down; int treeUp; int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; int node; int nNodes; }; #define NCCL_MAX_CONNS 2 struct ncclChannelPeer { struct ncclConnector send[NCCL_MAX_CONNS]; struct ncclConnector recv[NCCL_MAX_CONNS]; int refCount; }; struct ncclDevComm; /* ncclWork is to be a power of two, currently 8x64 bytes, */ /* to make sure reads to host from the CUDA kernel are aligned. */ /* Make sure to adjust padding at the end of ncclWorkElem. */ #define NCCL_WORK_SIZE 512 enum ncclWorkType : uint8_t { ncclWorkTypeUnused=0, ncclWorkTypeColl=1, ncclWorkTypeP2p=2, ncclWorkTypeRegColl=3 }; enum ncclWorkP2PType : uint8_t { ncclWorkP2pTypeUnused=0, ncclWorkP2pTypeSend, ncclWorkP2pTypeRecv }; struct ncclWorkHeader { union { int32_t workNext; // when isLast=0: Offset from kernel argument workHead uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. }; uint16_t funcIndex; uint8_t isLast:1; // last work for this kernel uint8_t inFifo:1; // is this work in the fifo enum ncclWorkType type; }; struct ncclWorkElem { union { uint8_t flagBits; struct { uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; }; }; uint8_t nWarps; uint8_t direct; const void * sendbuff; void * recvbuff; size_t count; size_t lastChunkSize; uint32_t root; uint8_t bid; uint8_t nChannels; uint64_t redOpArg; }; #define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); struct ncclWorkElemP2p { int peer : 30; int proto : 2; enum ncclWorkP2PType p2pType; uint8_t nWarps; uint8_t warpStart; uint8_t ngroups; // Important not to use any fields with greater than 4-byte alignment since // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if // there were 8-byte fields. 
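  // Host-side packing sketch for the split fields documented below:
  //   buffHi32 = (uint32_t)((uintptr_t)buff >> 32); buffLo32 = (uint32_t)(uintptr_t)buff;
  // (and likewise for count); the device recombines the two halves as noted.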
//void* buff; uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; //size_t count; uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; int chunkSize; }; static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); #define NCCL_MAX_WORK_ELEMENTS_P2P 16 struct ncclWorkElemReg { struct ncclWorkElem elem; void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; #define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); // Number of named barriers supported by CUDA #define NCCL_MAX_GROUPS 16 struct ncclWork { struct ncclWorkHeader header; union { char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; }; }; static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); struct ncclDevChannelPeer { // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo // instead of the full ncclConnector. struct ncclConnInfo send[NCCL_MAX_CONNS]; struct ncclConnInfo recv[NCCL_MAX_CONNS]; }; struct alignas(16) ncclDevChannel { struct ncclDevChannelPeer** peers; struct ncclRing ring; struct ncclTree tree; struct ncclTree collnetChain; struct ncclDirect collnetDirect; struct ncclNvls nvls; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed }; struct ncclDevComm { int rank; int nRanks; int buffSizes[NCCL_NUM_PROTOCOLS]; // Operation list for aggregation int workFifoDepth; struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory // Flag to ask NCCL kernels to abort volatile uint32_t* abortFlag; // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { struct ncclDevComm comm; struct ncclDevChannel channels[MAXCHANNELS]; }; #ifdef __CUDA_ARCH__ #define NCCL_CUDA_ARCH __CUDA_ARCH__ #else #define NCCL_CUDA_ARCH 0 #endif template <typename T> __host__ __device__ constexpr T min_constexpr(T a) { return a; } template <typename T, typename ...Ts> __host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) { return min_constexpr((a < b ? a : b), c...); } template <typename T> __host__ __device__ constexpr T max_constexpr(T a) { return a; } template <typename T, typename ...Ts> __host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { return max_constexpr((a > b ? a : b), c...); } // Calculate the unroll factor given: // * bytePerPack: number of bytes accessed per instruction // * insns: max permissible unroll value // * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack) __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) { return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack); } // Note that all unroll value logic should depend on a given cudaArch argument // and not __CUDA_ARCH__ since these need to be host-side executable where the // arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device // side code can elide passing the arch for brevity.
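// Host-side usage sketch (hypothetical call site): pass the communicator's recorded
// arch explicitly, e.g. int unroll = ncclCollUnroll(comm->cudaArch); device code can
// simply call ncclCollUnroll() and let NCCL_CUDA_ARCH supply the argument.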
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. return cudaArch >= 800 ? 8 : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } __host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) { return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch)); } // The amount of dynamic shmem per warp __host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { return (max_constexpr( /*LL */0, /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t), /*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16, // NVLS needs an extra 16B to read unaligned data. /*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16 ) + 15) & -16; // pad to 16 bytes } // The amount of dynamic shmem per block __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); } #endif nccl-2.18.3-1/src/include/enqueue.h000066400000000000000000000021321444201535000167100ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_ENQUEUE_H_ #define NCCL_ENQUEUE_H_ #include "comm.h" #include "group.h" #include "collectives.h" #include "utils.h" #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchFinish(struct ncclComm* comm); #endif // End include guard nccl-2.18.3-1/src/include/gdrwrap.h000066400000000000000000000210321444201535000167070ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_GDRWRAP_H_ #define NCCL_GDRWRAP_H_ #include "nccl.h" #include // for standard [u]intX_t types #include #include // These can be used if the GDR library isn't thread safe #include extern pthread_mutex_t gdrLock; #define GDRLOCK() pthread_mutex_lock(&gdrLock) #define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) #define GDRLOCKCALL(cmd, ret) do { \ GDRLOCK(); \ ret = cmd; \ GDRUNLOCK(); \ } while(false) #define GDRCHECK(cmd) do { \ int e; \ /* GDRLOCKCALL(cmd, e); */ \ e = cmd; \ if( e != 0 ) { \ WARN("GDRCOPY failure %d", e); \ return ncclSystemError; \ } \ } while(false) // This is required as the GDR memory is mapped WC #if !defined(__NVCC__) #if defined(__PPC__) static inline void wc_store_fence(void) { asm volatile("sync") ; } #elif defined(__x86_64__) #include static inline void wc_store_fence(void) { _mm_sfence(); } #elif defined(__aarch64__) #ifdef __cplusplus #include static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); } #else #include static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); } #endif #endif #endif //#define GDR_DIRECT 1 #ifdef GDR_DIRECT // Call the GDR API library code directly rather than via // dlopen() wrappers #include static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; } static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; } static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; } static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle)); return ncclSuccess; } static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { GDRCHECK(gdr_unpin_buffer(g, handle)); return ncclSuccess; } static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { GDRCHECK(gdr_get_info(g, handle, info)); return ncclSuccess; } static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size)); return ncclSuccess; } static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size)); return ncclSuccess; } static void wrap_gdr_runtime_get_version(int *major, int *minor) { gdr_runtime_get_version(major, minor); return ncclSuccess; } static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { gdr_driver_get_version(g, major, minor); return ncclSuccess; } static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size)); return ncclSuccess; } static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size)); return ncclSuccess; } #else // Dynamically handle dependency the GDR API library /* Extracted from gdrapi.h (v2.1 Nov 2020) */ #define GPU_PAGE_SHIFT 16 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) #define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1) #define GPU_PAGE_MASK (~GPU_PAGE_OFFSET) struct gdr; typedef struct gdr *gdr_t; typedef struct gdr_mh_s { unsigned long h; } gdr_mh_t; struct gdr_info { uint64_t va; uint64_t mapped_size; uint32_t page_size; 
uint64_t tm_cycles; uint32_t cycles_per_ms; unsigned mapped:1; unsigned wc_mapping:1; }; typedef struct gdr_info gdr_info_t; /* End of gdrapi.h */ ncclResult_t wrap_gdr_symbols(void); gdr_t wrap_gdr_open(void); ncclResult_t wrap_gdr_close(gdr_t g); ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle); ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info); ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size); ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size); ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor); ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor); ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); #endif // GDR_DIRECT // Global GDR driver handle extern gdr_t ncclGdrCopy; #include "alloc.h" typedef struct gdr_mem_desc { void *gdrDevMem; void *gdrMap; size_t gdrOffset; size_t gdrMapSize; gdr_mh_t gdrMh; } gdr_mem_desc_t; static gdr_t ncclGdrInit() { int libMajor, libMinor, drvMajor, drvMinor; gdr_t handle = NULL; // Dynamically load the GDRAPI library symbols if (wrap_gdr_symbols() == ncclSuccess) { handle = wrap_gdr_open(); if (handle != NULL) { ncclResult_t res; // Query the version of libgdrapi NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error); // Query the version of gdrdrv driver NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error); // Only support GDRAPI 2.1 and later if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) { goto error; } else INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor); } } return handle; error: if (handle != NULL) (void) wrap_gdr_close(handle); return NULL; } template static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { gdr_info_t info; size_t mapSize; gdr_mh_t mh; char *devMem; void *gdrMap; mapSize = sizeof(T)*nelem; // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1)); uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; size_t align = alignedAddr - (uint64_t)devMem; //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap); NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info)); // Will offset ever be non zero ? 
ssize_t off = info.va - alignedAddr; gdr_mem_desc_t* md; NCCLCHECK(ncclCalloc(&md, 1)); md->gdrDevMem = devMem; md->gdrMap = gdrMap; md->gdrMapSize = mapSize; md->gdrOffset = off+align; md->gdrMh = mh; *gdrHandle = md; *ptr = (T *)((char *)gdrMap+off); if (devPtr) *devPtr = (T *)(devMem+off+align); TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); return ncclSuccess; } template static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); return ncclSuccess; } static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); NCCLCHECK(ncclCudaFree(md->gdrDevMem)); free(md); return ncclSuccess; } #endif // End include guard nccl-2.18.3-1/src/include/graph.h000066400000000000000000000120721444201535000163460ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_GRAPH_H_ #define NCCL_GRAPH_H_ #include "nccl.h" #include "devcomm.h" #include #include #include #include #include ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); struct ncclTopoSystem; // Build the topology ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); #define NCCL_TOPO_CPU_ARCH_X86 1 #define NCCL_TOPO_CPU_ARCH_POWER 2 #define NCCL_TOPO_CPU_ARCH_ARM 3 #define NCCL_TOPO_CPU_VENDOR_INTEL 1 #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_TYPE_BDW 1 #define 
NCCL_TOPO_CPU_TYPE_SKL 2 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id); ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex); #define NCCL_TOPO_MAX_NODES 256 // Init search. Needs to be done before calling ncclTopoCompute ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); #define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) #define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) #define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU #define NCCL_TOPO_PATTERN_RING 4 // Ring #define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree struct ncclTopoGraph { // Input / output int id; // ring : 0, tree : 1, collnet : 2 int pattern; int crossNic; int collNet; int minChannels; int maxChannels; // Output int nChannels; float bwIntra; float bwInter; float latencyInter; int typeIntra; int typeInter; int sameChannels; int nHops; int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; int inter[MAXCHANNELS*2]; }; ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); struct ncclTopoRanks { int ringRecv[MAXCHANNELS]; int ringSend[MAXCHANNELS]; int ringPrev[MAXCHANNELS]; int ringNext[MAXCHANNELS]; int treeToParent[MAXCHANNELS]; int treeToChild0[MAXCHANNELS]; int treeToChild1[MAXCHANNELS]; int nvlsHeads[MAXCHANNELS]; }; ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); #include "info.h" ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time); #endif nccl-2.18.3-1/src/include/group.h000066400000000000000000000111141444201535000163750ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_GROUP_H_ #define NCCL_GROUP_H_ #include "nccl.h" #include "comm.h" ncclResult_t ncclGroupErrCheck(ncclResult_t ret); void ncclGroupCommJoin(struct ncclComm* comm); void ncclGroupCommPreconnect(struct ncclComm* comm); ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); void ncclGroupJobAbort(); typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); typedef enum ncclGroupJobState { ncclGroupJobRunning = 0, ncclGroupJobDone = 1, ncclGroupJobJoined = 2, } ncclGroupJobState_t; struct ncclAsyncJob { struct ncclAsyncJob* next; pthread_t thread; ncclResult_t result; ncclResult_t(*func)(struct ncclAsyncJob*); void(*undo)(struct ncclAsyncJob*); void(*destructor)(void*); ncclGroupJobState_t state; volatile uint32_t *abortFlag; /* point to comm abortFlag */ volatile uint32_t *childAbortFlag; /* point to child abortFlag */ ncclComm_t comm; }; ncclResult_t ncclAsyncLaunch( struct ncclAsyncJob* job, ncclResult_t(*func)(struct ncclAsyncJob*), void(*undo)(struct ncclAsyncJob*), void(*destructor)(void*), ncclComm_t comm ); struct ncclGroupJob { struct ncclAsyncJob base; struct ncclComm **groupCommHeadPtr; struct ncclComm **groupCommPreconnectHeadPtr; ncclResult_t *groupErrorPtr; volatile bool *abortFlagPtr; struct ncclIntruQueue *asyncJobsPtr; bool doneFlag; }; ncclResult_t ncclGroupStartInternal(); ncclResult_t ncclGroupEndInternal(); ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); //////////////////////////////////////////////////////////////////////////////// extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting extern __thread ncclResult_t ncclGroupError; extern __thread struct ncclComm* ncclGroupCommHead; extern __thread struct ncclComm* ncclGroupCommPreconnectHead; extern __thread int ncclGroupBlocking; extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; extern __thread struct ncclGroupJob ncclGroupJobMain; static inline void groupResetJobState() { ncclGroupBlocking = -1; ncclGroupJobMainPtr = NULL; memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); return; } static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { ncclResult_t ret = ncclSuccess; if (job) { ret = ncclAsyncJobComplete(&job->base); groupResetJobState(); } return ret; } inline ncclResult_t ncclGroupStartInternal() { /* if previous group launch does not complete, don't launch this one. */ if (ncclGroupJobMainPtr != NULL) { if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) { return ncclInvalidUsage; } else { NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr)); } } ncclGroupDepth++; return ncclSuccess; } inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { if (ncclGroupDepth > 0) { if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; } return ret; } // Add comm to this thread's group inline void ncclGroupCommJoin(struct ncclComm* comm) { if (comm->groupNext == reinterpret_cast(0x1)) { // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves // the users program order yet insures siblings occur consecutively. This // is required by doLaunches() in "group.cc". 
struct ncclComm** pp = &ncclGroupCommHead; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) pp = &(*pp)->groupNext; comm->groupNext = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. Each task batched for // this comm is allocated there. ncclMemoryStackPush(&comm->memScoped); } ncclGroupBlocking = comm->config.blocking; } // Add comm to this thread's group needing preconnect inline void ncclGroupCommPreconnect(struct ncclComm* comm) { if (comm->preconnectNext == reinterpret_cast(0x1)) { comm->preconnectNext = ncclGroupCommPreconnectHead; ncclGroupCommPreconnectHead = comm; } } // Comm has left group inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { comm->groupNext = reinterpret_cast(0x1); ncclMemoryStackPop(&comm->memScoped); return ncclSuccess; } #endif nccl-2.18.3-1/src/include/ibvcore.h000066400000000000000000000612331444201535000167010ustar00rootroot00000000000000#ifndef NCCL_IBV_CORE_H_ #define NCCL_IBV_CORE_H_ /* Basic IB verbs structs. Needed to dynamically load IB verbs functions without * explicit including of IB verbs header. */ #include #include #include #include #if __GNUC__ >= 3 # define __attribute_const __attribute__((const)) #else # define __attribute_const #endif union ibv_gid { uint8_t raw[16]; struct { uint64_t subnet_prefix; uint64_t interface_id; } global; }; #ifndef container_of /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. * @type: the type of the container struct this is embedded in. * @member: the name of the member within the struct. * */ #define container_of(ptr, type, member) \ ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) #endif #define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) /*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ //static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; enum ibv_node_type { IBV_NODE_UNKNOWN = -1, IBV_NODE_CA = 1, IBV_NODE_SWITCH, IBV_NODE_ROUTER, IBV_NODE_RNIC, /* Leave a gap for future node types before starting with * experimental node types. */ IBV_EXP_NODE_TYPE_START = 32, IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START }; enum ibv_transport_type { IBV_TRANSPORT_UNKNOWN = -1, IBV_TRANSPORT_IB = 0, IBV_TRANSPORT_IWARP, /* Leave a gap for future transport types before starting with * experimental transport types. 
*/ IBV_EXP_TRANSPORT_TYPE_START = 32, IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START }; enum ibv_device_cap_flags { IBV_DEVICE_RESIZE_MAX_WR = 1, IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, IBV_DEVICE_RAW_MULTI = 1 << 3, IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, IBV_DEVICE_INIT_TYPE = 1 << 9, IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, IBV_DEVICE_SRQ_RESIZE = 1 << 13, IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, IBV_DEVICE_XRC = 1 << 20, IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 }; enum ibv_atomic_cap { IBV_ATOMIC_NONE, IBV_ATOMIC_HCA, IBV_ATOMIC_GLOB }; struct ibv_device_attr { char fw_ver[64]; uint64_t node_guid; uint64_t sys_image_guid; uint64_t max_mr_size; uint64_t page_size_cap; uint32_t vendor_id; uint32_t vendor_part_id; uint32_t hw_ver; int max_qp; int max_qp_wr; int device_cap_flags; int max_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ibv_atomic_cap atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; uint16_t max_pkeys; uint8_t local_ca_ack_delay; uint8_t phys_port_cnt; }; enum ibv_mtu { IBV_MTU_256 = 1, IBV_MTU_512 = 2, IBV_MTU_1024 = 3, IBV_MTU_2048 = 4, IBV_MTU_4096 = 5 }; enum ibv_port_state { IBV_PORT_NOP = 0, IBV_PORT_DOWN = 1, IBV_PORT_INIT = 2, IBV_PORT_ARMED = 3, IBV_PORT_ACTIVE = 4, IBV_PORT_ACTIVE_DEFER = 5 }; enum { IBV_LINK_LAYER_UNSPECIFIED, IBV_LINK_LAYER_INFINIBAND, IBV_LINK_LAYER_ETHERNET, /* Leave a gap for future link layer types before starting with * experimental link layer. 
*/ IBV_EXP_LINK_LAYER_START = 32, IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START }; enum ibv_port_cap_flags { IBV_PORT_SM = 1 << 1, IBV_PORT_NOTICE_SUP = 1 << 2, IBV_PORT_TRAP_SUP = 1 << 3, IBV_PORT_OPT_IPD_SUP = 1 << 4, IBV_PORT_AUTO_MIGR_SUP = 1 << 5, IBV_PORT_SL_MAP_SUP = 1 << 6, IBV_PORT_MKEY_NVRAM = 1 << 7, IBV_PORT_PKEY_NVRAM = 1 << 8, IBV_PORT_LED_INFO_SUP = 1 << 9, IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IBV_PORT_CM_SUP = 1 << 16, IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, IBV_PORT_REINIT_SUP = 1 << 18, IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, IBV_PORT_VENDOR_CLASS = 1 << 24, IBV_PORT_CLIENT_REG_SUP = 1 << 25, IBV_PORT_IP_BASED_GIDS = 1 << 26, }; struct ibv_port_attr { enum ibv_port_state state; enum ibv_mtu max_mtu; enum ibv_mtu active_mtu; int gid_tbl_len; uint32_t port_cap_flags; uint32_t max_msg_sz; uint32_t bad_pkey_cntr; uint32_t qkey_viol_cntr; uint16_t pkey_tbl_len; uint16_t lid; uint16_t sm_lid; uint8_t lmc; uint8_t max_vl_num; uint8_t sm_sl; uint8_t subnet_timeout; uint8_t init_type_reply; uint8_t active_width; uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; uint8_t reserved; }; enum ibv_event_type { IBV_EVENT_CQ_ERR, IBV_EVENT_QP_FATAL, IBV_EVENT_QP_REQ_ERR, IBV_EVENT_QP_ACCESS_ERR, IBV_EVENT_COMM_EST, IBV_EVENT_SQ_DRAINED, IBV_EVENT_PATH_MIG, IBV_EVENT_PATH_MIG_ERR, IBV_EVENT_DEVICE_FATAL, IBV_EVENT_PORT_ACTIVE, IBV_EVENT_PORT_ERR, IBV_EVENT_LID_CHANGE, IBV_EVENT_PKEY_CHANGE, IBV_EVENT_SM_CHANGE, IBV_EVENT_SRQ_ERR, IBV_EVENT_SRQ_LIMIT_REACHED, IBV_EVENT_QP_LAST_WQE_REACHED, IBV_EVENT_CLIENT_REREGISTER, IBV_EVENT_GID_CHANGE, /* new experimental events start here leaving enough * room for 14 events which should be enough */ IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, IBV_EXP_EVENT_DCT_ACCESS_ERR, IBV_EXP_EVENT_DCT_REQ_ERR, }; struct ibv_async_event { union { struct ibv_cq *cq; struct ibv_qp *qp; struct ibv_srq *srq; struct ibv_exp_dct *dct; int port_num; /* For source compatible with Legacy API */ uint32_t xrc_qp_num; } element; enum ibv_event_type event_type; }; enum ibv_wc_status { IBV_WC_SUCCESS, IBV_WC_LOC_LEN_ERR, IBV_WC_LOC_QP_OP_ERR, IBV_WC_LOC_EEC_OP_ERR, IBV_WC_LOC_PROT_ERR, IBV_WC_WR_FLUSH_ERR, IBV_WC_MW_BIND_ERR, IBV_WC_BAD_RESP_ERR, IBV_WC_LOC_ACCESS_ERR, IBV_WC_REM_INV_REQ_ERR, IBV_WC_REM_ACCESS_ERR, IBV_WC_REM_OP_ERR, IBV_WC_RETRY_EXC_ERR, IBV_WC_RNR_RETRY_EXC_ERR, IBV_WC_LOC_RDD_VIOL_ERR, IBV_WC_REM_INV_RD_REQ_ERR, IBV_WC_REM_ABORT_ERR, IBV_WC_INV_EECN_ERR, IBV_WC_INV_EEC_STATE_ERR, IBV_WC_FATAL_ERR, IBV_WC_RESP_TIMEOUT_ERR, IBV_WC_GENERAL_ERR }; const char *ibv_wc_status_str(enum ibv_wc_status status); enum ibv_wc_opcode { IBV_WC_SEND, IBV_WC_RDMA_WRITE, IBV_WC_RDMA_READ, IBV_WC_COMP_SWAP, IBV_WC_FETCH_ADD, IBV_WC_BIND_MW, /* * Set value of IBV_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IBV_WC_RECV). 
*/ IBV_WC_RECV = 1 << 7, IBV_WC_RECV_RDMA_WITH_IMM }; enum ibv_wc_flags { IBV_WC_GRH = 1 << 0, IBV_WC_WITH_IMM = 1 << 1 }; struct ibv_wc { uint64_t wr_id; enum ibv_wc_status status; enum ibv_wc_opcode opcode; uint32_t vendor_err; uint32_t byte_len; uint32_t imm_data; /* in network byte order */ uint32_t qp_num; uint32_t src_qp; int wc_flags; uint16_t pkey_index; uint16_t slid; uint8_t sl; uint8_t dlid_path_bits; }; enum ibv_access_flags { IBV_ACCESS_LOCAL_WRITE = 1, IBV_ACCESS_REMOTE_WRITE = (1<<1), IBV_ACCESS_REMOTE_READ = (1<<2), IBV_ACCESS_REMOTE_ATOMIC = (1<<3), IBV_ACCESS_MW_BIND = (1<<4), IBV_ACCESS_RELAXED_ORDERING = (1<<20), }; struct ibv_pd { struct ibv_context *context; uint32_t handle; }; enum ibv_xrcd_init_attr_mask { IBV_XRCD_INIT_ATTR_FD = 1 << 0, IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 }; struct ibv_xrcd_init_attr { uint32_t comp_mask; int fd; int oflags; }; struct ibv_xrcd { struct ibv_context *context; }; enum ibv_rereg_mr_flags { IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), IBV_REREG_MR_CHANGE_PD = (1 << 1), IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), IBV_REREG_MR_KEEP_VALID = (1 << 3) }; struct ibv_mr { struct ibv_context *context; struct ibv_pd *pd; void *addr; size_t length; uint32_t handle; uint32_t lkey; uint32_t rkey; }; enum ibv_mw_type { IBV_MW_TYPE_1 = 1, IBV_MW_TYPE_2 = 2 }; struct ibv_mw { struct ibv_context *context; struct ibv_pd *pd; uint32_t rkey; }; struct ibv_global_route { union ibv_gid dgid; uint32_t flow_label; uint8_t sgid_index; uint8_t hop_limit; uint8_t traffic_class; }; struct ibv_grh { uint32_t version_tclass_flow; uint16_t paylen; uint8_t next_hdr; uint8_t hop_limit; union ibv_gid sgid; union ibv_gid dgid; }; enum ibv_rate { IBV_RATE_MAX = 0, IBV_RATE_2_5_GBPS = 2, IBV_RATE_5_GBPS = 5, IBV_RATE_10_GBPS = 3, IBV_RATE_20_GBPS = 6, IBV_RATE_30_GBPS = 4, IBV_RATE_40_GBPS = 7, IBV_RATE_60_GBPS = 8, IBV_RATE_80_GBPS = 9, IBV_RATE_120_GBPS = 10, IBV_RATE_14_GBPS = 11, IBV_RATE_56_GBPS = 12, IBV_RATE_112_GBPS = 13, IBV_RATE_168_GBPS = 14, IBV_RATE_25_GBPS = 15, IBV_RATE_100_GBPS = 16, IBV_RATE_200_GBPS = 17, IBV_RATE_300_GBPS = 18 }; /** * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; /** * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. * @mult: multiple to convert. */ enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; /** * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. * For example, IBV_RATE_5_GBPS will return the value 5000. * @rate: rate to convert. */ int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; /** * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. * @mbps: value to convert. 
*/ enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; struct ibv_ah_attr { struct ibv_global_route grh; uint16_t dlid; uint8_t sl; uint8_t src_path_bits; uint8_t static_rate; uint8_t is_global; uint8_t port_num; }; enum ibv_srq_attr_mask { IBV_SRQ_MAX_WR = 1 << 0, IBV_SRQ_LIMIT = 1 << 1 }; struct ibv_srq_attr { uint32_t max_wr; uint32_t max_sge; uint32_t srq_limit; }; struct ibv_srq_init_attr { void *srq_context; struct ibv_srq_attr attr; }; enum ibv_srq_type { IBV_SRQT_BASIC, IBV_SRQT_XRC }; enum ibv_srq_init_attr_mask { IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, IBV_SRQ_INIT_ATTR_PD = 1 << 1, IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, IBV_SRQ_INIT_ATTR_CQ = 1 << 3, IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 }; struct ibv_srq_init_attr_ex { void *srq_context; struct ibv_srq_attr attr; uint32_t comp_mask; enum ibv_srq_type srq_type; struct ibv_pd *pd; struct ibv_xrcd *xrcd; struct ibv_cq *cq; }; enum ibv_qp_type { IBV_QPT_RC = 2, IBV_QPT_UC, IBV_QPT_UD, /* XRC compatible code */ IBV_QPT_XRC, IBV_QPT_RAW_PACKET = 8, IBV_QPT_RAW_ETH = 8, IBV_QPT_XRC_SEND = 9, IBV_QPT_XRC_RECV, /* Leave a gap for future qp types before starting with * experimental qp types. */ IBV_EXP_QP_TYPE_START = 32, IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START }; struct ibv_qp_cap { uint32_t max_send_wr; uint32_t max_recv_wr; uint32_t max_send_sge; uint32_t max_recv_sge; uint32_t max_inline_data; }; struct ibv_qp_init_attr { void *qp_context; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; struct ibv_qp_cap cap; enum ibv_qp_type qp_type; int sq_sig_all; /* Below is needed for backwards compatabile */ struct ibv_xrc_domain *xrc_domain; }; enum ibv_qp_init_attr_mask { IBV_QP_INIT_ATTR_PD = 1 << 0, IBV_QP_INIT_ATTR_XRCD = 1 << 1, IBV_QP_INIT_ATTR_RESERVED = 1 << 2 }; struct ibv_qp_init_attr_ex { void *qp_context; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; struct ibv_qp_cap cap; enum ibv_qp_type qp_type; int sq_sig_all; uint32_t comp_mask; struct ibv_pd *pd; struct ibv_xrcd *xrcd; }; enum ibv_qp_open_attr_mask { IBV_QP_OPEN_ATTR_NUM = 1 << 0, IBV_QP_OPEN_ATTR_XRCD = 1 << 1, IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, IBV_QP_OPEN_ATTR_TYPE = 1 << 3, IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 }; struct ibv_qp_open_attr { uint32_t comp_mask; uint32_t qp_num; struct ibv_xrcd *xrcd; void *qp_context; enum ibv_qp_type qp_type; }; enum ibv_qp_attr_mask { IBV_QP_STATE = 1 << 0, IBV_QP_CUR_STATE = 1 << 1, IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, IBV_QP_ACCESS_FLAGS = 1 << 3, IBV_QP_PKEY_INDEX = 1 << 4, IBV_QP_PORT = 1 << 5, IBV_QP_QKEY = 1 << 6, IBV_QP_AV = 1 << 7, IBV_QP_PATH_MTU = 1 << 8, IBV_QP_TIMEOUT = 1 << 9, IBV_QP_RETRY_CNT = 1 << 10, IBV_QP_RNR_RETRY = 1 << 11, IBV_QP_RQ_PSN = 1 << 12, IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, IBV_QP_ALT_PATH = 1 << 14, IBV_QP_MIN_RNR_TIMER = 1 << 15, IBV_QP_SQ_PSN = 1 << 16, IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, IBV_QP_PATH_MIG_STATE = 1 << 18, IBV_QP_CAP = 1 << 19, IBV_QP_DEST_QPN = 1 << 20 }; enum ibv_qp_state { IBV_QPS_RESET, IBV_QPS_INIT, IBV_QPS_RTR, IBV_QPS_RTS, IBV_QPS_SQD, IBV_QPS_SQE, IBV_QPS_ERR, IBV_QPS_UNKNOWN }; enum ibv_mig_state { IBV_MIG_MIGRATED, IBV_MIG_REARM, IBV_MIG_ARMED }; struct ibv_qp_attr { enum ibv_qp_state qp_state; enum ibv_qp_state cur_qp_state; enum ibv_mtu path_mtu; enum ibv_mig_state path_mig_state; uint32_t qkey; uint32_t rq_psn; uint32_t sq_psn; uint32_t dest_qp_num; int qp_access_flags; struct ibv_qp_cap cap; struct ibv_ah_attr ah_attr; struct ibv_ah_attr alt_ah_attr; uint16_t pkey_index; uint16_t alt_pkey_index; uint8_t en_sqd_async_notify; uint8_t 
sq_draining; uint8_t max_rd_atomic; uint8_t max_dest_rd_atomic; uint8_t min_rnr_timer; uint8_t port_num; uint8_t timeout; uint8_t retry_cnt; uint8_t rnr_retry; uint8_t alt_port_num; uint8_t alt_timeout; }; enum ibv_wr_opcode { IBV_WR_RDMA_WRITE, IBV_WR_RDMA_WRITE_WITH_IMM, IBV_WR_SEND, IBV_WR_SEND_WITH_IMM, IBV_WR_RDMA_READ, IBV_WR_ATOMIC_CMP_AND_SWP, IBV_WR_ATOMIC_FETCH_AND_ADD }; enum ibv_send_flags { IBV_SEND_FENCE = 1 << 0, IBV_SEND_SIGNALED = 1 << 1, IBV_SEND_SOLICITED = 1 << 2, IBV_SEND_INLINE = 1 << 3 }; struct ibv_sge { uint64_t addr; uint32_t length; uint32_t lkey; }; struct ibv_send_wr { uint64_t wr_id; struct ibv_send_wr *next; struct ibv_sge *sg_list; int num_sge; enum ibv_wr_opcode opcode; int send_flags; uint32_t imm_data; /* in network byte order */ union { struct { uint64_t remote_addr; uint32_t rkey; } rdma; struct { uint64_t remote_addr; uint64_t compare_add; uint64_t swap; uint32_t rkey; } atomic; struct { struct ibv_ah *ah; uint32_t remote_qpn; uint32_t remote_qkey; } ud; } wr; union { union { struct { uint32_t remote_srqn; } xrc; } qp_type; uint32_t xrc_remote_srq_num; }; }; struct ibv_recv_wr { uint64_t wr_id; struct ibv_recv_wr *next; struct ibv_sge *sg_list; int num_sge; }; struct ibv_mw_bind { uint64_t wr_id; struct ibv_mr *mr; void *addr; size_t length; int send_flags; int mw_access_flags; }; struct ibv_srq { struct ibv_context *context; void *srq_context; struct ibv_pd *pd; uint32_t handle; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; /* below are for source compatabilty with legacy XRC, * padding based on ibv_srq_legacy. */ uint32_t xrc_srq_num_bin_compat_padding; struct ibv_xrc_domain *xrc_domain_bin_compat_padding; struct ibv_cq *xrc_cq_bin_compat_padding; void *ibv_srq_padding; /* legacy fields */ uint32_t xrc_srq_num; struct ibv_xrc_domain *xrc_domain; struct ibv_cq *xrc_cq; }; /* Not in use in new API, needed for compilation as part of source compat layer */ enum ibv_event_flags { IBV_XRC_QP_EVENT_FLAG = 0x80000000, }; struct ibv_qp { struct ibv_context *context; void *qp_context; struct ibv_pd *pd; struct ibv_cq *send_cq; struct ibv_cq *recv_cq; struct ibv_srq *srq; uint32_t handle; uint32_t qp_num; enum ibv_qp_state state; enum ibv_qp_type qp_type; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; struct ibv_comp_channel { struct ibv_context *context; int fd; int refcnt; }; struct ibv_cq { struct ibv_context *context; struct ibv_comp_channel *channel; void *cq_context; uint32_t handle; int cqe; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t comp_events_completed; uint32_t async_events_completed; }; struct ibv_ah { struct ibv_context *context; struct ibv_pd *pd; uint32_t handle; }; enum ibv_flow_flags { IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, }; enum ibv_flow_attr_type { /* steering according to rule specifications */ IBV_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IBV_FLOW_ATTR_MC_DEFAULT = 0x2, }; enum ibv_flow_spec_type { IBV_FLOW_SPEC_ETH = 0x20, IBV_FLOW_SPEC_IPV4 = 0x30, IBV_FLOW_SPEC_TCP = 0x40, IBV_FLOW_SPEC_UDP = 0x41, }; struct ibv_flow_eth_filter { uint8_t dst_mac[6]; uint8_t src_mac[6]; uint16_t ether_type; /* * same layout as 802.1q: prio 3, cfi 1, vlan id 12 */ uint16_t vlan_tag; }; struct ibv_flow_spec_eth { enum ibv_flow_spec_type type; 
uint16_t size; struct ibv_flow_eth_filter val; struct ibv_flow_eth_filter mask; }; struct ibv_flow_ipv4_filter { uint32_t src_ip; uint32_t dst_ip; }; struct ibv_flow_spec_ipv4 { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_ipv4_filter val; struct ibv_flow_ipv4_filter mask; }; struct ibv_flow_tcp_udp_filter { uint16_t dst_port; uint16_t src_port; }; struct ibv_flow_spec_tcp_udp { enum ibv_flow_spec_type type; uint16_t size; struct ibv_flow_tcp_udp_filter val; struct ibv_flow_tcp_udp_filter mask; }; struct ibv_flow_spec { union { struct { enum ibv_flow_spec_type type; uint16_t size; } hdr; struct ibv_flow_spec_eth eth; struct ibv_flow_spec_ipv4 ipv4; struct ibv_flow_spec_tcp_udp tcp_udp; }; }; struct ibv_flow_attr { uint32_t comp_mask; enum ibv_flow_attr_type type; uint16_t size; uint16_t priority; uint8_t num_of_specs; uint8_t port; uint32_t flags; /* Following are the optional layers according to user request * struct ibv_flow_spec_xxx [L2] * struct ibv_flow_spec_yyy [L3/L4] */ }; struct ibv_flow { uint32_t comp_mask; struct ibv_context *context; uint32_t handle; }; struct ibv_device; struct ibv_context; struct ibv_device_ops { struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); void (*free_context)(struct ibv_context *context); }; enum { IBV_SYSFS_NAME_MAX = 64, IBV_SYSFS_PATH_MAX = 256 }; struct ibv_device { struct ibv_device_ops ops; enum ibv_node_type node_type; enum ibv_transport_type transport_type; /* Name of underlying kernel IB device, eg "mthca0" */ char name[IBV_SYSFS_NAME_MAX]; /* Name of uverbs device, eg "uverbs0" */ char dev_name[IBV_SYSFS_NAME_MAX]; /* Path to infiniband_verbs class device in sysfs */ char dev_path[IBV_SYSFS_PATH_MAX]; /* Path to infiniband class device in sysfs */ char ibdev_path[IBV_SYSFS_PATH_MAX]; }; struct verbs_device { struct ibv_device device; /* Must be first */ size_t sz; size_t size_of_context; int (*init_context)(struct verbs_device *device, struct ibv_context *ctx, int cmd_fd); void (*uninit_context)(struct verbs_device *device, struct ibv_context *ctx); /* future fields added here */ }; struct ibv_context_ops { int (*query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); int (*query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); struct ibv_pd * (*alloc_pd)(struct ibv_context *context); int (*dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); int (*dereg_mr)(struct ibv_mr *mr); struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, struct ibv_mw_bind *mw_bind); int (*dealloc_mw)(struct ibv_mw *mw); struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector); int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); void (*cq_event)(struct ibv_cq *cq); int (*resize_cq)(struct ibv_cq *cq, int cqe); int (*destroy_cq)(struct ibv_cq *cq); struct ibv_srq * (*create_srq)(struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr); int (*modify_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask); int (*query_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); int (*destroy_srq)(struct ibv_srq *srq); int (*post_srq_recv)(struct ibv_srq *srq, 
struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_recv_wr); struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int (*destroy_qp)(struct ibv_qp *qp); int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr); int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); int (*destroy_ah)(struct ibv_ah *ah); int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); void (*async_event)(struct ibv_async_event *event); }; struct ibv_context { struct ibv_device *device; struct ibv_context_ops ops; int cmd_fd; int async_fd; int num_comp_vectors; pthread_mutex_t mutex; void *abi_compat; }; enum verbs_context_mask { VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, VERBS_CONTEXT_QP = (uint64_t)1 << 2, VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, VERBS_CONTEXT_EXP = (uint64_t)1 << 62 }; struct verbs_context { /* "grows up" - new fields go here */ int (*_reserved_2) (void); int (*destroy_flow) (struct ibv_flow *flow); int (*_reserved_1) (void); struct ibv_flow * (*create_flow) (struct ibv_qp *qp, struct ibv_flow_attr *flow_attr); struct ibv_qp * (*open_qp)(struct ibv_context *context, struct ibv_qp_open_attr *attr); struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex); int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, struct ibv_srq_init_attr_ex *srq_init_attr_ex); struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr); int (*close_xrcd)(struct ibv_xrcd *xrcd); uint64_t has_comp_mask; size_t sz; /* Must be immediately before struct ibv_context */ struct ibv_context context;/* Must be last field in the struct */ }; /*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ /*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) { return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? NULL : container_of(ctx, struct verbs_context, context); } #define verbs_get_ctx_op(ctx, op) ({ \ struct verbs_context *_vctx = verbs_get_ctx(ctx); \ (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ !_vctx->op) ? NULL : _vctx; })*/ #define verbs_set_ctx_op(_vctx, op, ptr) ({ \ struct verbs_context *vctx = _vctx; \ if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ vctx->op = ptr; }) static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) { return (dev->ops.alloc_context) ? 
NULL : container_of(dev, struct verbs_device, device); } static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { return qp->context->ops.post_send(qp, wr, bad_wr); } #endif // NCCL_IBV_CORE_H_ nccl-2.18.3-1/src/include/ibvsymbols.h000066400000000000000000000047321444201535000174420ustar00rootroot00000000000000#ifndef NCCL_IBV_SYMBOLS_H_ #define NCCL_IBV_SYMBOLS_H_ #ifdef NCCL_BUILD_RDMA_CORE #include #else #include "ibvcore.h" #endif #include "nccl.h" /* IB Verbs Function Pointers*/ struct ncclIbvSymbols { int (*ibv_internal_fork_init)(void); struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); void (*ibv_internal_free_device_list)(struct ibv_device **list); const char * (*ibv_internal_get_device_name)(struct ibv_device *device); struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); int (*ibv_internal_close_device)(struct ibv_context *context); int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); /* DMA-BUF support */ struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); }; /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); #endif // NCCL_IBV_SYMBOLS_H_ nccl-2.18.3-1/src/include/ibvwrap.h000066400000000000000000000113571444201535000167240ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_IBVWRAP_H_ #define NCCL_IBVWRAP_H_ #ifdef NCCL_BUILD_RDMA_CORE #include #else #include "ibvcore.h" #endif #include "core.h" #include #include typedef enum ibv_return_enum { IBV_SUCCESS = 0, //!< The operation was successful } ibv_return_t; ncclResult_t wrap_ibv_symbols(void); /* NCCL wrappers of IB verbs functions */ ncclResult_t wrap_ibv_fork_init(void); ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); const char *wrap_ibv_get_device_name(struct ibv_device *device); ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); ncclResult_t wrap_ibv_close_device(struct ibv_context *context); ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); /* DMA-BUF support */ ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ if (done < 0) { WARN("Call to ibv_poll_cq() returned %d", done); return ncclSystemError; } *num_done = done; return ncclSuccess; } ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno 
on failure (which indicates the failure reason)*/ if (ret != IBV_SUCCESS) { WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); return ncclSystemError; } return ncclSuccess; } static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ if (ret != IBV_SUCCESS) { WARN("ibv_post_recv() failed with error %s", strerror(ret)); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); #endif //End include guard nccl-2.18.3-1/src/include/info.h000066400000000000000000000063671444201535000162120ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_INFO_H_ #define NCCL_INFO_H_ #include "nccl.h" #include "devcomm.h" #include "collectives.h" #include "core.h" #include "utils.h" #include "strongstream.h" typedef enum : uint8_t { ncclPatternRing, ncclPatternRingTwice, ncclPatternPipelineFrom, ncclPatternPipelineTo, ncclPatternTreeUp, ncclPatternTreeDown, ncclPatternTreeUpDown, ncclPatternCollnetChain, ncclPatternCollnetDirect, ncclPatternNvls, ncclPatternNvlsTree, ncclPatternSend, ncclPatternRecv } ncclPattern_t; // Used to pass NCCL call information between functions struct ncclInfo { ncclFunc_t coll; const char* opName; // NCCL Coll Args const void* sendbuff; void* recvbuff; size_t count; ncclDataType_t datatype; ncclRedOp_t op; int root; // peer for p2p operations ncclComm_t comm; cudaStream_t stream; // Algorithm details int chunkSteps; int sliceSteps; // Computed later ncclDevRedOpFull opFull; int algorithm; int protocol; ncclPattern_t pattern; int nChannels; int nThreads; size_t nBytes; int nstepsPerLoop; int nchunksPerLoop; int chunkSize; int channelId; }; inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { info->nBytes = info->count * ncclTypeSize(info->datatype); if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { info->count = info->nBytes; info->datatype = ncclInt8; } if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank return ncclSuccess; } struct ncclTaskColl { struct ncclTaskColl* next; ncclFunc_t func; void const* sendbuff; void* recvbuff; size_t count; int root; ncclDataType_t datatype; ncclDevRedOpFull op; int chunkSteps, sliceSteps; }; struct ncclTaskP2p { ncclTaskP2p *next; void *buff; size_t bytes; // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track // of where it left off. int chunk; }; struct ncclCudaStreamList { struct ncclCudaStreamList *next; cudaStream_t stream; }; struct ncclTasks { struct Peer { bool sendSeen, recvSeen; struct ncclIntruQueue sendQueue; struct ncclIntruQueue recvQueue; }; struct ncclIntruQueue collQueue; size_t collBytesTotal; struct Peer* peers/*[nRanks]*/; int *p2pSendOrder, *p2pRecvOrder; int p2pOrderSteps; int nTasksColl, nTasksP2p; // The list of user streams aggregated over all tasks present. struct ncclCudaStreamList* streams; // The most recent user stream. 
Ignored if streams==nullptr cudaStream_t streamRecent; // The graph capturing all user streams or invalid if none. Thus we restrict the // user that all streams must be captured in the same graph or not captured // at all. Technically we could probably relax this, but that would mean // collecting a different `ncclTasks` per graph and one for non-graph. struct ncclCudaGraph capturingGraph; }; #endif nccl-2.18.3-1/src/include/ipcsocket.h000066400000000000000000000016721444201535000172350ustar00rootroot00000000000000/* * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. * * See COPYRIGHT for license information */ #ifndef NCCL_IPCSOCKET_H #define NCCL_IPCSOCKET_H #include "nccl.h" #include #include #include #include #include #include #include #include #include #include #include #define NCCL_IPC_SOCKNAME_LEN 64 struct ncclIpcSocket { int fd; char socketName[NCCL_IPC_SOCKNAME_LEN]; volatile uint32_t* abortFlag; }; ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); #endif /* NCCL_IPCSOCKET_H */ nccl-2.18.3-1/src/include/nccl_net.h000066400000000000000000000414071444201535000170360ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_NET_H_ #define NCCL_NET_H_ #include "nccl.h" #include #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 #define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 8 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency int maxComms; // Maximum number of comms we can create int maxRecvs; // Maximum number of grouped receives. }ncclNetProperties_v6_t; typedef ncclNetProperties_v6_t ncclNetProperties_t; typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. 
ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v6_t; typedef ncclNet_v6_t ncclNet_t; #define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 typedef struct { // Name of the collective network (mainly for logs) const char* name; // Initialize the collective network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters capable of doing collective operations. // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Create a group for collective operations. handles have been created // using listen() above. rank indicates caller's rank in the collective network. ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); // Returns whether a reduction operation on a data type is supported. // 1 for supported, 0 otherwise. ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* collComm, void* mhandle); // Performs an asynchronous allreduce operation on the collective group. // May return request == NULL if the call cannot be performed (or would block). ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free collective comm objects ncclResult_t (*closeColl)(void* collComm); ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v6_t; typedef ncclCollNet_v6_t ncclCollNet_t; #define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 // v5 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // Test whether a request is complete. 
If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* sizes); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v5_t; // v5 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; // Initialize the collective network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters capable of doing collective operations. // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Create a group for collective operations. handles have been created // using listen() above. rank indicates caller's rank in the collective network. ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); // Returns whether a reduction operation on a data type is supported. // 1 for supported, 0 otherwise. ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* collComm, void* mhandle); // Performs an asynchronous allreduce operation on the collective group. // May return request == NULL if the call cannot be performed (or would block). ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free collective comm objects ncclResult_t (*closeColl)(void* collComm); ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v5_t; // v4 struct for backwards compatibility typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA int speed; // Port speed in Mbps. int port; // Port number. int maxComms; // Maximum number of comms we can create } ncclNetProperties_v4_t; // v4 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); // Create a receiving object and provide a handle to connect to it. 
The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. ncclResult_t (*connect)(int dev, void* handle, void** sendComm); // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free send/recv comm objects ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; // v4 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; // Initialize the collective network. ncclResult_t (*init)(ncclDebugLogger_t logFunction); // Return the number of adapters capable of doing collective operations. // If ndev returns 0, all other functions might be set to NULL. ncclResult_t (*devices)(int* ndev); // Get various device properties. ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create connections. ncclResult_t (*listen)(int dev, void* handle, void** listenComm); // Create a group for collective operations. handles have been created // using listen() above. rank indicates caller's rank in the collective network. ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); // Returns whether a reduction operation on a data type is supported. // 1 for supported, 0 otherwise. ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* collComm, void* mhandle); // Performs an asynchronous allreduce operation on the collective group. // May return request == NULL if the call cannot be performed (or would block). 
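//
// Illustrative sketch (placeholder names, not real symbols) of the retry and
// completion pattern shared by the asynchronous calls in these structs: retry
// while the call returns request == NULL, then poll test() until done != 0.
//
//   void* request = NULL;
//   int done = 0, bytes = 0;
//   while (request == NULL)
//     iallreduce(collComm, sendBuf, recvBuf, count, dataType, redOp,
//                sendMhandle, recvMhandle, &request);
//   while (!done) test(request, &done, &bytes);
//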
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); // Test whether a request is complete. If size is not NULL, it returns the // number of bytes sent/received. ncclResult_t (*test)(void* request, int* done, int* size); // Close and free collective comm objects ncclResult_t (*closeColl)(void* collComm); ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v4_t; #endif // end include guard nccl-2.18.3-1/src/include/net.h000066400000000000000000000014071444201535000160330ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_INT_NET_H_ #define NCCL_INT_NET_H_ #include "nccl.h" #include "nccl_net.h" #include "comm.h" #include "checks.h" typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; ncclResult_t ncclNetPluginInit(); ncclResult_t ncclNetInit(struct ncclComm* comm); int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; #endif nccl-2.18.3-1/src/include/nvmlwrap.h000066400000000000000000000222701444201535000171140ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_NVMLWRAP_H_ #define NCCL_NVMLWRAP_H_ #include "nccl.h" //#define NCCL_NVML_DIRECT 1 #ifndef NCCL_NVML_DIRECT #define NCCL_NVML_DIRECT 0 #endif #if NCCL_NVML_DIRECT #include "nvml.h" #else // Dynamically handle dependencies on NVML /* Extracted from nvml.h */ typedef struct nvmlDevice_st* nvmlDevice_t; #define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 typedef enum nvmlEnableState_enum { NVML_FEATURE_DISABLED = 0, //!< Feature disabled NVML_FEATURE_ENABLED = 1 //!< Feature enabled } nvmlEnableState_t; typedef enum nvmlNvLinkCapability_enum { NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device // should be last NVML_NVLINK_CAP_COUNT } nvmlNvLinkCapability_t; typedef enum nvmlReturn_enum { NVML_SUCCESS = 0, //!< The operation was successful NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred } nvmlReturn_t; typedef struct nvmlPciInfo_st { char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff unsigned int bus; //!< The bus on which the device resides, 0 to 0xff unsigned int device; //!< The device's id on the bus, 0 to 31 unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id // Added in NVML 2.285 API unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID // NVIDIA reserved for internal use only 
unsigned int reserved0; unsigned int reserved1; unsigned int reserved2; unsigned int reserved3; } nvmlPciInfo_t; /* P2P Capability Index Status*/ typedef enum nvmlGpuP2PStatus_enum { NVML_P2P_STATUS_OK = 0, NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, NVML_P2P_STATUS_GPU_NOT_SUPPORTED, NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, NVML_P2P_STATUS_DISABLED_BY_REGKEY, NVML_P2P_STATUS_NOT_SUPPORTED, NVML_P2P_STATUS_UNKNOWN } nvmlGpuP2PStatus_t; /* P2P Capability Index*/ typedef enum nvmlGpuP2PCapsIndex_enum { NVML_P2P_CAPS_INDEX_READ = 0, NVML_P2P_CAPS_INDEX_WRITE, NVML_P2P_CAPS_INDEX_NVLINK, NVML_P2P_CAPS_INDEX_ATOMICS, NVML_P2P_CAPS_INDEX_PROP, NVML_P2P_CAPS_INDEX_UNKNOWN } nvmlGpuP2PCapsIndex_t; /** * Represents the type for sample value returned */ typedef enum nvmlValueType_enum { NVML_VALUE_TYPE_DOUBLE = 0, NVML_VALUE_TYPE_UNSIGNED_INT = 1, NVML_VALUE_TYPE_UNSIGNED_LONG = 2, NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, // Keep this last NVML_VALUE_TYPE_COUNT }nvmlValueType_t; /** * Union to represent different types of Value */ typedef union nvmlValue_st { double dVal; //!< If the value is double unsigned int uiVal; //!< If the value is unsigned int unsigned long ulVal; //!< If the value is unsigned long unsigned long long ullVal; //!< If the value is unsigned long long signed long long sllVal; //!< If the value is signed long long }nvmlValue_t; /** * Field Identifiers. * * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. */ /* NVLink Speed */ #define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links #define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device /** * Remote device NVLink ID * * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. */ #define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID /** * NVSwitch: connected NVLink count */ #define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch #define NVML_FI_DEV_NVLINK_GET_SPEED 164 #define NVML_FI_DEV_NVLINK_GET_STATE 165 #define NVML_FI_DEV_NVLINK_GET_VERSION 166 #define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above /** * Information for a Field Value Sample */ typedef struct nvmlFieldValue_st { unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. nvmlValueType_t valueType; //!< Type of the value stored in value nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS nvmlValue_t value; //!< Value for this field. 
This is only valid if nvmlReturn == NVML_SUCCESS } nvmlFieldValue_t; /* End of nvml.h */ #endif // NCCL_NVML_DIRECT constexpr int ncclNvmlMaxDevices = 32; struct ncclNvmlDeviceInfo { nvmlDevice_t handle; int computeCapabilityMajor, computeCapabilityMinor; }; struct ncclNvmlDevicePairInfo { nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; }; extern int ncclNvmlDeviceCount; extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. // Outsiders need only call it if they want to inspect the ncclNvml global // tables above. ncclResult_t ncclNvmlEnsureInitialized(); ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); #endif // End include guard nccl-2.18.3-1/src/include/nvtx.h000066400000000000000000000061341444201535000162460ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_NVTX_H_ #define NCCL_NVTX_H_ #include "nvtx3/nvtx3.hpp" #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) #define NVTX3_CONSTEXPR_IF_CPP14 constexpr #else #define NVTX3_CONSTEXPR_IF_CPP14 #endif // Define all NCCL-provided static schema IDs here (avoid duplicates). #define NVTX_SID_CommInitRank 0 #define NVTX_SID_CommInitAll 1 #define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_AllGather 4 #define NVTX_SID_AllReduce 5 #define NVTX_SID_Broadcast 6 #define NVTX_SID_ReduceScatter 7 #define NVTX_SID_Reduce 8 #define NVTX_SID_Send 9 #define NVTX_SID_Recv 10 // Define static schema ID for the reduction operation. 
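// (Static schema IDs are offset by NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START;
// for example, the AllReduce schema registered by NVTX3_FUNC_WITH_PARAMS below
// resolves to NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_AllReduce.)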
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; class payload_schema { public: explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept { schema_attr.name = schemaName; schema_attr.entries = entries; schema_attr.numEntries = numEntries; schema_attr.schemaId = schemaId; nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); } payload_schema() = delete; ~payload_schema() = default; payload_schema(payload_schema const&) = default; payload_schema& operator=(payload_schema const&) = default; payload_schema(payload_schema&&) = default; payload_schema& operator=(payload_schema&&) = default; private: nvtxPayloadSchemaAttr_t schema_attr{ NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, nullptr, NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, nullptr, 0, 0, 0}; }; // Create NVTX push/pop range with parameters // @param name of the operation (see `NVTX_SID_*`) // @param N schema name // @param S schema (entries) // @param P payload (struct) #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ static const payload_schema schema{S, std::extent::value, \ NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ nvtxPayloadData_t nvtx3_bpl__[] = { \ {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; extern void initNvtxRegisteredEnums(); #endif nccl-2.18.3-1/src/include/nvtx3/000077500000000000000000000000001444201535000161545ustar00rootroot00000000000000nccl-2.18.3-1/src/include/nvtx3/nvToolsExt.h000066400000000000000000001427441444201535000204660ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /** \file nvToolsExt.h */ /* ========================================================================= */ /** \mainpage * \tableofcontents * \section INTRODUCTION Introduction * * The NVIDIA Tools Extension library is a set of functions that a * developer can use to provide additional information to tools. * The additional information is used by the tool to improve * analysis and visualization of data. * * The library introduces close to zero overhead if no tool is * attached to the application. The overhead when a tool is * attached is specific to the tool. * * \section INITIALIZATION_SECTION Initialization * * Typically the tool's library that plugs into NVTX is indirectly * loaded via enviromental properties that are platform specific. * For some platform or special cases, the user may be required * to instead explicity initialize instead though. This can also * be helpful to control when the API loads a tool's library instead * of what would typically be the first function call to emit info. * For these rare case, see \ref INITIALIZATION for additional information. 
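*
* A minimal illustration, assuming an application wants the tool library
* initialized before its first annotated region, is a single early call:
* \code
* nvtxInitialize(NULL);   // the reserved parameter must be zero or NULL
* \endcode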
* * \section MARKERS_AND_RANGES Markers and Ranges * * Markers and ranges are used to describe events at a specific time (markers) * or over a time span (ranges) during the execution of the application * respectively. * * \subsection MARKERS Markers * * Markers denote specific moments in time. * * * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on * how to specify the domain. * * \subsection THREAD_RANGES Thread Ranges * * Thread ranges denote nested time ranges. Nesting is maintained per thread * per domain and does not require any additional correlation mechanism. The * duration of a thread range is defined by the corresponding pair of * nvtxRangePush* to nvtxRangePop API calls. * * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on * how to specify the domain. * * \subsection PROCESS_RANGES Process Ranges * * Process ranges denote a time span that can expose arbitrary concurrency, as * opposed to thread ranges that only support nesting. In addition the range * start event can happen on a different thread than the end marker. For the * correlation of a start/end pair an unique correlation ID is used that is * returned from the start API call and needs to be passed into the end API * call. * * \subsection EVENT_ATTRIBUTES Event Attributes * * \ref MARKERS_AND_RANGES can be annotated with various attributes to provide * additional information for an event or to guide the tool's visualization of * the data. Each of the attributes is optional and if left unused the * attributes fall back to a default value. The attributes include: * - color * - category * * To specify any attribute other than the text message, the \ref * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used. * * \section DOMAINS Domains * * Domains enable developers to scope annotations. By default all events and * annotations are in the default domain. Additional domains can be registered. * This allows developers to scope markers, ranges, and resources names to * avoid conflicts. * * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create * a named domain. * * Each domain maintains its own * - categories * - thread range stacks * - registered strings * * The function ::nvtxDomainDestroy marks the end of the domain. Destroying * a domain unregisters and destroys all objects associated with it such as * registered strings, resource objects, named categories, and started ranges. * * \section RESOURCE_NAMING Resource Naming * * This section covers calls that allow to annotate objects with user-provided * names in order to allow for a better analysis of complex trace data. All of * the functions take the handle or the ID of the object to name and the name. * The functions can be called multiple times during the execution of an * application, however, in that case it is implementation dependent which * name will be reported by the tool. * * \subsection CATEGORY_NAMING Category Naming * * Some function in this library support associating an integer category * to enable filtering and sorting. The category naming functions allow * the application to associate a user friendly name with the integer * category. Support for domains have been added in NVTX_VERSION_2 to * avoid collisions when domains are developed independantly. * * \subsection RESOURCE_OBJECTS Resource Objects * * Resource objects are a generic mechanism for attaching data to an application * resource. 
The identifier field makes the association to a pointer or handle, * while the type field helps provide deeper understanding of the identifier as * well as enabling differentiation in cases where handles generated by different * APIs may collide. The resource object may also have an associated message to * associate with the application resource, enabling further annotation of this * object and how it is used. * * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming * functions and allow the application resource identified by those functions to be * associated to a domain. The other naming functions are still supported for backward * compatibility but will be associated only to the default domain. * * \subsection RESOURCE_NAMING_OS Resource Naming * * Some operating system resources creation APIs do not support providing a user friendly * name, such as some OS thread creation APIs. This API support resource naming though * both through resource objects and functions following the pattern * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2 * supersede the other functions with a a more general method of assigning names to OS resources, * along with associating them to domains too. The older nvtxName* functions are only associated * with the default domain. * \section EXTENSIONS Optional Extensions * Optional extensions will either appear within the existing sections the extend or appear * in the "Related Pages" when they introduce new concepts. */ /** * Tools Extension API version */ #if defined(NVTX_VERSION) && NVTX_VERSION < 3 #error "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead." #endif /* Header guard */ #if !defined(NVTX_VERSION) #define NVTX_VERSION 3 #if defined(_MSC_VER) #define NVTX_API __stdcall #define NVTX_INLINE_STATIC __inline static #else /*defined(__GNUC__)*/ #define NVTX_API #define NVTX_INLINE_STATIC inline static #endif /* Platform */ #if defined(NVTX_NO_IMPL) /* When omitting implementation, avoid declaring functions inline */ /* without definitions, since this causes compiler warnings. */ #define NVTX_DECLSPEC #elif defined(NVTX_EXPORT_API) /* Allow overriding definition of NVTX_DECLSPEC when exporting API. */ /* Default is empty, meaning non-inline with external linkage. */ #if !defined(NVTX_DECLSPEC) #define NVTX_DECLSPEC #endif #else /* Normal NVTX usage defines the NVTX API inline with static */ /* (internal) linkage. */ #define NVTX_DECLSPEC NVTX_INLINE_STATIC #endif #include "nvtxDetail/nvtxLinkOnce.h" #define NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION) NAME##_v##VERSION #define NVTX_VERSIONED_IDENTIFIER_L2(NAME, VERSION) NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION) #define NVTX_VERSIONED_IDENTIFIER(NAME) NVTX_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION) /** * The nvToolsExt library depends on stdint.h. If the build tool chain in use * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED * and define the following types: *
*     - uint8_t
*     - int8_t
*     - uint16_t
*     - int16_t
*     - uint32_t
*     - int32_t
*     - uint64_t
*     - int64_t
*     - uintptr_t
*     - intptr_t
*
* #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file. */ #ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED #include #endif #include #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** * Result Codes */ #define NVTX_SUCCESS 0 #define NVTX_FAIL 1 #define NVTX_ERR_INIT_LOAD_PROPERTY 2 #define NVTX_ERR_INIT_ACCESS_LIBRARY 3 #define NVTX_ERR_INIT_LOAD_LIBRARY 4 #define NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT 5 #define NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT 6 #define NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE 7 /** * Size of the nvtxEventAttributes_t structure. */ #define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_t) ) ) #define NVTX_NO_PUSH_POP_TRACKING ((int)-2) typedef uint64_t nvtxRangeId_t; /* Forward declaration of opaque domain registration structure */ struct nvtxDomainRegistration_st; typedef struct nvtxDomainRegistration_st nvtxDomainRegistration; /* \brief Domain Handle Structure. * \anchor DOMAIN_HANDLE_STRUCTURE * * This structure is opaque to the user and is used as a handle to reference * a domain. This type is returned from tools when using the NVTX API to * create a domain. * */ typedef nvtxDomainRegistration* nvtxDomainHandle_t; /* Forward declaration of opaque string registration structure */ struct nvtxStringRegistration_st; typedef struct nvtxStringRegistration_st nvtxStringRegistration; /* \brief Registered String Handle Structure. * \anchor REGISTERED_STRING_HANDLE_STRUCTURE * * This structure is opaque to the user and is used as a handle to reference * a registered string. This type is returned from tools when using the NVTX * API to create a registered string. * */ typedef nvtxStringRegistration* nvtxStringHandle_t; /* ========================================================================= */ /** \defgroup GENERAL General * @{ */ /** --------------------------------------------------------------------------- * Color Types * ------------------------------------------------------------------------- */ typedef enum nvtxColorType_t { NVTX_COLOR_UNKNOWN = 0, /**< Color attribute is unused. */ NVTX_COLOR_ARGB = 1 /**< An ARGB color is provided. */ } nvtxColorType_t; /** --------------------------------------------------------------------------- * Message Types * ------------------------------------------------------------------------- */ typedef enum nvtxMessageType_t { NVTX_MESSAGE_UNKNOWN = 0, /**< Message payload is unused. */ NVTX_MESSAGE_TYPE_ASCII = 1, /**< A character sequence is used as payload. */ NVTX_MESSAGE_TYPE_UNICODE = 2, /**< A wide character sequence is used as payload. */ /* NVTX_VERSION_2 */ NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered with \ref nvtxDomainRegisterStringA() or \ref nvtxDomainRegisterStringW(). */ } nvtxMessageType_t; typedef union nvtxMessageValue_t { const char* ascii; const wchar_t* unicode; /* NVTX_VERSION_2 */ nvtxStringHandle_t registered; } nvtxMessageValue_t; /** @} */ /*END defgroup*/ /* ------------------------------------------------------------------------- */ /** \brief Force initialization (optional) * * Force NVTX library to initialize. The first call to any NVTX API function * will automatically initialize the entire API. This can make the first call * much slower than subsequent calls. In applications where the first call to * NVTX may be in a performance-critical section, calling nvtxInitialize before * any performance-critical sections will ensure NVTX initialization occurs at * an acceptable time. 
Since nvtxInitialize takes no parameters and has no * expected behavior besides initialization, it is convenient to add a call to * nvtxInitialize in NVTX-instrumented applications that need to force earlier * initialization without changing any other code. For example, if an app's * first NVTX call is nvtxDomainCreate, and it is difficult to move that call * earlier because the domain handle must be stored in an object only created * at that point, adding a call to nvtxInitialize at the top of main() will * ensure the later call to nvtxDomainCreate is as fast as possible. * * \version \NVTX_VERSION_3 * * \param reserved - must be zero or NULL. * * @{ */ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup EVENT_ATTRIBUTES Event Attributes * @{ */ /** --------------------------------------------------------------------------- * Payload Types * ------------------------------------------------------------------------- */ typedef enum nvtxPayloadType_t { NVTX_PAYLOAD_UNKNOWN = 0, /**< Color payload is unused. */ NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */ NVTX_PAYLOAD_TYPE_INT64 = 2, /**< A 64 bit signed integer value is used as payload. */ NVTX_PAYLOAD_TYPE_DOUBLE = 3, /**< A 64 bit floating point value is used as payload. */ /* NVTX_VERSION_2 */ NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 = 4, /**< A 32 bit floating point value is used as payload. */ NVTX_PAYLOAD_TYPE_INT32 = 5, /**< A 32 bit floating point value is used as payload. */ NVTX_PAYLOAD_TYPE_FLOAT = 6 /**< A 32 bit floating point value is used as payload. */ } nvtxPayloadType_t; /** \brief Event Attribute Structure. * \anchor EVENT_ATTRIBUTE_STRUCTURE * * This structure is used to describe the attributes of an event. The layout of * the structure is defined by a specific version of the tools extension * library and can change between different versions of the Tools Extension * library. * * \par Initializing the Attributes * * The caller should always perform the following three tasks when using * attributes: *
*     - Zero the structure
*     - Set the version field
*     - Set the size field
* * Zeroing the structure sets all the event attributes types and values * to the default value. * * The version and size field are used by the Tools Extension * implementation to handle multiple versions of the attributes structure. * * It is recommended that the caller use one of the following to methods * to initialize the event attributes structure: * * \par Method 1: Initializing nvtxEventAttributes for future compatibility * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * \endcode * * \par Method 2: Initializing nvtxEventAttributes for a specific version * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = 1; * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1)); * \endcode * * If the caller uses Method 1 it is critical that the entire binary * layout of the structure be configured to 0 so that all fields * are initialized to the default value. * * The caller should either use both NVTX_VERSION and * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values * and a versioned type (Method 2). Using a mix of the two methods * will likely cause either source level incompatibility or binary * incompatibility in the future. * * \par Settings Attribute Types and Values * * * \par Example: * \code * // Initialize * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * * // Configure the Attributes * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFF880000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Example"; * \endcode * * In the example the caller does not have to set the value of * \ref ::nvtxEventAttributes_v2::category or * \ref ::nvtxEventAttributes_v2::payload as these fields were set to * the default value by {0}. * \sa * ::nvtxDomainMarkEx * ::nvtxDomainRangeStartEx * ::nvtxDomainRangePushEx */ typedef struct nvtxEventAttributes_v2 { /** * \brief Version flag of the structure. * * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs * supported in this header file. This can optionally be overridden to * another version of the tools extension library. */ uint16_t version; /** * \brief Size of the structure. * * Needs to be set to the size in bytes of the event attribute * structure used to specify the event. */ uint16_t size; /** * \brief ID of the category the event is assigned to. * * A category is a user-controlled ID that can be used to group * events. The tool may use category IDs to improve filtering or * enable grouping of events in the same category. The functions * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used * to name a category. * * Default Value is 0 */ uint32_t category; /** \brief Color type specified in this attribute structure. * * Defines the color format of the attribute structure's \ref COLOR_FIELD * "color" field. * * Default Value is NVTX_COLOR_UNKNOWN */ int32_t colorType; /* nvtxColorType_t */ /** \brief Color assigned to this event. \anchor COLOR_FIELD * * The color that the tool should use to visualize the event. */ uint32_t color; /** * \brief Payload type specified in this attribute structure. * * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD * "payload" field. * * Default Value is NVTX_PAYLOAD_UNKNOWN */ int32_t payloadType; /* nvtxPayloadType_t */ int32_t reserved0; /** * \brief Payload assigned to this event. 
\anchor PAYLOAD_FIELD * * A numerical value that can be used to annotate an event. The tool could * use the payload data to reconstruct graphs and diagrams. */ union payload_t { uint64_t ullValue; int64_t llValue; double dValue; /* NVTX_VERSION_2 */ uint32_t uiValue; int32_t iValue; float fValue; } payload; /** \brief Message type specified in this attribute structure. * * Defines the message format of the attribute structure's \ref MESSAGE_FIELD * "message" field. * * Default Value is NVTX_MESSAGE_UNKNOWN */ int32_t messageType; /* nvtxMessageType_t */ /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD * * The text message that is attached to an event. */ nvtxMessageValue_t message; } nvtxEventAttributes_v2; typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t; /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup MARKERS_AND_RANGES Markers and Ranges * * See \ref MARKERS_AND_RANGES for more details * * @{ */ /** \name Marker */ /* ------------------------------------------------------------------------- */ /** \brief Marks an instantaneous event in the application. * * A marker can contain a text message or specify additional information * using the event attributes structure. These attributes include a text * message, color, category, and a payload. Each of the attributes is optional * and can only be sent out using the \ref nvtxDomainMarkEx function. * * nvtxDomainMarkEx(NULL, event) is equivalent to calling * nvtxMarkEx(event). * * \param domain - The domain of scoping the category. * \param eventAttrib - The event attribute structure defining the marker's * attribute types and attribute values. * * \sa * ::nvtxMarkEx * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Marks an instantaneous event in the application. * * A marker can contain a text message or specify additional information * using the event attributes structure. These attributes include a text * message, color, category, and a payload. Each of the attributes is optional * and can only be sent out using the \ref nvtxMarkEx function. * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker * or if an attribute is unspecified then a default value will be used. * * \param eventAttrib - The event attribute structure defining the marker's * attribute types and attribute values. * * \par Example: * \code * // zero the structure * nvtxEventAttributes_t eventAttrib = {0}; * // set the version and the size information * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * // configure the attributes. 0 is the default for all attributes. * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFF880000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Example nvtxMarkEx"; * nvtxMarkEx(&eventAttrib); * \endcode * * \sa * ::nvtxDomainMarkEx * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Marks an instantaneous event in the application. * * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a * text message. 
* * \param message - The message associated to this marker event. * * \par Example: * \code * nvtxMarkA("Example nvtxMarkA"); * nvtxMarkW(L"Example nvtxMarkW"); * \endcode * * \sa * ::nvtxDomainMarkEx * ::nvtxMarkEx * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message); NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message); /** @} */ /** \name Process Ranges */ /* ------------------------------------------------------------------------- */ /** \brief Starts a process range in a domain. * * \param domain - The domain of scoping the category. * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The unique ID used to correlate a pair of Start and End events. * * \remarks Ranges defined by Start/End can overlap. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "my range"; * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib); * // ... * nvtxDomainRangeEnd(rangeId); * \endcode * * \sa * ::nvtxDomainRangeEnd * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a process range. * * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The unique ID used to correlate a pair of Start and End events. * * \remarks Ranges defined by Start/End can overlap. * * \par Example: * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.category = 3; * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFF0088FF; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Example Range"; * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib); * // ... * nvtxRangeEnd(rangeId); * \endcode * * \sa * ::nvtxRangeEnd * ::nvtxDomainRangeStartEx * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a process range. * * \param message - The event message associated to this range event. * * \return The unique ID used to correlate a pair of Start and End events. * * \remarks Ranges defined by Start/End can overlap. * * \par Example: * \code * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1"); * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2"); * nvtxRangeEnd(r1); * nvtxRangeEnd(r2); * \endcode * * \sa * ::nvtxRangeEnd * ::nvtxRangeStartEx * ::nvtxDomainRangeStartEx * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message); NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a process range. * * \param domain - The domain * \param id - The correlation ID returned from a nvtxRangeStart call. 
* * \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "my range"; * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib); * // ... * nvtxDomainRangeEnd(rangeId); * \endcode * * \sa * ::nvtxDomainRangeStartEx * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a process range. * * \param id - The correlation ID returned from an nvtxRangeStart call. * * \sa * ::nvtxDomainRangeStartEx * ::nvtxRangeStartEx * ::nvtxRangeStartA * ::nvtxRangeStartW * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id); /** @} */ /** \name Thread Ranges */ /* ------------------------------------------------------------------------- */ /** \brief Starts a nested thread range. * * \param domain - The domain of scoping. * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The 0 based level of range being started. This value is scoped to the domain. * If an error occurs, a negative value is returned. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFFFF0000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Level 0"; * nvtxDomainRangePushEx(domain, &eventAttrib); * * // Re-use eventAttrib * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE; * eventAttrib.message.unicode = L"Level 1"; * nvtxDomainRangePushEx(domain, &eventAttrib); * * nvtxDomainRangePop(domain); //level 1 * nvtxDomainRangePop(domain); //level 0 * \endcode * * \sa * ::nvtxDomainRangePop * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a nested thread range. * * \param eventAttrib - The event attribute structure defining the range's * attribute types and attribute values. * * \return The 0 based level of range being started. This level is per domain. * If an error occurs a negative value is returned. 
* * \par Example: * \code * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.colorType = NVTX_COLOR_ARGB; * eventAttrib.color = 0xFFFF0000; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; * eventAttrib.message.ascii = "Level 0"; * nvtxRangePushEx(&eventAttrib); * * // Re-use eventAttrib * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE; * eventAttrib.message.unicode = L"Level 1"; * nvtxRangePushEx(&eventAttrib); * * nvtxRangePop(); * nvtxRangePop(); * \endcode * * \sa * ::nvtxDomainRangePushEx * ::nvtxRangePop * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Starts a nested thread range. * * \param message - The event message associated to this range event. * * \return The 0 based level of range being started. If an error occurs a * negative value is returned. * * \par Example: * \code * nvtxRangePushA("Level 0"); * nvtxRangePushW(L"Level 1"); * nvtxRangePop(); * nvtxRangePop(); * \endcode * * \sa * ::nvtxDomainRangePushEx * ::nvtxRangePop * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message); NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a nested thread range. * * \return The level of the range being ended. If an error occurs a negative * value is returned on the current thread. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreate("example library"); * nvtxDomainRangePushA(domain, "Level 0"); * nvtxDomainRangePushW(domain, L"Level 1"); * nvtxDomainRangePop(domain); * nvtxDomainRangePop(domain); * \endcode * * \sa * ::nvtxRangePushEx * ::nvtxRangePushA * ::nvtxRangePushW * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Ends a nested thread range. * * \return The level of the range being ended. If an error occurs a negative * value is returned on the current thread. * * \par Example: * \code * nvtxRangePushA("Level 0"); * nvtxRangePushW(L"Level 1"); * nvtxRangePop(); * nvtxRangePop(); * \endcode * * \sa * ::nvtxRangePushEx * ::nvtxRangePushA * ::nvtxRangePushW * * \version \NVTX_VERSION_0 * @{ */ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup RESOURCE_NAMING Resource Naming * * See \ref RESOURCE_NAMING for more details * * @{ */ /* ------------------------------------------------------------------------- */ /** \name Functions for Generic Resource Naming*/ /* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */ /** \cond SHOW_HIDDEN * \brief Resource typing helpers. 
* * Classes are used to make it easy to create a series of resource types * per API without collisions */ #define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX))) #define NVTX_RESOURCE_CLASS_GENERIC 1 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Generic resource type for when a resource class is not available. * * \sa * ::nvtxDomainResourceCreate * * \version \NVTX_VERSION_2 */ typedef enum nvtxResourceGenericType_t { NVTX_RESOURCE_TYPE_UNKNOWN = 0, NVTX_RESOURCE_TYPE_GENERIC_POINTER = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 1), /**< Generic pointer assumed to have no collisions with other pointers. */ NVTX_RESOURCE_TYPE_GENERIC_HANDLE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 2), /**< Generic handle assumed to have no collisions with other handles. */ NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 3), /**< OS native thread identifier. */ NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 4) /**< POSIX pthread identifier. */ } nvtxResourceGenericType_t; /** \brief Resource Attribute Structure. * \anchor RESOURCE_ATTRIBUTE_STRUCTURE * * This structure is used to describe the attributes of a resource. The layout of * the structure is defined by a specific version of the tools extension * library and can change between different versions of the Tools Extension * library. * * \par Initializing the Attributes * * The caller should always perform the following three tasks when using * attributes: *
*     - Zero the structure
*     - Set the version field
*     - Set the size field
* * Zeroing the structure sets all the resource attributes types and values * to the default value. * * The version and size field are used by the Tools Extension * implementation to handle multiple versions of the attributes structure. * * It is recommended that the caller use one of the following to methods * to initialize the event attributes structure: * * \par Method 1: Initializing nvtxEventAttributes for future compatibility * \code * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * \endcode * * \par Method 2: Initializing nvtxEventAttributes for a specific version * \code * nvtxResourceAttributes_v0 attribs = {0}; * attribs.version = 2; * attribs.size = (uint16_t)(sizeof(nvtxResourceAttributes_v0)); * \endcode * * If the caller uses Method 1 it is critical that the entire binary * layout of the structure be configured to 0 so that all fields * are initialized to the default value. * * The caller should either use both NVTX_VERSION and * NVTX_RESOURCE_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values * and a versioned type (Method 2). Using a mix of the two methods * will likely cause either source level incompatibility or binary * incompatibility in the future. * * \par Settings Attribute Types and Values * * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * * // Initialize * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * * // Configure the Attributes * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER; * attribs.identifier.pValue = (const void*)pMutex; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Single thread access to database."; * * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs); * \endcode * * \sa * ::nvtxDomainResourceCreate */ typedef struct nvtxResourceAttributes_v0 { /** * \brief Version flag of the structure. * * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs * supported in this header file. This can optionally be overridden to * another version of the tools extension library. */ uint16_t version; /** * \brief Size of the structure. * * Needs to be set to the size in bytes of this attribute * structure. */ uint16_t size; /** * \brief Identifier type specifies how to interpret the identifier field * * Defines the identifier format of the attribute structure's \ref RESOURCE_IDENTIFIER_FIELD * "identifier" field. * * Default Value is NVTX_RESOURCE_TYPE_UNKNOWN */ int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */ /** * \brief Identifier for the resource. * \anchor RESOURCE_IDENTIFIER_FIELD * * An identifier may be a pointer or a handle to an OS or middleware API object. * The resource type will assist in avoiding collisions where handles values may collide. */ union identifier_t { const void* pValue; uint64_t ullValue; } identifier; /** \brief Message type specified in this attribute structure. * * Defines the message format of the attribute structure's \ref RESOURCE_MESSAGE_FIELD * "message" field. * * Default Value is NVTX_MESSAGE_UNKNOWN */ int32_t messageType; /* nvtxMessageType_t */ /** \brief Message assigned to this attribute structure. \anchor RESOURCE_MESSAGE_FIELD * * The text message that is attached to a resource. 
*/ nvtxMessageValue_t message; } nvtxResourceAttributes_v0; typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t; /* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) ) typedef struct nvtxResourceHandle* nvtxResourceHandle_t; /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Create a resource object to track and associate data with OS and middleware objects * * Allows users to associate an API handle or pointer with a user-provided name. * * * \param domain - Domain to own the resource object * \param attribs - Attributes to be associated with the resource * * \return A handle that represents the newly created resource object. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER; * attribs.identifier.pValue = (const void*)pMutex; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Single thread access to database."; * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs); * \endcode * * \sa * ::nvtxResourceAttributes_t * ::nvtxDomainResourceDestroy * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Destroy a resource object to track and associate data with OS and middleware objects * * Allows users to associate an API handle or pointer with a user-provided name. * * \param resource - Handle to the resource in which to operate. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain"); * nvtxResourceAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER; * attribs.identifier.pValue = (const void*)pMutex; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Single thread access to database."; * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs); * nvtxDomainResourceDestroy(handle); * \endcode * * \sa * ::nvtxDomainResourceCreate * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource); /** @} */ /** \name Functions for NVTX Category Naming*/ /* ------------------------------------------------------------------------- */ /** * \brief Annotate an NVTX category used within a domain. * * Categories are used to group sets of events. Each category is identified * through a unique ID and that ID is passed into any of the marker/range * events to assign that event to a specific category. The nvtxDomainNameCategory * function calls allow the user to assign a name to a category ID that is * specific to the domain. * * nvtxDomainNameCategory(NULL, category, name) is equivalent to calling * nvtxNameCategory(category, name). * * \param domain - The domain of scoping the category. * \param category - The category ID to name. * \param name - The name of the category. * * \remarks The category names are tracked per domain. 
* * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("example"); * nvtxDomainNameCategoryA(domain, 1, "Memory Allocation"); * nvtxDomainNameCategoryW(domain, 2, L"Memory Transfer"); * \endcode * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name); NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name); /** @} */ /** \brief Annotate an NVTX category. * * Categories are used to group sets of events. Each category is identified * through a unique ID and that ID is passed into any of the marker/range * events to assign that event to a specific category. The nvtxNameCategory * function calls allow the user to assign a name to a category ID. * * \param category - The category ID to name. * \param name - The name of the category. * * \remarks The category names are tracked per process. * * \par Example: * \code * nvtxNameCategory(1, "Memory Allocation"); * nvtxNameCategory(2, "Memory Transfer"); * nvtxNameCategory(3, "Memory Object Lifetime"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name); /** @} */ /** \name Functions for OS Threads Naming*/ /* ------------------------------------------------------------------------- */ /** \brief Annotate an OS thread. * * Allows the user to name an active thread of the current process. If an * invalid thread ID is provided or a thread ID from a different process is * used the behavior of the tool is implementation dependent. * * Tools expect thread ID to be a number that uniquely identifies the thread * at the time of the call. Note that a thread's ID can be reused after * it is destroyed. Tools may choose how to handle aliasing of thread IDs. * * POSIX pthread_t type returned by pthread_self() may not comply with these * expectations. Please use OS-specific thread ID instead of pthread_t. * * The thread name is associated to the default domain. To support domains * use resource objects via ::nvtxDomainResourceCreate. * * \param threadId - The ID of the thread to name. * \param name - The name of the thread. 
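 *
 * \par Example (editor's addition; assumes Linux with glibc 2.30 or newer,
 * where gettid() is exposed by unistd.h; older systems can use
 * syscall(SYS_gettid) from sys/syscall.h instead):
 * \code
 * #include <unistd.h>
 * nvtxNameOsThreadA((uint32_t)gettid(), "Worker thread");
 * \endcode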
* * \par Examples: * MS Windows: * \code * #include * nvtxNameOsThread(GetCurrentThreadId(), "Current thread"); * nvtxNameOsThread(GetThreadId(SomeThreadHandle), "Other thread"); * \endcode * * Android: * \code * #include * nvtxNameOsThreadA(gettid(), "Current thread"); * nvtxNameOsThreadA(getpid(), "Main thread"); * \endcode * * Linux: * \code * #include * nvtxNameOsThreadA(syscall(SYS_gettid), "Current thread"); * \endcode * \code * #include * nvtxNameOsThreadA(getpid(), "Main thread"); * \endcode * * OS X: * \code * #include * nvtxNameOsThreadA(syscall(SYS_thread_selfid), "Current thread"); * \endcode * \code * #include * __uint64_t id; * pthread_threadid_np(pthread_self(), &id); * nvtxNameOsThreadA(id, "Current thread"); * pthread_threadid_np(somePThreadId, &id); * nvtxNameOsThreadA(id, "Other thread"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup STRING_REGISTRATION String Registration * * Registered strings are intended to increase performance by lowering instrumentation * overhead. String may be registered once and the handle may be passed in place of * a string where an the APIs may allow. * * See \ref STRING_REGISTRATION for more details * * @{ */ /* ------------------------------------------------------------------------- */ /** \brief Register a string. * Registers an immutable string with NVTX. Once registered the pointer used * to register the domain name can be used in nvtxEventAttributes_t * \ref MESSAGE_FIELD. This allows NVTX implementation to skip copying the * contents of the message on each event invocation. * * String registration is an optimization. It is recommended to use string * registration if the string will be passed to an event many times. * * String are not unregistered, except that by unregistering the entire domain * * \param domain - Domain handle. If NULL then the global domain is used. * \param string - A unique pointer to a sequence of characters. * * \return A handle representing the registered string. * * \par Example: * \code * nvtxDomainCreateA("com.nvidia.nvtx.example"); * nvtxStringHandle_t message = nvtxDomainRegisterStringA(domain, "registered string"); * nvtxEventAttributes_t eventAttrib = {0}; * eventAttrib.version = NVTX_VERSION; * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib.messageType = NVTX_MESSAGE_TYPE_REGISTERED; * eventAttrib.message.registered = message; * \endcode * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string); NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \defgroup DOMAINS Domains * * Domains are used to group events to a developer defined scope. Middleware * vendors may also scope their own events to avoid collisions with the * the application developer's events, so that the application developer may * inspect both parts and easily differentiate or filter them. 
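 *
 * For instance (editor's sketch; the domain names are illustrative), a
 * middleware library and the application can each create a domain of their
 * own and scope their events to it:
 * \code
 * nvtxDomainHandle_t libDomain = nvtxDomainCreateA("com.example.middleware");
 * nvtxDomainHandle_t appDomain = nvtxDomainCreateA("com.example.app");
 * \endcode
 *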
By default, * all events are scoped to a global domain, which is used where NULL is provided or when * using APIs provided by versions of NVTX below v2. * * Domains are typically intended to be long-lived objects that logically * separate the events of large modules, such as middleware libraries, from * each other and from the main application. * * See \ref DOMAINS for more details. * * @{ */ /* ------------------------------------------------------------------------- */ /** \brief Register an NVTX domain. * * Domains are used to scope annotations. All NVTX_VERSION_0 and NVTX_VERSION_1 * annotations are scoped to the global domain. The function nvtxDomainCreate * creates a new named domain. * * Each domain maintains its own nvtxRangePush and nvtxRangePop stack. * * \param name - A unique string representing the domain. * * \return A handle representing the domain. * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example"); * * nvtxMarkA("nvtxMarkA to global domain"); * * nvtxEventAttributes_t eventAttrib1 = {0}; * eventAttrib1.version = NVTX_VERSION; * eventAttrib1.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib1.message.ascii = "nvtxDomainMarkEx to global domain"; * nvtxDomainMarkEx(NULL, &eventAttrib1); * * nvtxEventAttributes_t eventAttrib2 = {0}; * eventAttrib2.version = NVTX_VERSION; * eventAttrib2.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; * eventAttrib2.message.ascii = "nvtxDomainMarkEx to com.nvidia.nvtx.example"; * nvtxDomainMarkEx(domain, &eventAttrib2); * nvtxDomainDestroy(domain); * \endcode * * \sa * ::nvtxDomainDestroy * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* name); NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Unregister an NVTX domain. * * Unregisters the domain handle and frees all domain-specific resources.
* * \param domain - the domain handle * * \par Example: * \code * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example"); * nvtxDomainDestroy(domain); * \endcode * * \sa * ::nvtxDomainCreateA * ::nvtxDomainCreateW * * \version \NVTX_VERSION_2 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain); /** @} */ /** @} */ /*END defgroup*/ /* ========================================================================= */ /** \cond SHOW_HIDDEN */ #ifdef UNICODE #define nvtxMark nvtxMarkW #define nvtxRangeStart nvtxRangeStartW #define nvtxRangePush nvtxRangePushW #define nvtxNameCategory nvtxNameCategoryW #define nvtxNameOsThread nvtxNameOsThreadW /* NVTX_VERSION_2 */ #define nvtxDomainCreate nvtxDomainCreateW #define nvtxDomainRegisterString nvtxDomainRegisterStringW #define nvtxDomainNameCategory nvtxDomainNameCategoryW #else #define nvtxMark nvtxMarkA #define nvtxRangeStart nvtxRangeStartA #define nvtxRangePush nvtxRangePushA #define nvtxNameCategory nvtxNameCategoryA #define nvtxNameOsThread nvtxNameOsThreadA /* NVTX_VERSION_2 */ #define nvtxDomainCreate nvtxDomainCreateA #define nvtxDomainRegisterString nvtxDomainRegisterStringA #define nvtxDomainNameCategory nvtxDomainNameCategoryA #endif /** \endcond */ #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */ #include "nvtxDetail/nvtxTypes.h" #ifndef NVTX_NO_IMPL #include "nvtxDetail/nvtxImpl.h" #endif /*NVTX_NO_IMPL*/ #undef NVTX_IMPL_GUARD #endif /* !defined(NVTX_VERSION) */ nccl-2.18.3-1/src/include/nvtx3/nvToolsExtCuda.h000066400000000000000000000110111444201535000212410ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #include "cuda.h" #ifndef NVTOOLSEXT_CUDA_V3 #define NVTOOLSEXT_CUDA_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* ========================================================================= */ /** \name Functions for CUDA Resource Naming */ /** \addtogroup RESOURCE_NAMING * \section RESOURCE_NAMING_CUDA CUDA Resource Naming * * This section covers the API functions that allow to annotate CUDA resources * with user-provided names. * * @{ */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_CUDA 4 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Resource types for CUDA */ typedef enum nvtxResourceCUDAType_t { NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */ NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */ NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */ NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */ } nvtxResourceCUDAType_t; /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA device. * * Allows the user to associate a CUDA device with a user-provided name. * * \param device - The handle of the CUDA device to name. * \param name - The name of the CUDA device. 
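 *
 * \par Example (editor's sketch; assumes the driver API was initialized with
 * cuInit(0) and that device ordinal 0 exists):
 * \code
 * CUdevice dev;
 * cuDeviceGet(&dev, 0);
 * nvtxNameCuDeviceA(dev, "GPU 0");
 * \endcode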
* * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA context. * * Allows the user to associate a CUDA context with a user-provided name. * * \param context - The handle of the CUDA context to name. * \param name - The name of the CUDA context. * * \par Example: * \code * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice ); * if ( CUDA_SUCCESS != status ) * goto Error; * nvtxNameCuContext(cuContext, "CTX_NAME"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA stream. * * Allows the user to associate a CUDA stream with a user-provided name. * * \param stream - The handle of the CUDA stream to name. * \param name - The name of the CUDA stream. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA event. * * Allows the user to associate a CUDA event with a user-provided name. * * \param event - The handle of the CUDA event to name. * \param name - The name of the CUDA event. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name); /** @} */ /** @} */ /* END RESOURCE_NAMING */ /* ========================================================================= */ #ifdef UNICODE #define nvtxNameCuDevice nvtxNameCuDeviceW #define nvtxNameCuContext nvtxNameCuContextW #define nvtxNameCuStream nvtxNameCuStreamW #define nvtxNameCuEvent nvtxNameCuEventW #else #define nvtxNameCuDevice nvtxNameCuDeviceA #define nvtxNameCuContext nvtxNameCuContextA #define nvtxNameCuStream nvtxNameCuStreamA #define nvtxNameCuEvent nvtxNameCuEventA #endif #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */ #include "nvtxDetail/nvtxImplCuda_v3.h" #undef NVTX_IMPL_GUARD_CUDA #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_CUDA_V3 */ nccl-2.18.3-1/src/include/nvtx3/nvToolsExtCudaRt.h000066400000000000000000000073361444201535000215660ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. 
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #include "cuda.h" #include "driver_types.h" #ifndef NVTOOLSEXT_CUDART_V3 #define NVTOOLSEXT_CUDART_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* ========================================================================= */ /** \name Functions for CUDA Resource Naming */ /** \addtogroup RESOURCE_NAMING * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming * * This section covers the API functions that allow to annotate CUDA resources * with user-provided names. * * @{ */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_CUDART 5 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Resource types for CUDART */ typedef enum nvtxResourceCUDARTType_t { NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */ NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */ NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */ } nvtxResourceCUDARTType_t; /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA device. * * Allows the user to associate a CUDA device with a user-provided name. * * \param device - The id of the CUDA device to name. * \param name - The name of the CUDA device. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA stream. * * Allows the user to associate a CUDA stream with a user-provided name. * * \param stream - The handle of the CUDA stream to name. * \param name - The name of the CUDA stream. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates a CUDA event. * * Allows the user to associate a CUDA event with a user-provided name. * * \param event - The handle of the CUDA event to name. * \param name - The name of the CUDA event. 
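 *
 * \par Example (editor's sketch; error checking omitted):
 * \code
 * cudaEvent_t evt;
 * cudaEventCreate(&evt);
 * nvtxNameCudaEventA(evt, "H2D copy done");
 * \endcode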
* * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name); /** @} */ /** @} */ /* END RESOURCE_NAMING */ /* ========================================================================= */ #ifdef UNICODE #define nvtxNameCudaDevice nvtxNameCudaDeviceW #define nvtxNameCudaStream nvtxNameCudaStreamW #define nvtxNameCudaEvent nvtxNameCudaEventW #else #define nvtxNameCudaDevice nvtxNameCudaDeviceA #define nvtxNameCudaStream nvtxNameCudaStreamA #define nvtxNameCudaEvent nvtxNameCudaEventA #endif #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */ #include "nvtxDetail/nvtxImplCudaRt_v3.h" #undef NVTX_IMPL_GUARD_CUDART #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_CUDART_V3 */ nccl-2.18.3-1/src/include/nvtx3/nvToolsExtOpenCL.h000066400000000000000000000155001444201535000215140ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #include #ifndef NVTOOLSEXT_OPENCL_V3 #define NVTOOLSEXT_OPENCL_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* ========================================================================= */ /** \name Functions for OpenCL Resource Naming */ /** \addtogroup RESOURCE_NAMING * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming * * This section covers the API functions that allow to annotate OpenCL resources * with user-provided names. * * @{ */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_OPENCL 6 /** \endcond */ /* ------------------------------------------------------------------------- */ /** \brief Resource types for OpenCL */ typedef enum nvtxResourceOpenCLType_t { NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1), NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2), NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3), NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4), NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5), NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6), NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7), } nvtxResourceOpenCLType_t; /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL device. * * Allows to associate an OpenCL device with a user-provided name. * * \param device - The handle of the OpenCL device to name. * \param name - The name of the OpenCL device. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL context. * * Allows to associate an OpenCL context with a user-provided name. * * \param context - The handle of the OpenCL context to name. 
* \param name - The name of the OpenCL context. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL command queue. * * Allows to associate an OpenCL command queue with a user-provided name. * * \param command_queue - The handle of the OpenCL command queue to name. * \param name - The name of the OpenCL command queue. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL memory object. * * Allows to associate an OpenCL memory object with a user-provided name. * * \param memobj - The handle of the OpenCL memory object to name. * \param name - The name of the OpenCL memory object. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL sampler. * * Allows to associate an OpenCL sampler with a user-provided name. * * \param sampler - The handle of the OpenCL sampler to name. * \param name - The name of the OpenCL sampler. * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL program. * * Allows to associate an OpenCL program with a user-provided name. * * \param program - The handle of the OpenCL program to name. * \param name - The name of the OpenCL program. * * \code * cpProgram = clCreateProgramWithSource(cxGPUContext, 1, * (const char **) &cSourceCL, &program_length, &ciErrNum); * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME"); * \endcode * * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name); /** @} */ /* ------------------------------------------------------------------------- */ /** \brief Annotates an OpenCL event. * * Allows to associate an OpenCL event with a user-provided name. * * \param evnt - The handle of the OpenCL event to name. * \param name - The name of the OpenCL event. 
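 *
 * \par Example (editor's sketch; `queue`, `kernel` and `globalSize` are
 * assumed to be valid OpenCL objects and sizes):
 * \code
 * cl_event evt;
 * clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, &evt);
 * nvtxNameClEventA(evt, "Kernel completion");
 * \endcode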
* * \version \NVTX_VERSION_1 * @{ */ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name); NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name); /** @} */ /** @} */ /* END RESOURCE_NAMING */ /* ========================================================================= */ #ifdef UNICODE #define nvtxNameClDevice nvtxNameClDeviceW #define nvtxNameClContext nvtxNameClContextW #define nvtxNameClCommandQueue nvtxNameClCommandQueueW #define nvtxNameClMemObject nvtxNameClMemObjectW #define nvtxNameClSampler nvtxNameClSamplerW #define nvtxNameClProgram nvtxNameClProgramW #define nvtxNameClEvent nvtxNameClEventW #else #define nvtxNameClDevice nvtxNameClDeviceA #define nvtxNameClContext nvtxNameClContextA #define nvtxNameClCommandQueue nvtxNameClCommandQueueA #define nvtxNameClMemObject nvtxNameClMemObjectA #define nvtxNameClSampler nvtxNameClSamplerA #define nvtxNameClProgram nvtxNameClProgramA #define nvtxNameClEvent nvtxNameClEventA #endif #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */ #include "nvtxDetail/nvtxImplOpenCL_v3.h" #undef NVTX_IMPL_GUARD_OPENCL #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_OPENCL_V3 */ nccl-2.18.3-1/src/include/nvtx3/nvToolsExtPayload.h000066400000000000000000000636741444201535000220040ustar00rootroot00000000000000/* * Copyright 2021-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #ifndef NVTOOLSEXT_PAYLOAD_H #define NVTOOLSEXT_PAYLOAD_H #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** * \brief A compatibility ID value used in initialization to identify version * differences. */ #define NVTX_EXT_COMPATID_PAYLOAD 0x0103 /** * \brief This module ID identifies the payload extension. It has to be unique * among the extension modules. */ #define NVTX_EXT_MODULEID_PAYLOAD 2 /** * \brief Additional values for the enum @ref nvtxPayloadType_t */ #define NVTX_PAYLOAD_TYPE_BINARY ((int32_t)0xDFBD0009) /** --------------------------------------------------------------------------- * Payload schema entry flags. * ------------------------------------------------------------------------- */ #define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0 /** * Absolute pointer into a payload (entry) of the same event. */ #define NVTX_PAYLOAD_ENTRY_FLAG_POINTER (1 << 1) /** * Offset from base address of the payload. */ #define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_BASE (1 << 2) /** * Offset from the end of this payload entry. */ #define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_HERE (1 << 3) /** * The value is an array with fixed length, set with the field `arrayLength`. */ #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE (1 << 4) /** * The value is a zero-/null-terminated array. */ #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4) /** * \brief A single or multi-dimensional array of variable length. * * The field `arrayLength` contains the index of the schema entry that holds the * length(s). If the other field points to a scalar entry then this will be the * 1D array. If the other field points to a FIXED_SIZE array, then the number of * dimensions is defined with the registration of the scheme. If the other field * is ZERO_TERMINATED, the array the dimensions can be determined at runtime. 
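 *
 * A minimal sketch (editor's addition; entry 0 holds the element count of the
 * double array described by entry 1, which references it by setting its
 * `arrayOrUnionDetail` field to 0):
 * \code
 * nvtxPayloadSchemaEntry_t entries[] = {
 *     {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "count"},
 *     {NVTX_PAYLOAD_ENTRY_FLAG_POINTER | NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX,
 *      NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE, "values", NULL, 0}
 * };
 * \endcode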
*/ #define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX (3 << 4) /** * A tool may not support deep copy and just ignore this flag. * See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details. */ #define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY (1 << 9) /** * The entry specifies the message in a deferred event. The entry type can be * any string type. The flag is ignored for schemas that are not flagged with * `NVTX_PAYLOAD_SCHEMA_FLAG_RANGE*` or `NVTX_PAYLOAD_SCHEMA_FLAG_MARK`. */ #define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE (1 << 10) /** * @note The ‘array’ flags assume that the array is embedded. Otherwise, * @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some * combinations may be invalid based on the `NVTX_PAYLOAD_SCHEMA_TYPE_*` this * entry is enclosed. For instance, variable length embedded arrays are valid * within @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC but invalid with * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC. See `NVTX_PAYLOAD_SCHEMA_TYPE_*` for * additional details. */ /* Helper macro to check if an entry represents an array. */ #define NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY (\ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE | \ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \ NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX) /** --------------------------------------------------------------------------- * Types of entries in a payload schema. * ------------------------------------------------------------------------- */ /** * @note Several of the predefined types contain the size (in bits) in their * names. For some data types the size (in bytes) is not fixed and may differ * for different platforms/operating systems/compilers. To provide portability, * an array of sizes (in bytes) for type 1 to 28 ( @ref * NVTX_PAYLOAD_ENTRY_TYPE_CHAR to @ref NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE) * is passed to the NVTX extension initialization function * @ref InitializeInjectionNvtxExtension via the `extInfo` field of * @ref nvtxExtModuleInfo_t. */ #define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0 /** * Basic integer types. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR 1 #define NVTX_PAYLOAD_ENTRY_TYPE_UCHAR 2 #define NVTX_PAYLOAD_ENTRY_TYPE_SHORT 3 #define NVTX_PAYLOAD_ENTRY_TYPE_USHORT 4 #define NVTX_PAYLOAD_ENTRY_TYPE_INT 5 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT 6 #define NVTX_PAYLOAD_ENTRY_TYPE_LONG 7 #define NVTX_PAYLOAD_ENTRY_TYPE_ULONG 8 #define NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG 9 #define NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG 10 /** * Integer types with explicit size. */ #define NVTX_PAYLOAD_ENTRY_TYPE_INT8 11 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT8 12 #define NVTX_PAYLOAD_ENTRY_TYPE_INT16 13 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT16 14 #define NVTX_PAYLOAD_ENTRY_TYPE_INT32 15 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT32 16 #define NVTX_PAYLOAD_ENTRY_TYPE_INT64 17 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT64 18 /** * C floating point types */ #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT 19 #define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE 20 #define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21 /** * Size type (`size_t`) */ #define NVTX_PAYLOAD_ENTRY_TYPE_SIZE 22 /** * Any address, e.g. `void*`. If the pointer type matters, use the flag @ref * NVTX_PAYLOAD_ENTRY_FLAG_POINTER and the respective type instead. */ #define NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS 23 /** * Special character types. 
*/ #define NVTX_PAYLOAD_ENTRY_TYPE_WCHAR 24 /* wide character (since C90) */ #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 25 /* since C2x and C++20 */ #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 26 #define NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 27 /** * There is type size and alignment information for all previous types. */ #define NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE (NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 + 1) /** * Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed. * Typically a tool will display this as hex or binary. */ #define NVTX_PAYLOAD_ENTRY_TYPE_BYTE 32 /** * These types do not have standardized equivalents. It is assumed that the * number at the end corresponds to the bits used to store the value and that * the alignment corresponds to standardized types of the same size. * A tool may not support these types. */ #define NVTX_PAYLOAD_ENTRY_TYPE_INT128 33 #define NVTX_PAYLOAD_ENTRY_TYPE_UINT128 34 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT16 42 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT32 43 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT64 44 #define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT128 45 #define NVTX_PAYLOAD_ENTRY_TYPE_BF16 50 #define NVTX_PAYLOAD_ENTRY_TYPE_TF32 52 /** * These types are normalized numbers stored in integers. UNORMs represent 0.0 * to 1.0 and SNORMs represent -1.0 to 1.0. The number after represents the * number of integer bits. Alignment is take from equivalent types INT# matching * to SNORM# and UINT# matching to UNORM#. */ #define NVTX_PAYLOAD_ENTRY_TYPE_SNORM8 61 #define NVTX_PAYLOAD_ENTRY_TYPE_UNORM8 62 #define NVTX_PAYLOAD_ENTRY_TYPE_SNORM16 63 #define NVTX_PAYLOAD_ENTRY_TYPE_UNORM16 64 #define NVTX_PAYLOAD_ENTRY_TYPE_SNORM32 65 #define NVTX_PAYLOAD_ENTRY_TYPE_UNORM32 66 #define NVTX_PAYLOAD_ENTRY_TYPE_SNORM64 67 #define NVTX_PAYLOAD_ENTRY_TYPE_UNORM64 68 /** * String types. * * If `arrayOrUnionDetail` is greater than `0`, the entry is a fixed-size string * with the provided length. * * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is ignored for string types. It * just specifies once more that the entry is a fixed-size string. * * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` indicates a * zero-terminated string. If `arrayOrUnionDetail` is greater than `0`, a zero- * terminated array of fixed-size strings is assumed. * * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies the * entry index of the entry which contains the string length. It is not possible * to describe a variable length array of strings. */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING 75 /* `char*`, system LOCALE */ #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8 76 #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF16 77 #define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78 /** * @ref nvtxStringHandle_t returned by @ref nvtxDomainRegisterString */ #define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80 /** * Entry types to be used in deferred events. Data types are as defined by * NVTXv3 core: category -> uint32_t, color -> uint32_t, color type -> int32_t. */ #define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_CATEGORY 90 #define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLORTYPE 91 #define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLOR 92 /** * This type marks the union selector member (entry index) in schemas used by * a union with internal internal selector. * See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR. 
*/ #define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100 /** * Timestamp types occupy the range from 128 to 255 */ #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP64 128 /* data type is uint64_t */ /** * CPU timestamp sources. * \todo All 64 bits? */ #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC 129 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC_NONVIRTUALIZED 130 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME 131 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME_COARSE 132 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC 133 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_RAW 134 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_COARSE 135 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_BOOTTIME 136 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_PROCESS_CPUTIME_ID 137 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_THREAD_CPUTIME_ID 138 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_QPC 160 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFT 161 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFTP 162 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIME 163 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_CLOCK 164 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIMESPEC_GET 165 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_STEADY_CLOCK 166 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_HIGH_RESOLUTION_CLOCK 167 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_SYSTEM_CLOCK 168 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_UTC_CLOCK 169 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_TAI_CLOCK 170 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_GPS_CLOCK 171 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_FILE_CLOCK 172 /** * \brief GPU timestamp sources. */ #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_GLOBALTIMER 192 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK 193 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK64 194 #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_CUPTI 195 /** * The timestamp was provided by the NVTX handler’s timestamp routine. */ #define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_TOOL_PROVIDED 224 /** * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that * the payload is a blob of memory which other payload entries may point into. * A tool will not expose this payload directly. */ #define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022 /** * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that * the payload is a blob which can be shown with an arbitrary data viewer. */ #define NVTX_TYPE_PAYLOAD_SCHEMA_RAW 1023 /* Custom (static) schema IDs. */ #define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START (1 << 24) /* Dynamic schema IDs (generated by the tool) start here. */ #define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296 // 1 << 32 /** * \brief Size and alignment information for predefined payload entry types. * * The struct contains the size and the alignment size in bytes. A respective * array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX * client/handler. The type (ID) is used as index into this array. */ typedef struct nvtxPayloadEntryTypeInfo_t { uint16_t size; uint16_t align; } nvtxPayloadEntryTypeInfo_t; /** * \brief Entry in a schema. * * A payload schema consists of an array of payload schema entries. It is * registered with @ref nvtxPayloadSchemaRegister. `flag` can be set to `0` for * simple values, 'type' is the only "required" field. 
If not set explicitly, * all other fields are zero-initialized, which means that the entry has no name * and the offset is determined based on self-alignment rules. * * Example schema: * nvtxPayloadSchemaEntry_t desc[] = { * {0, NVTX_EXT_PAYLOAD_TYPE_UINT8, "one byte"}, * {0, NVTX_EXT_PAYLOAD_TYPE_INT32, "four bytes"} * }; */ typedef struct nvtxPayloadSchemaEntry_t { /** * \brief Flags to augment the basic type. * * This field allows additional properties of the payload entry to be * specified. Valid values are `NVTX_PAYLOAD_ENTRY_FLAG_*`. */ uint64_t flags; /** * \brief Predefined payload schema entry type or ID of a registered payload * schema. */ uint64_t type; /** * \brief Name of the payload entry. (Optional) * * Providing a name is useful to give a meaning to the associated value. */ const char* name; /** * \brief Description of the payload entry. (Optional) */ const char* description; /** * \brief String or array length or union selector for union types. * * If @ref type is a C string type, this defines the length of the string. * * If @ref flags specify that the entry is an array, this field defines the * length of the array. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more * details. * * If @ref type implies that the entry is a union with schema type * @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION (external selection of the union * member), this field contains the index (starting with 0) to an entry of * integer type in the same schema. The associated field contains the * selected union member. * * @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not * supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can * be used instead. */ uint64_t arrayOrUnionDetail; /** * \brief Offset in the binary payload data (in bytes). * * This field specifies the byte offset from the base address of the actual * binary data (blob) to the data of this entry. * * This is an optional field, but it is recommended to specify this field to * avoid issues in the automatic detection of the offset by a tool/handler. */ uint64_t offset; /** * Semantics are not yet defined. */ void* semantics; /** * Reserved for future use. Do not use it! */ void* reserved; } nvtxPayloadSchemaEntry_t; /** * \brief Binary payload data, size and decoding information. * * An array of nvtxPayloadData_t is passed to the NVTX event attribute payload * member. To attach a single payload the macro @ref NVTX_EXT_PAYLOAD_SET_ATTR * can be used. */ typedef struct nvtxPayloadData_t { /** * The schema ID, which defines the layout of the binary data. */ uint64_t schemaId; /** * Size of the binary payload (blob) in bytes. */ size_t size; /** * Pointer to the binary payload data. */ const void* payload; } nvtxPayloadData_t; /* Helper macros for safe double-cast of pointer to uint64_t value */ #ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE # ifdef __cplusplus # define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \ static_cast(reinterpret_cast(p)) # else #define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)p) # endif #endif #define NVTX_PAYLOAD_CONCAT2(a,b) a##b #define NVTX_PAYLOAD_CONCAT(a,b) NVTX_PAYLOAD_CONCAT2(a,b) #define NVTX_DATA_VAR NVTX_PAYLOAD_CONCAT(nvtxDFDB,__LINE__) /** * \brief Helper macro to attach a single payload to an NVTX event attribute. * * @note The NVTX push, start or mark operation must not be in the same or a * nested scope. 
*/ #define NVTX_PAYLOAD_EVTATTR_SET(EVTATTR, SCHEMA_ID, PAYLOAD_ADDR, SIZE) \ nvtxPayloadData_t NVTX_DATA_VAR[] = {{SCHEMA_ID, SIZE, PAYLOAD_ADDR}}; \ (EVTATTR).payload.ullValue = \ NVTX_POINTER_AS_PAYLOAD_ULLVALUE(NVTX_DATA_VAR); \ (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \ (EVTATTR).reserved0 = 1; /** * \brief Helper macro to attach multiple payloads to an NVTX event attribute. * * The payload data array (`nvtxPayloadData_t`) is passed as first argument to * this macro. */ #define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(EVTATTR, PAYLOADS) \ (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \ (EVTATTR).reserved0 = sizeof(PAYLOADS)/sizeof(nvtxPayloadData_t); \ (EVTATTR).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(PAYLOADS); /** * \brief The payload schema type. * * A schema can be either of these types. */ enum nvtxPayloadSchemaType { NVTX_PAYLOAD_SCHEMA_TYPE_INVALID = 0, NVTX_PAYLOAD_SCHEMA_TYPE_STATIC = 1, NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC = 2, NVTX_PAYLOAD_SCHEMA_TYPE_UNION = 3, NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR = 4 }; /** * \brief Flags for static and dynamic schemas. */ enum nvtxPayloadSchemaFlags { NVTX_PAYLOAD_SCHEMA_FLAG_NONE = 0, /** * This flag indicates that a schema and the corresponding payloads can * contain fields which require a deep copy. */ NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY = (1 << 1), /** * This flag indicates that a schema and the corresponding payloads can * be referenced by another payload of the same event. */ NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED = (1 << 2), /** * The schema describes a deferred event/marker. Such a schema requires one * timestamp entry and one string entry with the flag * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be * optionally specified with the respective entry types. The deferred event * can contain a binary payload itself by using a custom schema ID as type * its schema description. Multiple occurrences of the same event can be * described by specifying an array timestamps. */ NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_EVENT = (1 << 3), /** * The schema describes a deferred event/marker. Such a schema requires * one start timestamp, one end timestamp and one string entry with the flag * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be * optionally specified with the respective entry types. The deferred range * can contain a binary payload itself by using a custom schema ID as type * its schema description. * * Timestamps can be provided in different ways: * - A single range has two timestamp entries with the first (smaller entry * index) being used as the start/push timestamp. * - If the range schema contains one array of timestamps, the tool assumes * that the array contains alternating start and end timestamps. * - If two timestamp arrays are specified the first entry (with the * smaller entry index) is assumed to contain the start timestamps. Both * arrays have to be of the same size. */ NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_RANGE = (2 << 3) }; /** * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be * specified via setting the field `fieldMask`. 
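 *
 * A minimal end-to-end registration sketch (editor's addition; `domain` is
 * assumed to be a handle from nvtxDomainCreateA and the payload layout is
 * purely illustrative):
 * \code
 * typedef struct { uint32_t id; float value; } ExamplePayload;
 * nvtxPayloadSchemaEntry_t entries[] = {
 *     {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT32, "id"},
 *     {0, NVTX_PAYLOAD_ENTRY_TYPE_FLOAT, "value"}
 * };
 * nvtxPayloadSchemaAttr_t attr = {0};
 * attr.fieldMask = NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES
 *                | NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE;
 * attr.type = NVTX_PAYLOAD_SCHEMA_TYPE_STATIC;
 * attr.entries = entries;
 * attr.numEntries = 2;
 * attr.payloadStaticSize = sizeof(ExamplePayload);
 * uint64_t schemaId = nvtxPayloadSchemaRegister(domain, &attr);
 * \endcode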
*/ #define NVTX_PAYLOAD_SCHEMA_ATTR_NAME (1 << 1) #define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE (1 << 2) #define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS (1 << 3) #define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES (1 << 4) #define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5) #define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6) #define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT (1 << 7) #define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID (1 << 8) /** * NVTX payload schema attributes. */ typedef struct nvtxPayloadSchemaAttr_t { /** * \brief Mask of valid fields in this structure. * * The values from `enum nvtxPayloadSchemaAttributes` have to be used. */ uint64_t fieldMask; /** * \brief Name of the payload schema. (Optional) */ const char* name; /** * \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD * * A value from `enum nvtxPayloadSchemaType` has to be used. */ uint64_t type; /** * \brief Payload schema flags. (Optional) * * Flags defined in `enum nvtxPayloadSchemaFlags` can be used to set * additional properties of the schema. */ uint64_t flags; /** * \brief Entries of a payload schema. (Mandatory) \anchor ENTRIES_FIELD * * This field is a pointer to an array of schema entries, each describing a * field in a data structure, e.g. in a C struct or union. */ const nvtxPayloadSchemaEntry_t* entries; /** * \brief Number of entries in the payload schema. (Mandatory) * * Number of entries in the array of payload entries \ref ENTRIES_FIELD. */ size_t numEntries; /** * \brief The binary payload size in bytes for static payload schemas. * * If \ref PAYLOAD_TYPE_FIELD is @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC this * value is ignored. If this field is not specified for a schema of type * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, the size can be automatically * determined by a tool. */ size_t payloadStaticSize; /** * \brief The byte alignment for packed structures. * * If not specified, this field defaults to `0`, which means that the fields * in the data structure are not packed and natural alignment rules can be * applied. */ size_t packAlign; /* Static/custom schema ID must be >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; } nvtxPayloadSchemaAttr_t; /** * \brief Register a payload schema. * * @param domain NVTX domain handle. * @param attr NVTX payload schema attributes. */ NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister( nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr); /** * \brief Enumeration entry. * * Since the value of an enum entry might not be meaningful for the analysis, * a tool can show the name of enum entry instead. * * @note EXPERIMENTAL */ typedef struct nvtxPayloadEnum_t { /** * Name of the enum value. */ const char* name; /** * Value of the enum entry. */ uint64_t value; /** * Indicates that this entry sets a specific set of bits, which can be used * to easily define bitsets. */ int8_t isFlag; } nvtxPayloadEnum_t; /** * The values are used to set the field `fieldMask` and specify which fields in * `nvtxPayloadEnumAttr_t` are set. */ #define NVTX_PAYLOAD_ENUM_ATTR_NAME (1 << 1) #define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES (1 << 2) #define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3) #define NVTX_PAYLOAD_ENUM_ATTR_SIZE (1 << 4) #define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID (1 << 5) /** * NVTX payload enumeration type attributes. */ typedef struct nvtxPayloadEnumAttr_t { /** * Mask of valid fields in this struct. * The values from `enum nvtxPayloadSchemaAttributes` have to be used. 
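 *
 * For example (editor's sketch; `attr` is a hypothetical nvtxPayloadEnumAttr_t
 * instance), a fully specified enum registration combines:
 * \code
 * attr.fieldMask = NVTX_PAYLOAD_ENUM_ATTR_NAME | NVTX_PAYLOAD_ENUM_ATTR_ENTRIES
 *                | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_SIZE;
 * \endcode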
*/ uint64_t fieldMask; /** * Name of the enum. (Optional) */ const char* name; /** * Entries of the enum. (Mandatory) */ const nvtxPayloadEnum_t* entries; /** * Number of entries in the enum. (Mandatory) */ size_t numEntries; /** * Size of enumeration type in bytes */ size_t sizeOfEnum; /** * Static/custom schema ID must be * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and * < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */ uint64_t schemaId; } nvtxPayloadEnumAttr_t; /** * \brief Register an enumeration type with the payload extension. * * @param domain NVTX domain handle * @param attr NVTX payload enumeration type attributes. */ NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr); /** * \brief Callback Ids of API functions in the payload extension. * * The NVTX handler can use these values to register a handler function. When * InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is * executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be * registered as follows: * moduleInfo->segments->slots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] = * (intptr_t)handlenvtxPayloadRegisterSchema; */ typedef enum NvtxExtPayloadCallbackId { NVTX3EXT_CBID_nvtxPayloadSchemaRegister = 0, NVTX3EXT_CBID_nvtxPayloadEnumRegister = 1, NVTX3EXT_CBID_PAYLOAD_FN_NUM = 2 } NvtxExtPayloadCallbackId; #ifdef __GNUC__ #pragma GCC visibility push(internal) #endif #define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot include directly */ #include "nvtxExtDetail/nvtxExtTypes.h" #undef NVTX_EXT_TYPES_GUARD #ifndef NVTX_NO_IMPL #define NVTX_EXT_IMPL_PAYLOAD_GUARD /* Ensure other headers cannot included directly */ #include "nvtxExtDetail/nvtxExtPayloadTypeInfo.h" #include "nvtxExtDetail/nvtxExtImplPayload_v1.h" #undef NVTX_EXT_IMPL_PAYLOAD_GUARD #endif /*NVTX_NO_IMPL*/ #ifdef __GNUC__ #pragma GCC visibility pop #endif #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* NVTOOLSEXT_PAYLOAD_H */ nccl-2.18.3-1/src/include/nvtx3/nvToolsExtSync.h000066400000000000000000000315611444201535000213150ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #include "nvToolsExt.h" #ifndef NVTOOLSEXT_SYNC_V3 #define NVTOOLSEXT_SYNC_V3 #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* \cond SHOW_HIDDEN * \version \NVTX_VERSION_2 */ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) ) /** \endcond */ /** * \page PAGE_SYNCHRONIZATION Synchronization * * This section covers a subset of the API that allow users to track additional * synchronization details of their application. Naming OS synchronization primitives * may allow users to better understand the data collected by traced synchronization * APIs. Additionally, a user defined synchronization object can allow the users to * to tell the tools when the user is building their own synchronization system * that do not rely on the OS to provide behaviors and instead use techniques like * atomic operations and spinlocks. * * See module \ref SYNCHRONIZATION for details. 
* * \par Example: * \code * class MyMutex * { * volatile long bLocked; * nvtxSyncUser_t hSync; * public: * MyMutex(const char* name, nvtxDomainHandle_t d){ * bLocked = 0; * * nvtxSyncUserAttributes_t attribs = { 0 }; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE; * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = name; * hSync = nvtxDomainSyncUserCreate(d, &attribs); * } * * ~MyMutex() { * nvtxDomainSyncUserDestroy(hSync); * } * * bool Lock() { * nvtxDomainSyncUserAcquireStart(hSync); * bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic * if (acquired) { * nvtxDomainSyncUserAcquireSuccess(hSync); * } * else { * nvtxDomainSyncUserAcquireFailed(hSync); * } * return acquired; * } * void Unlock() { * nvtxDomainSyncUserReleasing(hSync); * bLocked = false; * } * }; * \endcode * * \version \NVTX_VERSION_2 */ /* ------------------------------------------------------------------------- */ /* \cond SHOW_HIDDEN * \brief Used to build a non-colliding value for resource types separated class * \version \NVTX_VERSION_2 */ #define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */ #define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/ /** \endcond */ /* ------------------------------------------------------------------------- */ /** \defgroup SYNCHRONIZATION Synchronization * See page \ref PAGE_SYNCHRONIZATION. * @{ */ /** \brief Resource type values for OSs with POSIX Thread API support */ typedef enum nvtxResourceSyncPosixThreadType_t { NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */ } nvtxResourceSyncPosixThreadType_t; /** \brief Resource type values for Windows OSs */ typedef enum nvtxResourceSyncWindowsType_t { NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4), NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5) } nvtxResourceSyncWindowsType_t; /** \brief Resource type values for Linux and Linux derived OSs such as Android * \sa * ::nvtxResourceSyncPosixThreadType_t */ typedef enum nvtxResourceSyncLinuxType_t { NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1), NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2), NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3), NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4), NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5), NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6), NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7) 
} nvtxResourceSyncLinuxType_t; /** \brief Resource type values for Android come from Linux. * \sa * ::nvtxResourceSyncLinuxType_t * ::nvtxResourceSyncPosixThreadType_t */ typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t; /** \brief User Defined Synchronization Object Handle. * \anchor SYNCUSER_HANDLE_STRUCTURE * * This structure is opaque to the user and is used as a handle to reference * a user defined synchronization object. The tools will return a pointer through the API for the application * to hold on its behalf to reference the object in the future. * */ typedef struct nvtxSyncUser* nvtxSyncUser_t; /** \brief User Defined Synchronization Object Attributes Structure. * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE * * This structure is used to describe the attributes of a user defined synchronization * object. The layout of the structure is defined by a specific version of the tools * extension library and can change between different versions of the Tools Extension * library. * * \par Initializing the Attributes * * The caller should always perform the following three tasks when using * attributes: *
 *   - Zero the structure
 *   - Set the version field
 *   - Set the size field
* * Zeroing the structure sets all the event attributes types and values * to the default value. * * The version and size field are used by the Tools Extension * implementation to handle multiple versions of the attributes structure. * * It is recommended that the caller use one of the following to methods * to initialize the event attributes structure: * * \par Method 1: Initializing nvtxEventAttributes for future compatibility * \code * nvtxSyncUserAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE; * \endcode * * \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version * \code * nvtxSyncUserAttributes_t attribs = {0}; * attribs.version = 1; * attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t)); * \endcode * * If the caller uses Method 1 it is critical that the entire binary * layout of the structure be configured to 0 so that all fields * are initialized to the default value. * * The caller should either use both NVTX_VERSION and * NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values * and a versioned type (Method 2). Using a mix of the two methods * will likely cause either source level incompatibility or binary * incompatibility in the future. * * \par Settings Attribute Types and Values * * * \par Example: * \code * // Initialize * nvtxSyncUserAttributes_t attribs = {0}; * attribs.version = NVTX_VERSION; * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE; * * // Configure the Attributes * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; * attribs.message.ascii = "Example"; * \endcode * * \sa * ::nvtxDomainSyncUserCreate */ typedef struct nvtxSyncUserAttributes_v0 { /** * \brief Version flag of the structure. * * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs * supported in this header file. This can optionally be overridden to * another version of the tools extension library. */ uint16_t version; /** * \brief Size of the structure. * * Needs to be set to the size in bytes of the event attribute * structure used to specify the event. */ uint16_t size; /** \brief Message type specified in this attribute structure. * * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message * "message" field. * * Default Value is NVTX_MESSAGE_UNKNOWN */ int32_t messageType; /* nvtxMessageType_t */ /** \brief Message assigned to this attribute structure. * * The text message that is attached to an event. */ nvtxMessageValue_t message; } nvtxSyncUserAttributes_v0; typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t; /* ------------------------------------------------------------------------- */ /** \brief Create a user defined synchronization object * This is used to track non-OS synchronization working with spinlocks and atomics * * \param domain - Domain to own the resource * \param attribs - A structure to assign multiple attributes to the object. * * \return A handle that represents the newly created user defined synchronization object. 
* * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); /* ------------------------------------------------------------------------- */ /** \brief Destroy a user defined synchronization object * This is used to track non-OS synchronization working with spinlocks and atomics * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools that an attempt to acquire a user defined synchronization object * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of failure in acquiring a user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireStart * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of success in acquiring a user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireStart. * * \param handle - A handle to the object to operate on. * * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle); /* ------------------------------------------------------------------------- */ /** \brief Signal to tools of releasing a reservation on user defined synchronization object * This should be called after \ref nvtxDomainSyncUserAcquireSuccess. * * \param handle - A handle to the object to operate on. 
* * \sa * ::nvtxDomainSyncUserCreate * ::nvtxDomainSyncUserDestroy * ::nvtxDomainSyncUserAcquireStart * ::nvtxDomainSyncUserAcquireFailed * ::nvtxDomainSyncUserAcquireSuccess * ::nvtxDomainSyncUserReleasing * * \version \NVTX_VERSION_2 */ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle); /** @} */ /*END defgroup*/ #ifdef __cplusplus } #endif /* __cplusplus */ #ifndef NVTX_NO_IMPL #define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */ #include "nvtxDetail/nvtxImplSync_v3.h" #undef NVTX_IMPL_GUARD_SYNC #endif /*NVTX_NO_IMPL*/ #endif /* NVTOOLSEXT_SYNC_V3 */ nccl-2.18.3-1/src/include/nvtx3/nvtx3.hpp000066400000000000000000003060521444201535000177550ustar00rootroot00000000000000/* * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* Temporary helper #defines, #undef'ed at end of header */ #define NVTX3_CPP_VERSION_MAJOR 1 #define NVTX3_CPP_VERSION_MINOR 0 /* This section handles the decision of whether to provide unversioned symbols. * If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is #defined, unversioned symbols are * not provided, and explicit-version symbols such as nvtx3::v1::scoped_range * and NVTX3_V1_FUNC_RANGE must be used. By default, the first #include of this * header will define the unversioned symbols such as nvtx3::scoped_range and * NVTX3_FUNC_RANGE. Subsequently including a different major version of this * header without #defining NVTX3_CPP_REQUIRE_EXPLICIT_VERSION triggers an error * since the symbols would conflict. Subsequently including of a different * minor version within the same major version is allowed. Functionality of * minor versions is cumulative, regardless of include order. * * Since NVTX3_CPP_REQUIRE_EXPLICIT_VERSION allows all combinations of versions * to coexist without problems within a translation unit, the recommended best * practice for instrumenting header-based libraries with NVTX C++ Wrappers is * is to #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before including nvtx3.hpp, * #undef it afterward, and only use explicit-version symbols. This is not * necessary in common cases, such as instrumenting a standalone application, or * static/shared libraries in .cpp files or headers private to those projects. */ /* clang-format off */ #if !defined(NVTX3_CPP_REQUIRE_EXPLICIT_VERSION) /* Define macro used by all definitions in this header to indicate the * unversioned symbols should be defined in addition to the versioned ones. */ #define NVTX3_INLINE_THIS_VERSION #if !defined(NVTX3_CPP_INLINED_VERSION_MAJOR) /* First occurrence of this header in the translation unit. Define macros * indicating which version shall be used for unversioned symbols. */ /** * @brief Semantic major version number for NVTX C++ wrappers of unversioned symbols * * Breaking changes may occur between major versions, and different major versions * cannot provide unversioned symbols in the same translation unit (.cpp file). * * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. 
* * Not to be confused with the version number of the NVTX core library. */ #define NVTX3_CPP_INLINED_VERSION_MAJOR 1 // NVTX3_CPP_VERSION_MAJOR /** * @brief Semantic minor version number for NVTX C++ wrappers of unversioned symbols * * No breaking changes occur between minor versions -- minor version changes within * a major version are purely additive. * * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. * * Not to be confused with the version number of the NVTX core library. */ #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR #elif NVTX3_CPP_INLINED_VERSION_MAJOR != NVTX3_CPP_VERSION_MAJOR /* Unsupported case -- cannot define unversioned symbols for different major versions * in the same translation unit. */ #error \ "Two different major versions of the NVTX C++ Wrappers are being included in a single .cpp file, with unversioned symbols enabled in both. Only one major version can enable unversioned symbols in a .cpp file. To disable unversioned symbols, #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before #including nvtx3.hpp, and use the explicit-version symbols instead -- this is the preferred way to use nvtx3.hpp from a header file." #elif (NVTX3_CPP_INLINED_VERSION_MAJOR == NVTX3_CPP_VERSION_MAJOR) && \ (NVTX3_CPP_INLINED_VERSION_MINOR < NVTX3_CPP_VERSION_MINOR) /* An older minor version of the same major version already defined unversioned * symbols. The new features provided in this header will be inlined * redefine the minor version macro to this header's version. */ #undef NVTX3_CPP_INLINED_VERSION_MINOR #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR // else, already have this version or newer, nothing to do #endif #endif /* clang-format on */ /** * @file nvtx3.hpp * * @brief Provides C++ constructs making the NVTX library safer and easier to * use with zero overhead. */ /** * \mainpage * \tableofcontents * * \section QUICK_START Quick Start * * To add NVTX ranges to your code, use the `nvtx3::scoped_range` RAII object. A * range begins when the object is created, and ends when the object is * destroyed. * * \code{.cpp} * #include "nvtx3.hpp" * void some_function() { * // Begins a NVTX range with the messsage "some_function" * // The range ends when some_function() returns and `r` is destroyed * nvtx3::scoped_range r{"some_function"}; * * for(int i = 0; i < 6; ++i) { * nvtx3::scoped_range loop{"loop range"}; * std::this_thread::sleep_for(std::chrono::seconds{1}); * } * } // Range ends when `r` is destroyed * \endcode * * The example code above generates the following timeline view in Nsight * Systems: * * \image html * https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png * * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add * ranges to your code that automatically use the name of the enclosing function * as the range's message. * * \code{.cpp} * #include "nvtx3.hpp" * void some_function() { * // Creates a range with a message "some_function" that ends when the * // enclosing function returns * NVTX3_FUNC_RANGE(); * ... * } * \endcode * * * \section Overview * * The NVTX library provides a set of functions for users to annotate their code * to aid in performance profiling and optimization. These annotations provide * information to tools like Nsight Systems to improve visualization of * application timelines. * * \ref RANGES are one of the most commonly used NVTX constructs for annotating * a span of time. 
For example, imagine a user wanted to see every time a * function, `my_function`, is called and how long it takes to execute. This can * be accomplished with an NVTX range created on the entry to the function and * terminated on return from `my_function` using the push/pop C APIs: * * \code{.cpp} * void my_function(...) { * nvtxRangePushA("my_function"); // Begins NVTX range * // do work * nvtxRangePop(); // Ends NVTX range * } * \endcode * * One of the challenges with using the NVTX C API is that it requires manually * terminating the end of the range with `nvtxRangePop`. This can be challenging * if `my_function()` has multiple returns or can throw exceptions as it * requires calling `nvtxRangePop()` before all possible return points. * * NVTX C++ solves this inconvenience through the "RAII" technique by providing * a `nvtx3::scoped_range` class that begins a range at construction and ends * the range on destruction. The above example then becomes: * * \code{.cpp} * void my_function(...) { * nvtx3::scoped_range r{"my_function"}; // Begins NVTX range * // do work * } // Range ends on exit from `my_function` when `r` is destroyed * \endcode * * The range object `r` is deterministically destroyed whenever `my_function` * returns---ending the NVTX range without manual intervention. For more * information, see \ref RANGES and `nvtx3::scoped_range_in`. * * Another inconvenience of the NVTX C APIs are the several constructs where the * user is expected to initialize an object at the beginning of an application * and reuse that object throughout the lifetime of the application. For example * see domains, categories, and registered messages. * * Example: * \code{.cpp} * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain"); * // Reuse `D` throughout the rest of the application * \endcode * * This can be problematic if the user application or library does not have an * explicit initialization function called before all other functions to * ensure that these long-lived objects are initialized before being used. * * NVTX C++ makes use of the "construct on first use" technique to alleviate * this inconvenience. In short, a function local static object is constructed * upon the first invocation of a function and returns a reference to that * object on all future invocations. See the documentation for `nvtx3::domain`, * `nvtx3::named_category`, `nvtx3::registered_string`, and * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more * information. * * Using construct on first use, the above example becomes: * \code{.cpp} * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * // The first invocation of `domain::get` for the type `my_domain` will * // construct a `nvtx3::domain` object and return a reference to it. Future * // invocations simply return a reference. * nvtx3::domain const& D = nvtx3::domain::get(); * \endcode * For more information about NVTX and how it can be used, see * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ * for more information. * * \section RANGES Ranges * * Ranges are used to describe a span of time during the execution of an * application. Common examples are using ranges to annotate the time it takes * to execute a function or an iteration of a loop. * * NVTX C++ uses RAII to automate the generation of ranges that are tied to the * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard * Template Library. 
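 *
 * As with `std::lock_guard`, the range is closed during stack unwinding as
 * well as on a normal return. The sketch below illustrates this; it assumes a
 * hypothetical helper `might_throw()` that is not part of NVTX:
 *
 * \code{.cpp}
 * void my_function() {
 *   nvtx3::scoped_range r{"my_function"};   // Begins NVTX range
 *   if (might_throw()) {
 *     throw std::runtime_error{"error"};    // Range still ends while unwinding
 *   }
 * }                                         // Range ends on normal return
 * \endcode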
* * \subsection scoped_range Scoped Range * * `nvtx3::scoped_range_in` is a class that begins a range upon construction * and ends the range at destruction. This is one of the most commonly used * constructs in NVTX C++ and is useful for annotating spans of time on a * particular thread. These ranges can be nested to arbitrary depths. * * `nvtx3::scoped_range` is an alias for a `nvtx3::scoped_range_in` in the * global NVTX domain. For more information about Domains, see \ref DOMAINS. * * Various attributes of a range can be configured constructing a * `nvtx3::scoped_range_in` with a `nvtx3::event_attributes` object. For * more information, see \ref ATTRIBUTES. * * Example: * * \code{.cpp} * void some_function() { * // Creates a range for the duration of `some_function` * nvtx3::scoped_range r{}; * * while(true) { * // Creates a range for every loop iteration * // `loop_range` is nested inside `r` * nvtx3::scoped_range loop_range{}; * } * } * \endcode * * \subsection unique_range Unique Range * * `nvtx3::unique_range` is similar to `nvtx3::scoped_range`, with a few key differences: * - `unique_range` objects can be destroyed in any order whereas `scoped_range` objects must be * destroyed in exact reverse creation order * - `unique_range` can start and end on different threads * - `unique_range` is moveable * - `unique_range` objects can be constructed as heap objects * * There is extra overhead associated with `unique_range` constructs and therefore use of * `nvtx3::scoped_range_in` should be preferred. * * \section MARKS Marks * * `nvtx3::mark` annotates an instantaneous point in time with a "marker". * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark("operation failed!"); * } * \endcode * * \section DOMAINS Domains * * Similar to C++ namespaces, domains allow for scoping NVTX events. By default, * all NVTX events belong to the "global" domain. Libraries and applications * should scope their events to use a custom domain to differentiate where the * events originate from. * * It is common for a library or application to have only a single domain and * for the name of that domain to be known at compile time. Therefore, Domains * in NVTX C++ are represented by _tag types_. * * For example, to define a custom domain, simply define a new concrete type * (a `class` or `struct`) with a `static` member called `name` that contains * the desired name of the domain. * * \code{.cpp} * struct my_domain{ static constexpr char const* name{"my domain"}; }; * \endcode * * For any NVTX C++ construct that can be scoped to a domain, the type * `my_domain` can be passed as an explicit template argument to scope it to * the custom domain. * * The tag type `nvtx3::domain::global` represents the global NVTX domain. * * \code{.cpp} * // By default, `scoped_range_in` belongs to the global domain * nvtx3::scoped_range_in<> r0{}; * * // Alias for a `scoped_range_in` in the global domain * nvtx3::scoped_range r1{}; * * // `r` belongs to the custom domain * nvtx3::scoped_range_in r{}; * \endcode * * When using a custom domain, it is recommended to define type aliases for NVTX * constructs in the custom domain. * \code{.cpp} * using my_scoped_range = nvtx3::scoped_range_in; * using my_registered_string = nvtx3::registered_string_in; * using my_named_category = nvtx3::named_category_in; * \endcode * * See `nvtx3::domain` for more information. 
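 *
 * Putting the domain pieces together, a minimal sketch might look as follows
 * (the tag type `my_domain` and the alias name are illustrative, not required
 * by the library):
 *
 * \code{.cpp}
 * // Tag type naming the custom domain
 * struct my_domain{ static constexpr char const* name{"my domain"}; };
 *
 * // Alias for ranges scoped to the custom domain
 * using my_scoped_range = nvtx3::scoped_range_in<my_domain>;
 *
 * void some_function() {
 *   // This range is grouped under "my domain" rather than the global domain
 *   my_scoped_range r{"some_function"};
 * }
 * \endcode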
* * \section ATTRIBUTES Event Attributes * * NVTX events can be customized with various attributes to provide additional * information (such as a custom message) or to control visualization of the * event (such as the color used). These attributes can be specified per-event * via arguments to a `nvtx3::event_attributes` object. * * NVTX events can be customized via four "attributes": * - \ref COLOR : color used to visualize the event in tools. * - \ref MESSAGES : Custom message string. * - \ref PAYLOAD : User-defined numerical value. * - \ref CATEGORY : Intra-domain grouping. * * It is possible to construct a `nvtx3::event_attributes` from any number of * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload, * nvtx3::category) in any order. If an attribute is not specified, a tool * specific default value is used. See `nvtx3::event_attributes` for more * information. * * \code{.cpp} * // Set message, same as passing nvtx3::message{"message"} * nvtx3::event_attributes attr{"message"}; * * // Set message and color * nvtx3::event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; * * // Set message, color, payload, category * nvtx3::event_attributes attr{"message", * nvtx3::rgb{127, 255, 0}, * nvtx3::payload{42}, * nvtx3::category{1}}; * * // Same as above -- can use any order of arguments * nvtx3::event_attributes attr{nvtx3::payload{42}, * nvtx3::category{1}, * "message", * nvtx3::rgb{127, 255, 0}}; * * // Multiple arguments of the same type are allowed, but only the first is * // used -- in this example, payload is set to 42: * nvtx3::event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; * * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: * using namespace nvtx3; * event_attributes attr{"message", rgb{127, 255, 0}, payload{42}, category{1}}; * \endcode * * \subsection MESSAGES message * * `nvtx3::message` sets the message string for an NVTX event. * * Example: * \code{.cpp} * // Create an `event_attributes` with the message "my message" * nvtx3::event_attributes attr{nvtx3::message{"my message"}}; * * // strings and string literals implicitly assumed to be a `nvtx3::message` * nvtx3::event_attributes attr{"my message"}; * \endcode * * \subsubsection REGISTERED_MESSAGE Registered Messages * * Associating a `nvtx3::message` with an event requires copying the contents of * the message every time the message is used, i.e., copying the entire message * string. This may cause non-trivial overhead in performance sensitive code. * * To eliminate this overhead, NVTX allows registering a message string, * yielding a "handle" that is inexpensive to copy that may be used in place of * a message string. When visualizing the events, tools such as Nsight Systems * will take care of mapping the message handle to its string. * * A message should be registered once and the handle reused throughout the rest * of the application. This can be done by either explicitly creating static * `nvtx3::registered_string` objects, or using the * `nvtx3::registered_string::get` construct on first use helper (recommended). * * Similar to \ref DOMAINS, `nvtx3::registered_string::get` requires defining a * custom tag type with a static `message` member whose value will be the * contents of the registered string. 
* * Example: * \code{.cpp} * // Explicitly constructed, static `registered_string` in my_domain: * static registered_string_in static_message{"my message"}; * * // Or use construct on first use: * // Define a tag type with a `message` member string to register * struct my_message{ static constexpr char const* message{ "my message" }; }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = nvtx3::registered_string_in::get(); * \endcode * * \subsection COLOR color * * Associating a `nvtx3::color` with an event allows controlling how the event * is visualized in a tool such as Nsight Systems. This is a convenient way to * visually differentiate among different events. * * \code{.cpp} * // Define a color via rgb color values * nvtx3::color c{nvtx3::rgb{127, 255, 0}}; * nvtx3::event_attributes attr{c}; * * // rgb color values can be passed directly to an `event_attributes` * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}}; * \endcode * * \subsection CATEGORY category * * A `nvtx3::category` is simply an integer id that allows for fine-grain * grouping of NVTX events. For example, one might use separate categories for * IO, memory allocation, compute, etc. * * \code{.cpp} * nvtx3::event_attributes{nvtx3::category{1}}; * \endcode * * \subsubsection NAMED_CATEGORIES Named Categories * * Associates a `name` string with a category `id` to help differentiate among * categories. * * For any given category id `Id`, a `named_category{Id, "name"}` should only * be constructed once and reused throughout an application. This can be done by * either explicitly creating static `nvtx3::named_category` objects, or using * the `nvtx3::named_category::get` construct on first use helper (recommended). * * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a * custom tag type with static `name` and `id` members. * * \code{.cpp} * // Explicitly constructed, static `named_category` in my_domain: * static nvtx3::named_category_in static_category{42, "my category"}; * * // Or use construct on first use: * // Define a tag type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category": * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::event_attributes attr{cat}; * \endcode * * \subsection PAYLOAD payload * * Allows associating a user-defined numerical value with an event. 
* * \code{.cpp} * // Constructs a payload from the `int32_t` value 42 * nvtx3:: event_attributes attr{nvtx3::payload{42}}; * \endcode * * * \section EXAMPLE Example * * Putting it all together: * \code{.cpp} * // Define a custom domain tag type * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * // Define a named category tag type * struct my_category{ * static constexpr char const* name{"my category"}; * static constexpr uint32_t id{42}; * }; * * // Define a registered string tag type * struct my_message{ static constexpr char const* message{"my message"}; }; * * // For convenience, use aliases for domain scoped objects * using my_scoped_range = nvtx3::scoped_range_in; * using my_registered_string = nvtx3::registered_string_in; * using my_named_category = nvtx3::named_category_in; * * // Default values for all attributes * nvtx3::event_attributes attr{}; * my_scoped_range r0{attr}; * * // Custom (unregistered) message, and unnamed category * nvtx3::event_attributes attr1{"message", nvtx3::category{2}}; * my_scoped_range r1{attr1}; * * // Alternatively, pass arguments of `event_attributes` ctor directly to * // `my_scoped_range` * my_scoped_range r2{"message", nvtx3::category{2}}; * * // construct on first use a registered string * auto& msg = my_registered_string::get(); * * // construct on first use a named category * auto& cat = my_named_category::get(); * * // Use registered string and named category with a custom payload * my_scoped_range r3{msg, cat, nvtx3::payload{42}}; * * // Any number of arguments in any order * my_scoped_range r{nvtx3::rgb{127, 255,0}, msg}; * * \endcode * \section MACROS Convenience Macros * * Oftentimes users want to quickly and easily add NVTX ranges to their library * or application to aid in profiling and optimization. * * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an * `nvtx3::scoped_range_in` with the name of the enclosing function as the * range's message. * * \code{.cpp} * void some_function() { * // Automatically generates an NVTX range for the duration of the function * // using "some_function" as the event's message. * NVTX3_FUNC_RANGE(); * } * \endcode * */ /* Temporary helper #defines, removed with #undef at end of header */ /* Some compilers do not correctly support SFINAE, which is used in this API * to detect common usage errors and provide clearer error messages (by using * static_assert) than the compiler would produce otherwise. These compilers * will generate errors while compiling this file such as: * * error: ‘name’ is not a member of ‘nvtx3::v1::domain::global’ * * The following compiler versions are known to have this problem, and so are * set by default to disable the SFINAE-based checks: * * - All MSVC versions prior to VS2017 Update 7 (15.7) * - GCC 8.1-8.3 (the problem was fixed in GCC 8.4) * * If you find your compiler hits this problem, you can work around it by * defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this * header, or you can add a check for your compiler version to this #if. * Also, please report the issue on the NVTX github page. 
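 *
 * A minimal sketch of that workaround, relying only on the macro described
 * above, is to define it before the include in the affected .cpp file:
 *
 *   #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0
 *   #include "nvtx3.hpp"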
*/ #if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET) #if defined(_MSC_VER) && _MSC_VER < 1914 \ || defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4 #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0 #else #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1 #endif #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE #endif /* Within this header, nvtx3::NVTX3_VERSION_NAMESPACE resolves to nvtx3::vX, * where "X" is the major version number. */ #define NVTX3_CONCAT(A, B) A##B #define NVTX3_NAMESPACE_FOR(VERSION) NVTX3_CONCAT(v, VERSION) #define NVTX3_VERSION_NAMESPACE NVTX3_NAMESPACE_FOR(NVTX3_CPP_VERSION_MAJOR) /* Avoid duplicating #if defined(NVTX3_INLINE_THIS_VERSION) for namespaces * in each minor version by making a macro to use unconditionally, which * resolves to "inline" or nothing as appropriate. */ #if defined(NVTX3_INLINE_THIS_VERSION) #define NVTX3_INLINE_IF_REQUESTED inline #else #define NVTX3_INLINE_IF_REQUESTED #endif /* Enables the use of constexpr when support for C++14 constexpr is present. * * Initialization of a class member that is a union to a specific union member * can only be done in the body of a constructor, not in a member initializer * list. A constexpr constructor must have an empty body until C++14, so there * is no way to make an initializer of a member union constexpr in C++11. This * macro allows making functions constexpr in C++14 or newer, but non-constexpr * in C++11 compilation. It is used here on constructors that initialize their * member unions. */ #if __cpp_constexpr >= 201304L #define NVTX3_CONSTEXPR_IF_CPP14 constexpr #else #define NVTX3_CONSTEXPR_IF_CPP14 #endif /* Use a macro for static asserts, which defaults to static_assert, but that * testing tools can replace with a logging function. For example: * #define NVTX3_STATIC_ASSERT(c, m) \ * do { if (!(c)) printf("static_assert would fail: %s\n", m); } while (0) */ #if !defined(NVTX3_STATIC_ASSERT) #define NVTX3_STATIC_ASSERT(condition, message) static_assert(condition, message); #define NVTX3_STATIC_ASSERT_DEFINED_HERE #endif /* Implementation sections, enclosed in guard macros for each minor version */ #ifndef NVTX3_CPP_DEFINITIONS_V1_0 #define NVTX3_CPP_DEFINITIONS_V1_0 #include "nvToolsExt.h" #include "nvToolsExtPayload.h" #include #include #include #include #include namespace nvtx3 { NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE { namespace detail { template struct always_false : std::false_type {}; template struct has_name : std::false_type {}; template struct has_name : std::true_type {}; template struct has_id : std::false_type {}; template struct has_id : std::true_type {}; template struct has_message : std::false_type {}; template struct has_message : std::true_type {}; template struct is_c_string : std::false_type {}; template struct is_c_string::value || std::is_convertible::value >::type> : std::true_type {}; template using is_uint32 = std::is_same::type, uint32_t>; } // namespace detail /** * @brief `domain`s allow for grouping NVTX events into a single scope to * differentiate them from events in other `domain`s. * * By default, all NVTX constructs are placed in the "global" NVTX domain. * * A custom `domain` may be used in order to differentiate a library's or * application's NVTX events from other events. * * `domain`s are expected to be long-lived and unique to a library or * application. As such, it is assumed a domain's name is known at compile * time. 
Therefore, all NVTX constructs that can be associated with a domain * require the domain to be specified via a *type* `D` passed as an * explicit template parameter. * * The type `domain::global` may be used to indicate that the global NVTX * domain should be used. * * None of the C++ NVTX constructs require the user to manually construct a * `domain` object. Instead, if a custom domain is desired, the user is * expected to define a type `D` that contains a member * `D::name` which resolves to either a `char const*` or `wchar_t * const*`. The value of `D::name` is used to name and uniquely * identify the custom domain. * * Upon the first use of an NVTX construct associated with the type * `D`, the "construct on first use" pattern is used to construct a * function local static `domain` object. All future NVTX constructs * associated with `D` will use a reference to the previously * constructed `domain` object. See `domain::get`. * * Example: * \code{.cpp} * // The type `my_domain` defines a `name` member used to name and identify * // the `domain` object identified by `my_domain`. * struct my_domain{ static constexpr char const* name{"my_domain"}; }; * * // The NVTX range `r` will be grouped with all other NVTX constructs * // associated with `my_domain`. * nvtx3::scoped_range_in r{}; * * // An alias can be created for a `scoped_range_in` in the custom domain * using my_scoped_range = nvtx3::scoped_range_in; * my_scoped_range my_range{}; * * // `domain::global` indicates that the global NVTX domain is used * nvtx3::scoped_range_in r2{}; * * // For convenience, `nvtx3::scoped_range` is an alias for a range in the * // global domain * nvtx3::scoped_range r3{}; * \endcode */ class domain { public: domain(domain const&) = delete; domain& operator=(domain const&) = delete; domain(domain&&) = delete; domain& operator=(domain&&) = delete; /** * @brief Tag type for the "global" NVTX domain. * * This type may be passed as a template argument to any function/class * expecting a type to identify a domain to indicate that the global domain * should be used. * * All NVTX events in the global domain across all libraries and * applications will be grouped together. * */ struct global { }; #if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns reference to an instance of a function local static * `domain` object. * * Uses the "construct on first use" idiom to safely ensure the `domain` * object is initialized exactly once upon first invocation of * `domain::get()`. All following invocations will return a * reference to the previously constructed `domain` object. See * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use * * None of the constructs in this header require the user to directly invoke * `domain::get`. It is automatically invoked when constructing objects like * a `scoped_range_in` or `category`. Advanced users may wish to use * `domain::get` for the convenience of the "construct on first use" idiom * when using domains with their own use of the NVTX C API. * * This function is threadsafe as of C++11. If two or more threads call * `domain::get` concurrently, exactly one of them is guaranteed * to construct the `domain` object and the other(s) will receive a * reference to the object after it is fully constructed. * * The domain's name is specified via the type `D` pass as an * explicit template parameter. `D` is required to contain a * member `D::name` that resolves to either a `char const*` or * `wchar_t const*`. 
The value of `D::name` is used to name and * uniquely identify the `domain`. * * Example: * \code{.cpp} * // The type `my_domain` defines a `name` member used to name and identify * // the `domain` object identified by `my_domain`. * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * auto& D1 = domain::get(); // First invocation constructs a * // `domain` with the name "my domain" * * auto& D2 = domain::get(); // Quickly returns reference to * // previously constructed `domain`. * \endcode * * @tparam D Type that contains a `D::name` member used to * name the `domain` object. * @return Reference to the `domain` corresponding to the type `D`. */ template ::value , int>::type = 0> static domain const& get() noexcept { static domain const d(D::name); return d; } /** * @brief Overload of `domain::get` to provide a clear compile error when * `D` has a `name` member that is not directly convertible to either * `char const*` or `wchar_t const*`. */ template ::value , int>::type = 0> static domain const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to identify an NVTX domain must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is not " "convertible to either of those types"); static domain const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `domain::get` to provide a clear compile error when * `D` does not have a `name` member. */ template ::value , int>::type = 0> static domain const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to identify an NVTX domain must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); static domain const unused; return unused; // Function must compile for static_assert to be triggered } #else template static domain const& get() noexcept { static domain const d(D::name); return d; } #endif /** * @brief Conversion operator to `nvtxDomainHandle_t`. * * Allows transparently passing a domain object into an API expecting a * native `nvtxDomainHandle_t` object. */ operator nvtxDomainHandle_t() const noexcept { return _domain; } private: /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(std::string const& name) noexcept : domain{name.c_str()} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. 
* * @param name A unique name identifying the domain */ explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {} /** * @brief Default constructor creates a `domain` representing the * "global" NVTX domain. * * All events not associated with a custom `domain` are grouped in the * "global" NVTX domain. * */ domain() noexcept {} /** * @brief Intentionally avoid calling nvtxDomainDestroy on the `domain` object. * * No currently-available tools attempt to free domain resources when the * nvtxDomainDestroy function is called, due to the thread-safety and * efficiency challenges of freeing thread-local storage for other threads. * Since libraries may be disallowed from introducing static destructors, * and destroying the domain is likely to have no effect, the destructor * for `domain` intentionally chooses to not destroy the domain. * * In a situation where domain destruction is necessary, either manually * call nvtxDomainDestroy on the domain's handle, or make a class that * derives from `domain` and calls nvtxDomainDestroy in its destructor. */ ~domain() = default; private: nvtxDomainHandle_t const _domain{}; ///< The `domain`s NVTX handle }; /** * @brief Returns reference to the `domain` object that represents the global * NVTX domain. * * This specialization for `domain::global` returns a default constructed, * `domain` object for use when the "global" domain is desired. * * All NVTX events in the global domain across all libraries and applications * will be grouped together. * * @return Reference to the `domain` corresponding to the global NVTX domain. * */ template <> inline domain const& domain::get() noexcept { static domain const d{}; return d; } /** * @brief Indicates the values of the red, green, and blue color channels for * an RGB color to use as an event attribute (assumes no transparency). * */ struct rgb { /// Type used for component values using component_type = uint8_t; /** * @brief Construct a rgb with red, green, and blue channels * specified by `red_`, `green_`, and `blue_`, respectively. * * Valid values are in the range `[0,255]`. * * @param red_ Value of the red channel * @param green_ Value of the green channel * @param blue_ Value of the blue channel */ constexpr rgb( component_type red_, component_type green_, component_type blue_) noexcept : red{red_}, green{green_}, blue{blue_} { } component_type red{}; ///< Red channel value component_type green{}; ///< Green channel value component_type blue{}; ///< Blue channel value }; /** * @brief Indicates the value of the alpha, red, green, and blue color * channels for an ARGB color to use as an event attribute. * */ struct argb final : rgb { /** * @brief Construct an argb with alpha, red, green, and blue channels * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively. * * Valid values are in the range `[0,255]`. * * @param alpha_ Value of the alpha channel (opacity) * @param red_ Value of the red channel * @param green_ Value of the green channel * @param blue_ Value of the blue channel * */ constexpr argb( component_type alpha_, component_type red_, component_type green_, component_type blue_) noexcept : rgb{red_, green_, blue_}, alpha{alpha_} { } component_type alpha{}; ///< Alpha channel value }; /** * @brief Represents a custom color that can be associated with an NVTX event * via it's `event_attributes`. * * Specifying colors for NVTX events is a convenient way to visually * differentiate among different events in a visualization tool such as Nsight * Systems. 
* */ class color { public: /// Type used for the color's value using value_type = uint32_t; /** * @brief Constructs a `color` using the value provided by `hex_code`. * * `hex_code` is expected to be a 4 byte argb hex code. * * The most significant byte indicates the value of the alpha channel * (opacity) (0-255) * * The next byte indicates the value of the red channel (0-255) * * The next byte indicates the value of the green channel (0-255) * * The least significant byte indicates the value of the blue channel * (0-255) * * @param hex_code The hex code used to construct the `color` */ constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {} /** * @brief Construct a `color` using the alpha, red, green, blue components * in `argb`. * * @param argb The alpha, red, green, blue components of the desired `color` */ constexpr color(argb argb_) noexcept : color{from_bytes_msb_to_lsb(argb_.alpha, argb_.red, argb_.green, argb_.blue)} { } /** * @brief Construct a `color` using the red, green, blue components in * `rgb`. * * Uses maximum value for the alpha channel (opacity) of the `color`. * * @param rgb The red, green, blue components of the desired `color` */ constexpr color(rgb rgb_) noexcept : color{from_bytes_msb_to_lsb(0xFF, rgb_.red, rgb_.green, rgb_.blue)} { } /** * @brief Returns the `color`s argb hex code * */ constexpr value_type get_value() const noexcept { return _value; } /** * @brief Return the NVTX color type of the color. * */ constexpr nvtxColorType_t get_type() const noexcept { return _type; } color() = delete; ~color() = default; color(color const&) = default; color& operator=(color const&) = default; color(color&&) = default; color& operator=(color&&) = default; private: /** * @brief Constructs an unsigned, 4B integer from the component bytes in * most to least significant byte order. * */ constexpr static value_type from_bytes_msb_to_lsb( uint8_t byte3, uint8_t byte2, uint8_t byte1, uint8_t byte0) noexcept { return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0}; } value_type _value{}; ///< color's argb color code nvtxColorType_t _type{NVTX_COLOR_ARGB}; ///< NVTX color type code }; /** * @brief Object for intra-domain grouping of NVTX events. * * A `category` is simply an integer id that allows for fine-grain grouping of * NVTX events. For example, one might use separate categories for IO, memory * allocation, compute, etc. * * Example: * \code{.cpp} * nvtx3::category cat1{1}; * * // Range `r1` belongs to the category identified by the value `1`. * nvtx3::scoped_range r1{cat1}; * * // Range `r2` belongs to the same category as `r1` * nvtx3::scoped_range r2{nvtx3::category{1}}; * \endcode * * To associate a name string with a category id, see `named_category`. * */ class category { public: /// Type used for `category`s integer id. using id_type = uint32_t; /** * @brief Construct a `category` with the specified `id`. * * The `category` will be unnamed and identified only by its `id` value. * * All `category`s in a domain sharing the same `id` are equivalent. * * @param[in] id The `category`'s identifying value */ constexpr explicit category(id_type id) noexcept : id_{id} {} /** * @brief Returns the id of the category. 
* */ constexpr id_type get_id() const noexcept { return id_; } category() = delete; ~category() = default; category(category const&) = default; category& operator=(category const&) = default; category(category&&) = default; category& operator=(category&&) = default; private: id_type id_{}; ///< category's unique identifier }; /** * @brief A `category` with an associated name string. * * Associates a `name` string with a category `id` to help differentiate among * categories. * * For any given category id `Id`, a `named_category(Id, "name")` should only * be constructed once and reused throughout an application. This can be done * by either explicitly creating static `named_category` objects, or using the * `named_category::get` construct on first use helper (recommended). * * Creating two or more `named_category` objects with the same value for `id` * in the same domain results in undefined behavior. * * Similarly, behavior is undefined when a `named_category` and `category` * share the same value of `id`. * * Example: * \code{.cpp} * // Explicitly constructed, static `named_category` in global domain: * static nvtx3::named_category static_category{42, "my category"}; * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{static_category}; * * // OR use construct on first use: * * // Define a type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category" * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{cat}; * \endcode * * `named_category_in`'s association of a name to a category id is local to * the domain specified by the type `D`. An id may have a different name in * another domain. * * @tparam D Type containing `name` member used to identify the `domain` to * which the `named_category_in` belongs. Else, `domain::global` to indicate * that the global NVTX domain should be used. */ template class named_category_in final : public category { public: #if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns a global instance of a `named_category_in` as a * function-local static. * * Creates a `named_category_in` with name and id specified by the contents * of a type `C`. `C::name` determines the name and `C::id` determines the * category id. * * This function is useful for constructing a named `category` exactly once * and reusing the same instance throughout an application. * * Example: * \code{.cpp} * // Define a type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category" * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{cat}; * \endcode * * Uses the "construct on first use" idiom to safely ensure the `category` * object is initialized exactly once. See * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use * * @tparam C Type containing a member `C::name` that resolves to either a * `char const*` or `wchar_t const*` and `C::id`. 
*/ template ::value && detail::is_uint32::value , int>::type = 0> static named_category_in const& get() noexcept { static named_category_in const cat(C::id, C::name); return cat; } /** * @brief Overload of `named_category_in::get` to provide a clear compile error * when `C` has the required `name` and `id` members, but they are not the * required types. `name` must be directly convertible to `char const*` or * `wchar_t const*`, and `id` must be `uint32_t`. */ template ::value || !detail::is_uint32::value , int>::type = 0> static named_category_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::is_c_string::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is not " "convertible to either of those types"); NVTX3_STATIC_ASSERT(detail::is_uint32::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'id' of type uint32_t -- 'id' member is the wrong type"); static named_category_in const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `named_category_in::get` to provide a clear compile error * when `C` does not have the required `name` and `id` members. */ template ::value || !detail::has_id::value , int>::type = 0> static named_category_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::has_name::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); NVTX3_STATIC_ASSERT(detail::has_id::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'id' of type uint32_t -- 'id' member is missing"); static named_category_in const unused; return unused; // Function must compile for static_assert to be triggered } #else template static named_category_in const& get() noexcept { static named_category_in const cat(C::id, C::name); return cat; } #endif private: // Default constructor is only used internally for static_assert(false) cases. named_category_in() noexcept : category{0} {} public: /** * @brief Construct a `named_category_in` with the specified `id` and `name`. * * The name `name` will be registered with `id`. * * Every unique value of `id` should only be named once. * * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ named_category_in(id_type id, char const* name) noexcept : category{id} { #ifndef NVTX_DISABLE nvtxDomainNameCategoryA(domain::get(), get_id(), name); #else (void)id; (void)name; #endif }; /** * @brief Construct a `named_category_in` with the specified `id` and `name`. * * The name `name` will be registered with `id`. * * Every unique value of `id` should only be named once. * * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ named_category_in(id_type id, wchar_t const* name) noexcept : category{id} { #ifndef NVTX_DISABLE nvtxDomainNameCategoryW(domain::get(), get_id(), name); #else (void)id; (void)name; #endif }; }; /** * @brief Alias for a `named_category_in` in the global NVTX domain. * */ using named_category = named_category_in; /** * @brief A message registered with NVTX. * * Normally, associating a `message` with an NVTX event requires copying the * contents of the message string. This may cause non-trivial overhead in * highly performance sensitive regions of code. 
* * message registration is an optimization to lower the overhead of * associating a message with an NVTX event. Registering a message yields a * handle that is inexpensive to copy that may be used in place of a message * string. * * A particular message should only be registered once and the handle * reused throughout the rest of the application. This can be done by either * explicitly creating static `registered_string_in` objects, or using the * `registered_string_in::get` construct on first use helper (recommended). * * Example: * \code{.cpp} * // Explicitly constructed, static `registered_string` in my_domain: * static registered_string_in static_message{"message"}; * * // "message" is associated with the range `r` * nvtx3::scoped_range r{static_message}; * * // Or use construct on first use: * * // Define a type with a `message` member that defines the contents of the * // registered string * struct my_message{ static constexpr char const* message{ "my message" }; }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = registered_string_in::get(); * * // "my message" is associated with the range `r` * nvtx3::scoped_range r{msg}; * \endcode * * `registered_string_in`s are local to a particular domain specified via * the type `D`. * * @tparam D Type containing `name` member used to identify the `domain` to * which the `registered_string_in` belongs. Else, `domain::global` to indicate * that the global NVTX domain should be used. */ template class registered_string_in { public: #if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns a global instance of a `registered_string_in` as a function * local static. * * Provides a convenient way to register a message with NVTX without having * to explicitly register the message. * * Upon first invocation, constructs a `registered_string_in` whose contents * are specified by `message::message`. * * All future invocations will return a reference to the object constructed * in the first invocation. * * Example: * \code{.cpp} * // Define a type with a `message` member that defines the contents of the * // registered string * struct my_message{ static constexpr char const* message{ "my message" }; * }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = registered_string_in::get(); * * // "my message" is associated with the range `r` * nvtx3::scoped_range r{msg}; * \endcode * * @tparam M Type required to contain a member `M::message` that * resolves to either a `char const*` or `wchar_t const*` used as the * registered string's contents. * @return Reference to a `registered_string_in` associated with the type `M`. */ template ::value , int>::type = 0> static registered_string_in const& get() noexcept { static registered_string_in const regstr(M::message); return regstr; } /** * @brief Overload of `registered_string_in::get` to provide a clear compile error * when `M` has a `message` member that is not directly convertible to either * `char const*` or `wchar_t const*`. 
*/ template ::value , int>::type = 0> static registered_string_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to register an NVTX string must contain a static constexpr member " "called 'message' of type const char* or const wchar_t* -- 'message' member is " "not convertible to either of those types"); static registered_string_in const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `registered_string_in::get` to provide a clear compile error when * `M` does not have a `message` member. */ template ::value , int>::type = 0> static registered_string_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to register an NVTX string must contain a static constexpr member " "called 'message' of type const char* or const wchar_t* -- 'message' member " "is missing"); static registered_string_in const unused; return unused; // Function must compile for static_assert to be triggered } #else template static registered_string_in const& get() noexcept { static registered_string_in const regstr(M::message); return regstr; } #endif /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(char const* msg) noexcept : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} { } /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(std::string const& msg) noexcept : registered_string_in{msg.c_str()} {} /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(wchar_t const* msg) noexcept : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} { } /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(std::wstring const& msg) noexcept : registered_string_in{msg.c_str()} {} /** * @brief Returns the registered string's handle * */ nvtxStringHandle_t get_handle() const noexcept { return handle_; } private: // Default constructor is only used internally for static_assert(false) cases. 
registered_string_in() noexcept {}; public: ~registered_string_in() = default; registered_string_in(registered_string_in const&) = default; registered_string_in& operator=(registered_string_in const&) = default; registered_string_in(registered_string_in&&) = default; registered_string_in& operator=(registered_string_in&&) = default; private: nvtxStringHandle_t handle_{}; ///< The handle returned from ///< registering the message with NVTX }; /** * @brief Alias for a `registered_string_in` in the global NVTX domain. * */ using registered_string = registered_string_in; /** * @brief Allows associating a message string with an NVTX event via * its `EventAttribute`s. * * Associating a `message` with an NVTX event through its `event_attributes` * allows for naming events to easily differentiate them from other events. * * Every time an NVTX event is created with an associated `message`, the * contents of the message string must be copied. This may cause non-trivial * overhead in highly performance sensitive sections of code. Use of a * `nvtx3::registered_string` is recommended in these situations. * * Example: * \code{.cpp} * // Creates an `event_attributes` with message "message 0" * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}}; * * // `range0` contains message "message 0" * nvtx3::scoped_range range0{attr0}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` * // Creates an `event_attributes` with message "message 1" * nvtx3::event_attributes attr1{"message 1"}; * * // `range1` contains message "message 1" * nvtx3::scoped_range range1{attr1}; * * // `range2` contains message "message 2" * nvtx3::scoped_range range2{nvtx3::Mesage{"message 2"}}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` * // `range3` contains message "message 3" * nvtx3::scoped_range range3{"message 3"}; * \endcode */ class message { public: using value_type = nvtxMessageValue_t; /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ NVTX3_CONSTEXPR_IF_CPP14 message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII} { value_.ascii = msg; } /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ message(std::string const& msg) noexcept : message{msg.c_str()} {} /** * @brief Disallow construction for `std::string` r-value * * `message` is a non-owning type and therefore cannot take ownership of an * r-value. Therefore, constructing from an r-value is disallowed to prevent * a dangling pointer. * */ message(std::string&&) = delete; /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ NVTX3_CONSTEXPR_IF_CPP14 message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE} { value_.unicode = msg; } /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ message(std::wstring const& msg) noexcept : message{msg.c_str()} {} /** * @brief Disallow construction for `std::wstring` r-value * * `message` is a non-owning type and therefore cannot take ownership of an * r-value. Therefore, constructing from an r-value is disallowed to prevent * a dangling pointer. * */ message(std::wstring&&) = delete; /** * @brief Construct a `message` from a `registered_string_in`. 
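 *
 * Example (illustrative, using the global-domain aliases):
 * \code{.cpp}
 * static nvtx3::registered_string const msg{"cached message"};
 *
 * // The registered handle is used instead of copying the string contents.
 * nvtx3::scoped_range r{nvtx3::message{msg}};
 * \endcode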
* * @tparam D Type containing `name` member used to identify the `domain` * to which the `registered_string_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. * @param msg The message that has already been registered with NVTX. */ template NVTX3_CONSTEXPR_IF_CPP14 message(registered_string_in const& msg) noexcept : type_{NVTX_MESSAGE_TYPE_REGISTERED} { value_.registered = msg.get_handle(); } /** * @brief Construct a `message` from NVTX C API type and value. * * @param type nvtxMessageType_t enum value indicating type of the payload * @param value nvtxMessageValue_t union containing message */ constexpr message( nvtxMessageType_t const& type, nvtxMessageValue_t const& value) noexcept : type_{type}, value_(value) { } /** * @brief Construct a `message` from NVTX C API registered string handle. * * @param handle nvtxStringHandle_t value of registered string handle */ NVTX3_CONSTEXPR_IF_CPP14 message(nvtxStringHandle_t handle) noexcept : type_{NVTX_MESSAGE_TYPE_REGISTERED} { value_.registered = handle; } /** * @brief Return the union holding the value of the message. * */ constexpr value_type get_value() const noexcept { return value_; } /** * @brief Return the type information about the value the union holds. * */ constexpr nvtxMessageType_t get_type() const noexcept { return type_; } private: nvtxMessageType_t type_{}; ///< message type nvtxMessageValue_t value_{}; ///< message contents }; /** * @brief A numerical value that can be associated with an NVTX event via * its `event_attributes`. * * Example: * \code{.cpp} * // Constructs a payload from the int32_t value 42 * nvtx3:: event_attributes attr{nvtx3::payload{42}}; * * // `range0` will have an int32_t payload of 42 * nvtx3::scoped_range range0{attr}; * * // range1 has double payload of 3.14 * nvtx3::scoped_range range1{nvtx3::payload{3.14}}; * \endcode */ class payload { public: using value_type = typename nvtxEventAttributes_v2::payload_t; /** * @brief Construct a `payload` from a signed, 8 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int64_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{} { value_.llValue = value; } /** * @brief Construct a `payload` from a signed, 4 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int32_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{} { value_.iValue = value; } /** * @brief Construct a `payload` from an unsigned, 8 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint64_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{} { value_.ullValue = value; } /** * @brief Construct a `payload` from an unsigned, 4 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint32_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{} { value_.uiValue = value; } /** * @brief Construct a `payload` from a single-precision floating point * value. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(float value) noexcept : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{} { value_.fValue = value; } /** * @brief Construct a `payload` from a double-precision floating point * value. 
* * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(double value) noexcept : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{} { value_.dValue = value; } /** * @brief Construct a `payload` from NVTX C API type and value. * * @param type nvtxPayloadType_t enum value indicating type of the payload * @param value nvtxEventAttributes_t::payload_t union containing payload */ constexpr payload( nvtxPayloadType_t const& type, value_type const& value) noexcept : type_{type}, value_(value) { } /** * @brief Return the union holding the value of the payload * */ constexpr value_type get_value() const noexcept { return value_; } /** * @brief Return the information about the type the union holds. * */ constexpr nvtxPayloadType_t get_type() const noexcept { return type_; } private: nvtxPayloadType_t type_; ///< Type of the payload value value_type value_; ///< Union holding the payload value }; /** * @brief Describes the attributes of a NVTX event. * * NVTX events can be customized via four "attributes": * * - color: color used to visualize the event in tools such as Nsight * Systems. See `color`. * - message: Custom message string. See `message`. * - payload: User-defined numerical value. See `payload`. * - category: Intra-domain grouping. See `category`. * * These component attributes are specified via an `event_attributes` object. * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and * `nvtx3::category` for how these individual attributes are constructed. * * While it is possible to specify all four attributes, it is common to want * to only specify a subset of attributes and use default values for the * others. For convenience, `event_attributes` can be constructed from any * number of attribute components in any order. * * Example: * \code{.cpp} * // Set message, same as using nvtx3::message{"message"} * event_attributes attr{"message"}; * * // Set message and color * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; * * // Set message, color, payload, category * event_attributes attr{"message", * nvtx3::rgb{127, 255, 0}, * nvtx3::payload{42}, * nvtx3::category{1}}; * * // Same as above -- can use any order of arguments * event_attributes attr{nvtx3::payload{42}, * nvtx3::category{1}, * "message", * nvtx3::rgb{127, 255, 0}}; * * // Multiple arguments of the same type are allowed, but only the first is * // used -- in this example, payload is set to 42: * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; * * // Range `r` will be customized according the attributes in `attr` * nvtx3::scoped_range r{attr}; * * // For convenience, `event_attributes` constructor arguments may be passed * // to the `scoped_range_in` contructor -- they are forwarded to the * // `event_attributes` constructor * nvtx3::scoped_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; * * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: * using namespace nvtx3; * scoped_range r{payload{42}, category{1}, "message"}; * \endcode * */ class event_attributes { public: using value_type = nvtxEventAttributes_t; /** * @brief Default constructor creates an `event_attributes` with no * category, color, payload, nor message. 
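 *
 * Example (illustrative):
 * \code{.cpp}
 * // The range carries no category, color, payload, or message.
 * nvtx3::event_attributes attr{};
 * nvtx3::scoped_range r{attr};
 * \endcode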
*/ constexpr event_attributes() noexcept : attributes_{ NVTX_VERSION, // version sizeof(nvtxEventAttributes_t), // size 0, // category NVTX_COLOR_UNKNOWN, // color type 0, // color value NVTX_PAYLOAD_UNKNOWN, // payload type 0, // reserved 4B 0, // payload value (union) NVTX_MESSAGE_UNKNOWN, // message type 0 // message value (union) } { } /** * @brief Variadic constructor where the first argument is a `category`. * * Sets the value of the `EventAttribute`s category based on `c` and * forwards the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(category const& c, Args const&... args) noexcept : event_attributes(args...) { attributes_.category = c.get_id(); } /** * @brief Variadic constructor where the first argument is a `color`. * * Sets the value of the `EventAttribute`s color based on `c` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(color const& c, Args const&... args) noexcept : event_attributes(args...) { attributes_.color = c.get_value(); attributes_.colorType = c.get_type(); } /** * @brief Variadic constructor where the first argument is a `payload`. * * Sets the value of the `EventAttribute`s payload based on `p` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(payload const& p, Args const&... args) noexcept : event_attributes(args...) { attributes_.payload = p.get_value(); attributes_.payloadType = p.get_type(); } /** * @brief Variadic constructor where the first argument is a `message`. * * Sets the value of the `EventAttribute`s message based on `m` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(message const& m, Args const&... args) noexcept : event_attributes(args...) { attributes_.message = m.get_value(); attributes_.messageType = m.get_type(); } /** * @brief Variadic constructor where the first argument is a binary payload. * * Sets the value of the `EventAttribute`s message based on `m` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept : event_attributes(args...) { attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY; attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event. attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl); } ~event_attributes() = default; event_attributes(event_attributes const&) = default; event_attributes& operator=(event_attributes const&) = default; event_attributes(event_attributes&&) = default; event_attributes& operator=(event_attributes&&) = default; /** * @brief Get raw pointer to underlying NVTX attributes object. * */ constexpr value_type const* get() const noexcept { return &attributes_; } private: value_type attributes_{}; ///< The NVTX attributes structure }; /** * @brief A RAII object for creating a NVTX range local to a thread within a * domain. * * When constructed, begins a nested NVTX range on the calling thread in the * specified domain. Upon destruction, ends the NVTX range. * * Behavior is undefined if a `scoped_range_in` object is * created/destroyed on different threads. * * `scoped_range_in` is neither moveable nor copyable. * * `scoped_range_in`s may be nested within other ranges. 
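 *
 * Example of nesting (illustrative, global domain):
 * \code{.cpp}
 * {
 *   nvtx3::scoped_range outer{"outer"};
 *   {
 *     nvtx3::scoped_range inner{"inner"}; // nested inside "outer"
 *   } // "inner" ends here
 * }   // "outer" ends here
 * \endcode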
* * The domain of the range is specified by the template type parameter `D`. * By default, the `domain::global` is used, which scopes the range to the * global NVTX domain. The convenience alias `scoped_range` is provided for * ranges scoped to the global domain. * * A custom domain can be defined by creating a type, `D`, with a static * member `D::name` whose value is used to name the domain associated with * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*` * * Example: * \code{.cpp} * // Define a type `my_domain` with a member `name` used to name the domain * // associated with the type `my_domain`. * struct my_domain{ * static constexpr char const* name{"my domain"}; * }; * \endcode * * Usage: * \code{.cpp} * nvtx3::scoped_range_in r1{"range 1"}; // Range in my domain * * // Three equivalent ways to make a range in the global domain: * nvtx3::scoped_range_in r2{"range 2"}; * nvtx3::scoped_range_in<> r3{"range 3"}; * nvtx3::scoped_range r4{"range 4"}; * * // Create an alias to succinctly make ranges in my domain: * using my_scoped_range = nvtx3::scoped_range_in; * * my_scoped_range r3{"range 3"}; * \endcode */ template class scoped_range_in { public: /** * @brief Construct a `scoped_range_in` with the specified * `event_attributes` * * Example: * \code{cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * nvtx3::scoped_range range{attr}; // Creates a range with message contents * // "msg" and green color * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. */ explicit scoped_range_in(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE nvtxDomainRangePushEx(domain::get(), attr.get()); #else (void)attr; #endif } /** * @brief Constructs a `scoped_range_in` from the constructor arguments * of an `event_attributes`. * * Forwards the arguments `args...` to construct an * `event_attributes` object. The `event_attributes` object is then * associated with the `scoped_range_in`. * * For more detail, see `event_attributes` documentation. * * Example: * \code{cpp} * // Creates a range with message "message" and green color * nvtx3::scoped_range r{"message", nvtx3::rgb{127,255,0}}; * \endcode * * @param[in] args Arguments to used to construct an `event_attributes` associated with this * range. * */ template explicit scoped_range_in(Args const&... args) noexcept : scoped_range_in{event_attributes{args...}} { } /** * @brief Default constructor creates a `scoped_range_in` with no * message, color, payload, nor category. * */ scoped_range_in() noexcept : scoped_range_in{event_attributes{}} {} /** * @brief Delete `operator new` to disallow heap allocated objects. * * `scoped_range_in` must follow RAII semantics to guarantee proper push/pop semantics. * */ void* operator new(std::size_t) = delete; scoped_range_in(scoped_range_in const&) = delete; scoped_range_in& operator=(scoped_range_in const&) = delete; scoped_range_in(scoped_range_in&&) = delete; scoped_range_in& operator=(scoped_range_in&&) = delete; /** * @brief Destroy the scoped_range_in, ending the NVTX range event. */ ~scoped_range_in() noexcept { #ifndef NVTX_DISABLE nvtxDomainRangePop(domain::get()); #endif } }; /** * @brief Alias for a `scoped_range_in` in the global NVTX domain. 
* */ using scoped_range = scoped_range_in; namespace detail { /// @cond internal template class optional_scoped_range_in { public: optional_scoped_range_in() = default; void begin(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE // This class is not meant to be part of the public NVTX C++ API and should // only be used in the `NVTX3_FUNC_RANGE_IF` and `NVTX3_FUNC_RANGE_IF_IN` // macros. However, to prevent developers from misusing this class, make // sure to not start multiple ranges. if (initialized) { return; } nvtxDomainRangePushEx(domain::get(), attr.get()); initialized = true; #endif } ~optional_scoped_range_in() noexcept { #ifndef NVTX_DISABLE if (initialized) { nvtxDomainRangePop(domain::get()); } #endif } void* operator new(std::size_t) = delete; optional_scoped_range_in(optional_scoped_range_in const&) = delete; optional_scoped_range_in& operator=(optional_scoped_range_in const&) = delete; optional_scoped_range_in(optional_scoped_range_in&&) = delete; optional_scoped_range_in& operator=(optional_scoped_range_in&&) = delete; private: #ifndef NVTX_DISABLE bool initialized = false; #endif }; /// @endcond } // namespace detail /** * @brief Handle used for correlating explicit range start and end events. * * A handle is "null" if it does not correspond to any range. * */ struct range_handle { /// Type used for the handle's value using value_type = nvtxRangeId_t; /** * @brief Construct a `range_handle` from the given id. * */ constexpr explicit range_handle(value_type id) noexcept : _range_id{id} {} /** * @brief Constructs a null range handle. * * A null range_handle corresponds to no range. Calling `end_range` on a * null handle is undefined behavior when a tool is active. * */ constexpr range_handle() noexcept = default; /** * @brief Checks whether this handle is null * * Provides contextual conversion to `bool`. * * \code{cpp} * range_handle handle{}; * if (handle) {...} * \endcode * */ constexpr explicit operator bool() const noexcept { return get_value() != null_range_id; }; /** * @brief Implicit conversion from `nullptr` constructs a null handle. * * Satisfies the "NullablePointer" requirement to make `range_handle` comparable with `nullptr`. * */ constexpr range_handle(std::nullptr_t) noexcept {} /** * @brief Returns the `range_handle`'s value * * @return value_type The handle's value */ constexpr value_type get_value() const noexcept { return _range_id; } private: /// Sentinel value for a null handle that corresponds to no range static constexpr value_type null_range_id = nvtxRangeId_t{0}; value_type _range_id{null_range_id}; ///< The underlying NVTX range id }; /** * @brief Compares two range_handles for equality * * @param lhs The first range_handle to compare * @param rhs The second range_handle to compare */ inline constexpr bool operator==(range_handle lhs, range_handle rhs) noexcept { return lhs.get_value() == rhs.get_value(); } /** * @brief Compares two range_handles for inequality * * @param lhs The first range_handle to compare * @param rhs The second range_handle to compare */ inline constexpr bool operator!=(range_handle lhs, range_handle rhs) noexcept { return !(lhs == rhs); } /** * @brief Manually begin an NVTX range. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. 
* * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * Example: * \code{.cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range_in(attr); * ... * nvtx3::end_range_in(h); // End the range * \endcode * * @tparam D Type containing `name` member used to identify the `domain` * to which the range belongs. Else, `domain::global` to indicate that the * global NVTX domain should be used. * @param[in] attr `event_attributes` that describes the desired attributes * of the range. * @return Unique handle to be passed to `end_range_in` to end the range. */ template inline range_handle start_range_in(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE return range_handle{nvtxDomainRangeStartEx(domain::get(), attr.get())}; #else (void)attr; return {}; #endif } /** * @brief Manually begin an NVTX range. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * This overload uses `args...` to construct an `event_attributes` to * associate with the range. For more detail, see `event_attributes`. * * Example: * \code{cpp} * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range_in("msg", nvtx3::rgb{127,255,0}); * ... * nvtx3::end_range_in(h); // Ends the range * \endcode * * @tparam D Type containing `name` member used to identify the `domain` * to which the range belongs. Else, `domain::global` to indicate that the * global NVTX domain should be used. * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. * @return Unique handle to be passed to `end_range` to end the range. */ template inline range_handle start_range_in(Args const&... args) noexcept { #ifndef NVTX_DISABLE return start_range_in(event_attributes{args...}); #else return {}; #endif } /** * @brief Manually begin an NVTX range in the global domain. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range/end_range` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * Example: * \code{.cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range(attr); * ... * nvtx3::end_range(h); // End the range * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. * @return Unique handle to be passed to `end_range_in` to end the range. */ inline range_handle start_range(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE return start_range_in(attr); #else (void)attr; return {}; #endif } /** * @brief Manually begin an NVTX range in the global domain. 
* * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * This overload uses `args...` to construct an `event_attributes` to * associate with the range. For more detail, see `event_attributes`. * * Example: * \code{cpp} * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range("msg", nvtx3::rgb{127,255,0}); * ... * nvtx3::end_range(h); // Ends the range * \endcode * * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. * @return Unique handle to be passed to `end_range` to end the range. */ template inline range_handle start_range(Args const&... args) noexcept { #ifndef NVTX_DISABLE return start_range_in(args...); #else return {}; #endif } /** * @brief Manually end the range associated with the handle `r` in domain `D`. * * Explicitly ends the NVTX range indicated by the handle `r` returned from a * prior call to `start_range_in`. The range may end on a different thread * from where it began. * * @tparam D Type containing `name` member used to identify the `domain` to * which the range belongs. Else, `domain::global` to indicate that the global * NVTX domain should be used. * @param r Handle to a range started by a prior call to `start_range_in`. * * @warning The domain type specified as template parameter to this function * must be the same that was specified on the associated `start_range_in` call. */ template inline void end_range_in(range_handle r) noexcept { #ifndef NVTX_DISABLE nvtxDomainRangeEnd(domain::get(), r.get_value()); #else (void)r; #endif } /** * @brief Manually end the range associated with the handle `r` in the global * domain. * * Explicitly ends the NVTX range indicated by the handle `r` returned from a * prior call to `start_range`. The range may end on a different thread from * where it began. * * @param r Handle to a range started by a prior call to `start_range`. * * @warning The domain type specified as template parameter to this function * must be the same that was specified on the associated `start_range` call. */ inline void end_range(range_handle r) noexcept { #ifndef NVTX_DISABLE end_range_in(r); #else (void)r; #endif } /** * @brief A RAII object for creating a NVTX range within a domain that can * be created and destroyed on different threads. * * When constructed, begins a NVTX range in the specified domain. Upon * destruction, ends the NVTX range. * * Similar to `nvtx3::scoped_range_in`, with a few key differences: * - `unique_range` objects can be destroyed in an order whereas `scoped_range` objects must be * destroyed in exact reverse creation order * - `unique_range` can start and end on different threads * - `unique_range` is moveable * - `unique_range` objects can be constructed as heap objects * * There is extra overhead associated with `unique_range` constructs and therefore use of * `nvtx3::scoped_range_in` should be preferred. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. 
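 *
 * Example (an illustrative sketch; the heap allocation below is only meant to
 * show that, unlike `scoped_range_in`, heap use and deferred destruction are
 * allowed):
 * \code{.cpp}
 * auto r = std::make_unique<nvtx3::unique_range>("async work");
 * // ... hand `r` off to other code, possibly another thread ...
 * r.reset(); // the range ends when the object is destroyed
 * \endcode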
*/ template class unique_range_in { public: /** * @brief Construct a new unique_range_in object with the specified event attributes * * Example: * \code{cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * nvtx3::unique_range_in range{attr}; // Creates a range with message contents * // "msg" and green color * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. */ explicit unique_range_in(event_attributes const& attr) noexcept : handle_{start_range_in(attr)} { } /** * @brief Constructs a `unique_range_in` from the constructor arguments * of an `event_attributes`. * * Forwards the arguments `args...` to construct an * `event_attributes` object. The `event_attributes` object is then * associated with the `unique_range_in`. * * For more detail, see `event_attributes` documentation. * * Example: * \code{.cpp} * // Creates a range with message "message" and green color * nvtx3::unique_range_in<> r{"message", nvtx3::rgb{127,255,0}}; * \endcode * * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` * associated with this range. */ template explicit unique_range_in(Args const&... args) noexcept : unique_range_in{event_attributes{args...}} { } /** * @brief Default constructor creates a `unique_range_in` with no * message, color, payload, nor category. * */ constexpr unique_range_in() noexcept : unique_range_in{event_attributes{}} {} /** * @brief Destroy the `unique_range_in` ending the range. * */ ~unique_range_in() noexcept = default; /** * @brief Move constructor allows taking ownership of the NVTX range from * another `unique_range_in`. * * @param other The range to take ownership of */ unique_range_in(unique_range_in&& other) noexcept = default; /** * @brief Move assignment operator allows taking ownership of an NVTX range * from another `unique_range_in`. * * @param other The range to take ownership of */ unique_range_in& operator=(unique_range_in&& other) noexcept = default; /// Copy construction is not allowed to prevent multiple objects from owning /// the same range handle unique_range_in(unique_range_in const&) = delete; /// Copy assignment is not allowed to prevent multiple objects from owning the /// same range handle unique_range_in& operator=(unique_range_in const&) = delete; private: struct end_range_handle { using pointer = range_handle; /// Override the pointer type of the unique_ptr void operator()(range_handle h) const noexcept { end_range_in(h); } }; /// Range handle used to correlate the start/end of the range std::unique_ptr handle_; }; /** * @brief Alias for a `unique_range_in` in the global NVTX domain. * */ using unique_range = unique_range_in; /** * @brief Annotates an instantaneous point in time with a "marker", using the * attributes specified by `attr`. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; * nvtx3::mark_in(attr); * } * \endcode * * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. * @param[in] attr `event_attributes` that describes the desired attributes * of the mark. 
*/ template inline void mark_in(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE nvtxDomainMarkEx(domain::get(), attr.get()); #else (void)(attr); #endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * arguments to construct an `event_attributes`. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark_in("operation failed!", nvtx3::rgb{255,0,0}); * } * \endcode * * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. * * Forwards the arguments `args...` to construct an `event_attributes` object. * The attributes are then associated with the marker. For more detail, see * the `event_attributes` documentation. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else `domain::global` to * indicate that the global NVTX domain should be used. * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` * associated with this range. * */ template inline void mark_in(Args const&... args) noexcept { #ifndef NVTX_DISABLE mark_in(event_attributes{args...}); #endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * attributes specified by `attr`, in the global domain. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; * nvtx3::mark(attr); * } * \endcode * * Note that nvtx3::mark is a function, not a class like scoped_range. * * @param[in] attr `event_attributes` that describes the desired attributes * of the mark. */ inline void mark(event_attributes const& attr) noexcept { #ifndef NVTX_DISABLE mark_in(attr); #endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * arguments to construct an `event_attributes`, in the global domain. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark("operation failed!", nvtx3::rgb{255,0,0}); * } * \endcode * * Note that nvtx3::mark is a function, not a class like scoped_range. * * Forwards the arguments `args...` to construct an `event_attributes` object. * The attributes are then associated with the marker. For more detail, see * the `event_attributes` documentation. * * @param[in] args Variadic parameter pack of arguments to construct an * `event_attributes` associated with this range. * */ template inline void mark(Args const&... args) noexcept { #ifndef NVTX_DISABLE mark_in(args...); #endif } } // namespace NVTX3_VERSION_NAMESPACE } // namespace nvtx3 #ifndef NVTX_DISABLE /** * @brief Convenience macro for generating a range in the specified `domain` * from the lifetime of a function * * This macro is useful for generating an NVTX range in `domain` from * the entry point of a function to its exit. It is intended to be the first * line of the function. * * Constructs a static `registered_string_in` using the name of the immediately * enclosing function returned by `__func__` and constructs a * `nvtx3::scoped_range` using the registered function name as the range's * message. 
* * Example: * \code{.cpp} * struct my_domain{static constexpr char const* name{"my_domain"};}; * * void foo(...) { * NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo() * // do stuff * ... * } // Range ends on return from foo() * \endcode * * @param[in] D Type containing `name` member used to identify the * `domain` to which the `registered_string_in` belongs. Else, * `domain::global` to indicate that the global NVTX domain should be used. */ #define NVTX3_V1_FUNC_RANGE_IN(D) \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; /** * @brief Convenience macro for generating a range in the specified `domain` * from the lifetime of a function if the given boolean expression evaluates * to true. * * Similar to `NVTX3_V1_FUNC_RANGE_IN(D)`, the only difference being that * `NVTX3_V1_FUNC_RANGE_IF_IN(D, C)` only generates a range if the given boolean * expression evaluates to true. * * @param[in] D Type containing `name` member used to identify the * `domain` to which the `registered_string_in` belongs. Else, * `domain::global` to indicate that the global NVTX domain should be used. * * @param[in] C Boolean expression used to determine if a range should be * generated. */ #define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) \ ::nvtx3::v1::detail::optional_scoped_range_in optional_nvtx3_range__; \ if (C) { \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ optional_nvtx3_range__.begin(nvtx3_func_attr__); \ } #else #define NVTX3_V1_FUNC_RANGE_IN(D) #define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) #endif // NVTX_DISABLE /** * @brief Convenience macro for generating a range in the global domain from the * lifetime of a function. * * This macro is useful for generating an NVTX range in the global domain from * the entry point of a function to its exit. It is intended to be the first * line of the function. * * Constructs a static `registered_string_in` using the name of the immediately * enclosing function returned by `__func__` and constructs a * `nvtx3::scoped_range` using the registered function name as the range's * message. * * Example: * \code{.cpp} * void foo(...) { * NVTX3_FUNC_RANGE(); // Range begins on entry to foo() * // do stuff * ... * } // Range ends on return from foo() * \endcode */ #define NVTX3_V1_FUNC_RANGE() NVTX3_V1_FUNC_RANGE_IN(::nvtx3::v1::domain::global) /** * @brief Convenience macro for generating a range in the global domain from the * lifetime of a function if the given boolean expression evaluates to true. * * Similar to `NVTX3_V1_FUNC_RANGE()`, the only difference being that * `NVTX3_V1_FUNC_RANGE_IF(C)` only generates a range if the given boolean * expression evaluates to true. * * @param[in] C Boolean expression used to determine if a range should be * generated. */ #define NVTX3_V1_FUNC_RANGE_IF(C) NVTX3_V1_FUNC_RANGE_IF_IN(::nvtx3::v1::domain::global, C) /* When inlining this version, versioned macros must have unversioned aliases. 
* For each NVTX3_Vx_ #define, make an NVTX3_ alias of it here.*/ #if defined(NVTX3_INLINE_THIS_VERSION) /* clang format off */ #define NVTX3_FUNC_RANGE NVTX3_V1_FUNC_RANGE #define NVTX3_FUNC_RANGE_IF NVTX3_V1_FUNC_RANGE_IF #define NVTX3_FUNC_RANGE_IN NVTX3_V1_FUNC_RANGE_IN #define NVTX3_FUNC_RANGE_IF_IN NVTX3_V1_FUNC_RANGE_IF_IN /* clang format on */ #endif #endif // NVTX3_CPP_DEFINITIONS_V1_0 /* Add functionality for new minor versions here, by copying the above section enclosed * in #ifndef NVTX3_CPP_DEFINITIONS_Vx_y, and incrementing the minor version. This code * is an example of how additions for version 1.2 would look, indented for clarity. Note * that the versioned symbols and macros are always provided, and the unversioned symbols * are only provided if NVTX3_INLINE_THIS_VERSION was defined at the top of this header. * * \code{.cpp} * #ifndef NVTX3_CPP_DEFINITIONS_V1_2 * #define NVTX3_CPP_DEFINITIONS_V1_2 * namespace nvtx3 { * NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE { * class new_class {}; * inline void new_function() {} * } * } * * // Macros must have the major version in their names: * #define NVTX3_V1_NEW_MACRO_A() ... * #define NVTX3_V1_NEW_MACRO_B() ... * * // If inlining, make aliases for the macros with the version number omitted * #if defined(NVTX3_INLINE_THIS_VERSION) * #define NVTX3_NEW_MACRO_A NVTX3_V1_NEW_MACRO_A * #define NVTX3_NEW_MACRO_B NVTX3_V1_NEW_MACRO_B * #endif * #endif // NVTX3_CPP_DEFINITIONS_V1_2 * \endcode */ /* Undefine all temporarily-defined unversioned macros, which would conflict with * subsequent includes of different versions of this header. */ #undef NVTX3_CPP_VERSION_MAJOR #undef NVTX3_CPP_VERSION_MINOR #undef NVTX3_CONCAT #undef NVTX3_NAMESPACE_FOR #undef NVTX3_VERSION_NAMESPACE #undef NVTX3_INLINE_IF_REQUESTED #undef NVTX3_CONSTEXPR_IF_CPP14 #if defined(NVTX3_INLINE_THIS_VERSION) #undef NVTX3_INLINE_THIS_VERSION #endif #if defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE) #undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE #undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET #endif #if defined(NVTX3_STATIC_ASSERT_DEFINED_HERE) #undef NVTX3_STATIC_ASSERT_DEFINED_HERE #undef NVTX3_STATIC_ASSERT #endif nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/000077500000000000000000000000001444201535000202765ustar00rootroot00000000000000nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxImpl.h000066400000000000000000000522421444201535000222750ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). #endif /* ---- Include required platform headers ---- */ #if defined(_WIN32) #include #else #include #if defined(__ANDROID__) #include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* ---- Define macros used in this file ---- */ #define NVTX_INIT_STATE_FRESH 0 #define NVTX_INIT_STATE_STARTED 1 #define NVTX_INIT_STATE_COMPLETE 2 #ifdef NVTX_DEBUG_PRINT #ifdef __ANDROID__ #include #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); #define NVTX_INFO(...) 
__android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); #else #include #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) #endif #else /* !defined(NVTX_DEBUG_PRINT) */ #define NVTX_ERR(...) #define NVTX_INFO(...) #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #ifdef __GNUC__ #pragma GCC visibility push(hidden) #endif /* ---- Forward declare all functions referenced in globals ---- */ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)( NvtxCallbackModule module, NvtxFunctionTable* out_table, unsigned int* out_size); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)( uint32_t version); NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)( uint32_t exportTableId); #include "nvtxInitDecls.h" /* ---- Define all globals ---- */ typedef struct nvtxGlobals_t { volatile unsigned int initState; NvtxExportTableCallbacks etblCallbacks; NvtxExportTableVersionInfo etblVersionInfo; /* Implementation function pointers */ nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr; nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr; nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr; nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr; nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr; nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr; nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr; nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr; nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr; nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr; nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr; nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr; nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr; nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr; nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr; nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr; nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr; nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr; nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr; nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr; nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr; nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr; nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr; nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr; nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr; nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr; nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr; nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr; nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr; nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr; nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr; nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr; nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr; nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr; nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr; nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr; nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr; nvtxNameCudaDeviceA_impl_fntype 
nvtxNameCudaDeviceA_impl_fnptr; nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr; nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr; nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr; nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr; nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr; nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr; nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr; nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr; nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr; nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr; nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr; nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr; nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr; nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr; nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr; nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr; nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr; nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr; nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr; nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr; nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr; nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr; nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr; nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr; nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr; nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr; /* Tables of function pointers -- Extra null added to the end to ensure * a crash instead of silent corruption if a tool reads off the end. 
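 * Each non-null entry holds the address of the corresponding *_impl_fnptr
 * member above; a tool that obtains one of these tables through
 * nvtxEtiGetModuleFunctionTable() can therefore redirect an API call by
 * writing a replacement function pointer through the table slot.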
*/ NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1]; NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1]; NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1]; NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1]; NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1]; NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1]; } nvtxGlobals_t; NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) = { NVTX_INIT_STATE_FRESH, { sizeof(NvtxExportTableCallbacks), NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable) }, { sizeof(NvtxExportTableVersionInfo), NVTX_VERSION, 0, NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion) }, /* Implementation function pointers */ NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init), 
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init), NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init), /* Tables of function pointers */ { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr, 
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr, 0 }, { 0, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr, (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr, 0 } }; /* ---- Define static inline implementations of core API 
functions ---- */ #include "nvtxImplCore.h" /* ---- Define implementations of export table functions ---- */ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)( NvtxCallbackModule module, NvtxFunctionTable* out_table, unsigned int* out_size) { unsigned int bytes = 0; NvtxFunctionTable table = (NvtxFunctionTable)0; switch (module) { case NVTX_CB_MODULE_CORE: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE); break; case NVTX_CB_MODULE_CUDA: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA); break; case NVTX_CB_MODULE_OPENCL: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL); break; case NVTX_CB_MODULE_CUDART: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART); break; case NVTX_CB_MODULE_CORE2: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2); break; case NVTX_CB_MODULE_SYNC: table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC; bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC); break; default: return 0; } if (out_size) *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1; if (out_table) *out_table = table; return 1; } NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId) { switch (exportTableId) { case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks; case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo; default: return 0; } } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version) { /* Reserved for custom implementations to resolve problems with tools */ (void)version; } /* ---- Define implementations of init versions of all API functions ---- */ #include "nvtxInitDefs.h" /* ---- Define implementations of initialization functions ---- */ #include "nvtxInit.h" #ifdef __GNUC__ #pragma GCC visibility pop #endif #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxImplCore.h000066400000000000000000000234331444201535000231060ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. 
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr; if(local!=0) (*local)(eventAttrib); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message) { #ifndef NVTX_DISABLE nvtxMarkA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr; if(local!=0) (*local)(message); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxMarkW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr; if(local!=0) (*local)(message); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr; if(local!=0) return (*local)(eventAttrib); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message) { #ifndef NVTX_DISABLE nvtxRangeStartA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxRangeStartW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id) { #ifndef NVTX_DISABLE nvtxRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr; if(local!=0) (*local)(id); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr; if(local!=0) return (*local)(eventAttrib); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message) { #ifndef NVTX_DISABLE nvtxRangePushA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxRangePushW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxRangePop(void) { #ifndef NVTX_DISABLE nvtxRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr; if(local!=0) return (*local)(); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name) { #ifndef NVTX_DISABLE nvtxNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr; if(local!=0) (*local)(category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCategoryW_impl_fntype local = 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr; if(local!=0) (*local)(category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name) { #ifndef NVTX_DISABLE nvtxNameOsThreadA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr; if(local!=0) (*local)(threadId, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameOsThreadW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr; if(local!=0) (*local)(threadId, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxDomainMarkEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr; if(local!=0) (*local)(domain, eventAttrib); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxDomainRangeStartEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr; if(local!=0) return (*local)(domain, eventAttrib); else #endif /*NVTX_DISABLE*/ return (nvtxRangeId_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id) { #ifndef NVTX_DISABLE nvtxDomainRangeEnd_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr; if(local!=0) (*local)(domain, id); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { #ifndef NVTX_DISABLE nvtxDomainRangePushEx_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr; if(local!=0) return (*local)(domain, eventAttrib); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain) { #ifndef NVTX_DISABLE nvtxDomainRangePop_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr; if(local!=0) return (*local)(domain); else #endif /*NVTX_DISABLE*/ return (int)NVTX_NO_PUSH_POP_TRACKING; } NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs) { #ifndef NVTX_DISABLE nvtxDomainResourceCreate_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr; if(local!=0) return (*local)(domain, attribs); else #endif /*NVTX_DISABLE*/ return (nvtxResourceHandle_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource) { #ifndef NVTX_DISABLE nvtxDomainResourceDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr; if(local!=0) (*local)(resource); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name) { #ifndef NVTX_DISABLE nvtxDomainNameCategoryA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr; if(local!=0) (*local)(domain, category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxDomainNameCategoryW_impl_fntype local = 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr; if(local!=0) (*local)(domain, category, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string) { #ifndef NVTX_DISABLE nvtxDomainRegisterStringA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr; if(local!=0) return (*local)(domain, string); else #endif /*NVTX_DISABLE*/ return (nvtxStringHandle_t)0; } NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string) { #ifndef NVTX_DISABLE nvtxDomainRegisterStringW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr; if(local!=0) return (*local)(domain, string); else #endif /*NVTX_DISABLE*/ return (nvtxStringHandle_t)0; } NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* message) { #ifndef NVTX_DISABLE nvtxDomainCreateA_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxDomainHandle_t)0; } NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* message) { #ifndef NVTX_DISABLE nvtxDomainCreateW_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr; if(local!=0) return (*local)(message); else #endif /*NVTX_DISABLE*/ return (nvtxDomainHandle_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain) { #ifndef NVTX_DISABLE nvtxDomainDestroy_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr; if(local!=0) (*local)(domain); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved) { #ifndef NVTX_DISABLE nvtxInitialize_impl_fntype local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr; if(local!=0) (*local)(reserved); #endif /*NVTX_DISABLE*/ } nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h000066400000000000000000000060401444201535000240030ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_CUDART #error Never include this file directly -- it is automatically included by nvToolsExtCudaRt.h (except when NVTX_NO_IMPL is defined). 
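/* Usage sketch (illustrative only, not part of this header): the core C API
   wrappers defined in nvtxImplCore.h above simply forward through the
   function-pointer slots in nvtxGlobals, so with no tool attached every call
   below is a cheap no-op and the push/pop calls return
   NVTX_NO_PUSH_POP_TRACKING. The include path below is an assumption and may
   differ per installation.

       #include "nvToolsExt.h"            // assumed include

       void annotated_step(void)
       {
           nvtxMarkA("checkpoint");       // instantaneous event
           nvtxRangePushA("compute");     // thread-local nested range
           // ... work ...
           nvtxRangePop();
       }

       void annotated_io(void)
       {
           nvtxRangeId_t id = nvtxRangeStartA("io");  // start/end may span threads
           // ... work ...
           nvtxRangeEnd(id);
       }
*/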
#endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name); typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name); typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name); NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name) { #ifndef NVTX_DISABLE nvtxNameCudaDeviceA_impl_fntype local = (nvtxNameCudaDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCudaDeviceW_impl_fntype local = (nvtxNameCudaDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name) { #ifndef NVTX_DISABLE nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name) { #ifndef NVTX_DISABLE nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h000066400000000000000000000074531444201535000235060ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_CUDA #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
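/* Usage sketch (illustrative only): the CUDA runtime naming helpers defined
   above attach human-readable labels to devices, streams and events so tools
   can display them; the CUDA driver variants that follow (nvtxNameCuDeviceA
   and friends) work the same way for CUdevice/CUstream/CUevent handles. The
   include paths are assumptions.

       #include <cuda_runtime.h>
       #include "nvToolsExtCudaRt.h"      // assumed include

       void label_cuda_resources(void)
       {
           cudaStream_t stream;
           cudaStreamCreate(&stream);
           nvtxNameCudaDeviceA(0, "primary GPU");
           nvtxNameCudaStreamA(stream, "H2D copy stream");
           // labels are only recorded when an injection library is attached
       }
*/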
#endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name); typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name); typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name); typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name); typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name); NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; if(local!=0) (*local)(stream, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name) { #ifndef NVTX_DISABLE nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; if(local!=0) (*local)(event, name); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h000066400000000000000000000147201444201535000237450ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. 
* * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_OPENCL #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef void (NVTX_API * nvtxNameClDeviceA_impl_fntype)(cl_device_id device, const char* name); typedef void (NVTX_API * nvtxNameClDeviceW_impl_fntype)(cl_device_id device, const wchar_t* name); typedef void (NVTX_API * nvtxNameClContextA_impl_fntype)(cl_context context, const char* name); typedef void (NVTX_API * nvtxNameClContextW_impl_fntype)(cl_context context, const wchar_t* name); typedef void (NVTX_API * nvtxNameClCommandQueueA_impl_fntype)(cl_command_queue command_queue, const char* name); typedef void (NVTX_API * nvtxNameClCommandQueueW_impl_fntype)(cl_command_queue command_queue, const wchar_t* name); typedef void (NVTX_API * nvtxNameClMemObjectA_impl_fntype)(cl_mem memobj, const char* name); typedef void (NVTX_API * nvtxNameClMemObjectW_impl_fntype)(cl_mem memobj, const wchar_t* name); typedef void (NVTX_API * nvtxNameClSamplerA_impl_fntype)(cl_sampler sampler, const char* name); typedef void (NVTX_API * nvtxNameClSamplerW_impl_fntype)(cl_sampler sampler, const wchar_t* name); typedef void (NVTX_API * nvtxNameClProgramA_impl_fntype)(cl_program program, const char* name); typedef void (NVTX_API * nvtxNameClProgramW_impl_fntype)(cl_program program, const wchar_t* name); typedef void (NVTX_API * nvtxNameClEventA_impl_fntype)(cl_event evnt, const char* name); typedef void (NVTX_API * nvtxNameClEventW_impl_fntype)(cl_event evnt, const wchar_t* name); NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name) { #ifndef NVTX_DISABLE nvtxNameClDeviceA_impl_fntype local = (nvtxNameClDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClDeviceW_impl_fntype local = (nvtxNameClDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr; if(local!=0) (*local)(device, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name) { #ifndef NVTX_DISABLE nvtxNameClContextA_impl_fntype local = (nvtxNameClContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClContextW_impl_fntype local = (nvtxNameClContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr; if(local!=0) (*local)(context, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name) { #ifndef NVTX_DISABLE nvtxNameClCommandQueueA_impl_fntype local = (nvtxNameClCommandQueueA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr; if(local!=0) (*local)(command_queue, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name) { #ifndef NVTX_DISABLE 
nvtxNameClCommandQueueW_impl_fntype local = (nvtxNameClCommandQueueW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr; if(local!=0) (*local)(command_queue, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name) { #ifndef NVTX_DISABLE nvtxNameClMemObjectA_impl_fntype local = (nvtxNameClMemObjectA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr; if(local!=0) (*local)(memobj, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClMemObjectW_impl_fntype local = (nvtxNameClMemObjectW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr; if(local!=0) (*local)(memobj, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name) { #ifndef NVTX_DISABLE nvtxNameClSamplerA_impl_fntype local = (nvtxNameClSamplerA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr; if(local!=0) (*local)(sampler, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClSamplerW_impl_fntype local = (nvtxNameClSamplerW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr; if(local!=0) (*local)(sampler, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name) { #ifndef NVTX_DISABLE nvtxNameClProgramA_impl_fntype local = (nvtxNameClProgramA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr; if(local!=0) (*local)(program, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClProgramW_impl_fntype local = (nvtxNameClProgramW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr; if(local!=0) (*local)(program, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name) { #ifndef NVTX_DISABLE nvtxNameClEventA_impl_fntype local = (nvtxNameClEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr; if(local!=0) (*local)(evnt, name); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name) { #ifndef NVTX_DISABLE nvtxNameClEventW_impl_fntype local = (nvtxNameClEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr; if(local!=0) (*local)(evnt, name); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h000066400000000000000000000064331444201535000235430ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD_SYNC #error Never include this file directly -- it is automatically included by nvToolsExtCuda.h (except when NVTX_NO_IMPL is defined). 
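/* Usage sketch (illustrative only): the OpenCL naming helpers defined above
   follow the same pattern as the CUDA ones -- each call forwards through a
   function-pointer slot and is a no-op unless a tool is attached. The NVTX
   header name below is an assumption.

       #include <CL/cl.h>
       #include "nvToolsExtOpenCL.h"      // assumed include

       void label_cl_resources(cl_device_id dev, cl_command_queue q, cl_mem buf)
       {
           nvtxNameClDeviceA(dev, "render device");
           nvtxNameClCommandQueueA(q, "upload queue");
           nvtxNameClMemObjectA(buf, "vertex buffer");
       }
*/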
#endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs) { #ifndef NVTX_DISABLE nvtxDomainSyncUserCreate_impl_fntype local = (nvtxDomainSyncUserCreate_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; if(local!=0) return (*local)(domain, attribs); else #endif /*NVTX_DISABLE*/ return (nvtxSyncUser_t)0; } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserDestroy_impl_fntype local = (nvtxDomainSyncUserDestroy_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserAcquireStart_impl_fntype local = (nvtxDomainSyncUserAcquireStart_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserAcquireFailed_impl_fntype local = (nvtxDomainSyncUserAcquireFailed_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserAcquireSuccess_impl_fntype local = (nvtxDomainSyncUserAcquireSuccess_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) { #ifndef NVTX_DISABLE nvtxDomainSyncUserReleasing_impl_fntype local = (nvtxDomainSyncUserReleasing_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; if(local!=0) (*local)(handle); #endif /*NVTX_DISABLE*/ } #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxInit.h000066400000000000000000000313711444201535000222770ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
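/* Usage sketch (illustrative only): the sync-user API defined above lets an
   application annotate a custom synchronization primitive so tools can
   attribute blocking time. The mutex type and functions below are
   hypothetical placeholders, the nvtxSyncUser_t handle is assumed to have
   been created earlier with nvtxDomainSyncUserCreate, and the sync header
   (assumed to be nvToolsExtSync.h) is assumed to be included.

       void lock_with_annotation(nvtxSyncUser_t handle, my_mutex_t* m)
       {
           nvtxDomainSyncUserAcquireStart(handle);      // about to try to acquire
           if (my_mutex_trylock(m)) {
               nvtxDomainSyncUserAcquireSuccess(handle);
               // ... critical section ...
               nvtxDomainSyncUserReleasing(handle);     // just before releasing
               my_mutex_unlock(m);
           } else {
               nvtxDomainSyncUserAcquireFailed(handle);
           }
       }
*/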
#endif /* ---- Platform-independent helper definitions and functions ---- */ /* Prefer macros over inline functions to reduce symbol resolution at link time */ #if defined(_WIN32) #define NVTX_PATHCHAR wchar_t #define NVTX_STR(x) L##x #define NVTX_GETENV _wgetenv #define NVTX_BUFSIZE MAX_PATH #define NVTX_DLLHANDLE HMODULE #define NVTX_DLLOPEN(x) LoadLibraryW(x) #define NVTX_DLLFUNC GetProcAddress #define NVTX_DLLCLOSE FreeLibrary #define NVTX_YIELD() SwitchToThread() #define NVTX_MEMBAR() MemoryBarrier() #define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand) #elif defined(__GNUC__) #define NVTX_PATHCHAR char #define NVTX_STR(x) x #define NVTX_GETENV getenv #define NVTX_BUFSIZE PATH_MAX #define NVTX_DLLHANDLE void* #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY) #define NVTX_DLLFUNC dlsym #define NVTX_DLLCLOSE dlclose #define NVTX_YIELD() sched_yield() #define NVTX_MEMBAR() __sync_synchronize() /* Ensure full memory barrier for atomics, to match Windows functions */ #define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value) #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand) #else #error The library does not support your configuration! #endif /* Define this to 1 for platforms that where pre-injected libraries can be discovered. */ #if defined(_WIN32) /* TODO */ #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #else #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0 #endif /* Define this to 1 for platforms that support environment variables */ /* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */ /* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */ #define NVTX_SUPPORT_ENV_VARS 1 /* Define this to 1 for platforms that support dynamic/shared libraries */ #define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1 /* Injection libraries implementing InitializeInjectionNvtx2 may be statically linked, * and this will override any dynamic injection. Useful for platforms where dynamic * injection is not available. Since weak symbols not explicitly marked extern are * guaranteed to be initialized to zero if no definitions are found by the linker, the * dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */ #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1 /* To statically inject an NVTX library, define InitializeInjectionNvtx2_fnptr as a normal * symbol (not weak) pointing to the implementation of InitializeInjectionNvtx2 (which * does not need to be named "InitializeInjectionNvtx2" as is necessary in a dynamic * injection library. */ __attribute__((weak)) NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr; #else #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0 #endif /* This function tries to find or load an NVTX injection library and get the * address of its InitializeInjection2 function. If such a function pointer * is found, it is called, and passed the address of this NVTX instance's * nvtxGetExportTable function, so the injection can attach to this instance. * If the initialization fails for any reason, any dynamic library loaded will * be freed, and all NVTX implementation functions will be set to no-ops. 
If * initialization succeeds, NVTX functions not attached to the tool will be set * to no-ops. This is implemented as one function instead of several small * functions to minimize the number of weak symbols the linker must resolve. * Order of search is: * - Pre-injected library exporting InitializeInjectionNvtx2 * - Loadable library exporting InitializeInjectionNvtx2 * - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64) * - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) * - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr */ NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void); NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(void) { const char* const initFuncName = "InitializeInjectionNvtx2"; NvtxInitializeInjectionNvtxFunc_t init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)0; NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0; int entryPointStatus = 0; #if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY /* Use POSIX global symbol chain to query for init function from any module */ init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(0, initFuncName); #endif #if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY /* Try discovering dynamic injection library to load */ if (!init_fnptr) { #if NVTX_SUPPORT_ENV_VARS /* If env var NVTX_INJECTION64_PATH is set, it should contain the path * to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4) ? NVTX_STR("NVTX_INJECTION32_PATH") : NVTX_STR("NVTX_INJECTION64_PATH"); #endif /* NVTX_SUPPORT_ENV_VARS */ NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE]; const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0; /* Refer to this variable explicitly in case all references to it are #if'ed out */ (void)injectionLibraryPathBuf; #if NVTX_SUPPORT_ENV_VARS /* Disable the warning for getenv & _wgetenv -- this usage is safe because * these functions are not called again before using the returned value. */ #if defined(_MSC_VER) #pragma warning( push ) #pragma warning( disable : 4996 ) #endif injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName); #if defined(_MSC_VER) #pragma warning( pop ) #endif #endif #if defined(__ANDROID__) if (!injectionLibraryPath) { const char *bits = (sizeof(void*) == 4) ? "32" : "64"; char cmdlineBuf[32]; char pkgName[PATH_MAX]; int count; int pid; FILE *fp; size_t bytesRead; size_t pos; pid = (int)getpid(); count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid); if (count <= 0 || count >= (int)sizeof(cmdlineBuf)) { NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid); return NVTX_ERR_INIT_ACCESS_LIBRARY; } fp = fopen(cmdlineBuf, "r"); if (!fp) { NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp); fclose(fp); if (bytesRead == 0) { NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } pkgName[bytesRead] = 0; /* String can contain colon as a process separator. In this case the package name is before the colon. 
*/ pos = 0; while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0') { ++pos; } pkgName[pos] = 0; count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits); if (count <= 0 || count >= NVTX_BUFSIZE) { NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits); return NVTX_ERR_INIT_ACCESS_LIBRARY; } /* On Android, verify path is accessible due to aggressive file access restrictions. */ /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */ /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */ if (injectionLibraryPathBuf[0] == '/') { #if (__ANDROID_API__ < 21) int access_err = access(injectionLibraryPathBuf, F_OK | R_OK); #else int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0); #endif if (access_err != 0) { NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } } injectionLibraryPath = injectionLibraryPathBuf; } #endif /* At this point, injectionLibraryPath is specified if a dynamic * injection library was specified by a tool. */ if (injectionLibraryPath) { /* Load the injection library */ injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath); if (!injectionLibraryHandle) { NVTX_ERR("Failed to load injection library\n"); return NVTX_ERR_INIT_LOAD_LIBRARY; } else { /* Attempt to get the injection library's entry-point */ init_fnptr = (NvtxInitializeInjectionNvtxFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName); if (!init_fnptr) { NVTX_DLLCLOSE(injectionLibraryHandle); NVTX_ERR("Failed to get address of function InitializeInjectionNvtx2 from injection library\n"); return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT; } } } } #endif #if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY if (!init_fnptr) { /* Check weakly-defined function pointer. A statically-linked injection can define this as * a normal symbol and it will take precedence over a dynamic injection. */ if (InitializeInjectionNvtx2_fnptr) { init_fnptr = InitializeInjectionNvtx2_fnptr; } } #endif /* At this point, if init_fnptr is not set, then no tool has specified * an NVTX injection library -- return non-success result so all NVTX * API functions will be set to no-ops. */ if (!init_fnptr) { return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE; } /* Invoke injection library's initialization function. If it returns * 0 (failure) and a dynamic injection was loaded, unload it. 
*/ entryPointStatus = init_fnptr(NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)); if (entryPointStatus == 0) { NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n"); if (injectionLibraryHandle) { NVTX_DLLCLOSE(injectionLibraryHandle); } return NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT; } return NVTX_SUCCESS; } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void) { unsigned int old; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState == NVTX_INIT_STATE_COMPLETE) { return; } NVTX_ATOMIC_CAS_32( old, &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState, NVTX_INIT_STATE_STARTED, NVTX_INIT_STATE_FRESH); if (old == NVTX_INIT_STATE_FRESH) { int result; int forceAllToNoops; /* Load & initialize injection library -- it will assign the function pointers */ result = NVTX_VERSIONED_IDENTIFIER(nvtxInitializeInjectionLibrary)(); /* Set all pointers not assigned by the injection to null */ forceAllToNoops = result != NVTX_SUCCESS; /* Set all to null if injection init failed */ NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(forceAllToNoops); /* Signal that initialization has finished, so now the assigned function pointers will be used */ NVTX_ATOMIC_WRITE_32( &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState, NVTX_INIT_STATE_COMPLETE); } else /* Spin-wait until initialization has finished */ { NVTX_MEMBAR(); while (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).initState != NVTX_INIT_STATE_COMPLETE) { NVTX_YIELD(); NVTX_MEMBAR(); } } } nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxInitDecls.h000066400000000000000000000226201444201535000232470ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
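/* Sketch of a tool-side injection library (illustrative only): the loader in
   nvtxInit.h above resolves a function named exactly "InitializeInjectionNvtx2",
   either from a shared library pointed to by NVTX_INJECTION64_PATH /
   NVTX_INJECTION32_PATH or from a statically linked
   InitializeInjectionNvtx2_fnptr, and calls it with nvtxGetExportTable. A
   minimal entry point therefore looks roughly like the following; the exact
   export-table structure layout is defined elsewhere in the NVTX headers and
   is only hinted at here.

       #include <stdint.h>
       #include "nvToolsExt.h"   // assumed include; provides NVTX_API, NVTX_ETID_CALLBACKS

       typedef const void* (NVTX_API * GetExportTableFn)(uint32_t etid);  // assumed alias

       // must be an exported symbol of the injection shared library
       int NVTX_API InitializeInjectionNvtx2(GetExportTableFn getExportTable)
       {
           const void* etblCallbacks = getExportTable(NVTX_ETID_CALLBACKS);
           if (etblCallbacks == 0)
               return 0;   // 0 tells the client that initialization failed
           // ... use the callbacks table to fetch the per-module function
           //     tables and overwrite the _impl_fnptr slots to intercept ...
           return 1;       // nonzero means success; the patched pointers stay in use
       }
*/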
#endif NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void 
NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API 
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved); NVTX_LINKONCE_FWDDECL_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle); NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle); nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxInitDefs.h000066400000000000000000001055471444201535000231100ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
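/* Pattern note (illustrative only): every _impl_init function forward-declared
   above is the first-call trampoline for the matching slot in nvtxGlobals.
   The public entry reads the slot and calls through it; the trampoline runs
   the one-time initialization (which may repoint or null the slot) and then
   re-dispatches through the public entry, as the definitions below show. A
   generic, self-contained form of the same pattern, with purely illustrative
   names:

       typedef void (*greet_fn)(const char* who);

       static void greet_impl_init(const char* who);        // trampoline, defined below
       static greet_fn greet_impl_fnptr = greet_impl_init;  // slot starts at the trampoline

       void greet(const char* who)                          // public entry point
       {
           greet_fn local = greet_impl_fnptr;               // snapshot, as the wrappers above do
           if (local != 0) local(who);
       }

       static void greet_impl_init(const char* who)
       {
           // one-time initialization would run here; it may install a
           // tool-provided implementation or, as in this sketch, disable the slot
           greet_impl_fnptr = 0;
           greet(who);                                      // re-dispatch; now a no-op
       }
*/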
#endif NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxMarkEx(eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxMarkA(message); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxMarkW(message); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangeStartEx(eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangeStartA(message); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangeStartW(message); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init)(nvtxRangeId_t id){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxRangeEnd(id); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init)(const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePushEx(eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePushA(message); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePushW(message); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init)(void){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxRangePop(); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init)(uint32_t category, const char* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameCategoryA(category, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init)(uint32_t category, const wchar_t* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameCategoryW(category, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init)(uint32_t threadId, const char* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameOsThreadA(threadId, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init)(uint32_t threadId, const wchar_t* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxNameOsThreadW(threadId, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainMarkEx(domain, eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxRangeId_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return 
nvtxDomainRangeStartEx(domain, eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init)(nvtxDomainHandle_t domain, nvtxRangeId_t id){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainRangeEnd(domain, id); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRangePushEx(domain, eventAttrib); } NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init)(nvtxDomainHandle_t domain){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRangePop(domain); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxResourceHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainResourceCreate(domain, attribs); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init)(nvtxResourceHandle_t resource){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainResourceDestroy(resource); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const char* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainNameCategoryA(domain, category, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainNameCategoryW(domain, category, name); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init)(nvtxDomainHandle_t domain, const char* string){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRegisterStringA(domain, string); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxStringHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init)(nvtxDomainHandle_t domain, const wchar_t* string){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainRegisterStringW(domain, string); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init)(const char* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainCreateA(message); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxDomainHandle_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init)(const wchar_t* message){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); return nvtxDomainCreateW(message); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init)(nvtxDomainHandle_t domain){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxDomainDestroy(domain); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init)(const void* reserved){ NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); nvtxInitialize(reserved); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init)(nvtx_CUdevice device, const char* name){ nvtxNameCuDeviceA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API 
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init)(nvtx_CUdevice device, const wchar_t* name){ nvtxNameCuDeviceW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init)(nvtx_CUcontext context, const char* name){ nvtxNameCuContextA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init)(nvtx_CUcontext context, const wchar_t* name){ nvtxNameCuContextW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init)(nvtx_CUstream stream, const char* name){ nvtxNameCuStreamA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init)(nvtx_CUstream stream, const wchar_t* name){ nvtxNameCuStreamW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init)(nvtx_CUevent event, const char* name){ nvtxNameCuEventA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init)(nvtx_CUevent event, const wchar_t* name){ nvtxNameCuEventW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init)(int device, const char* name){ nvtxNameCudaDeviceA_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init)(int device, const wchar_t* name){ nvtxNameCudaDeviceW_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init)(nvtx_cudaStream_t stream, const char* name){ nvtxNameCudaStreamA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init)(nvtx_cudaStream_t stream, const wchar_t* name){ nvtxNameCudaStreamW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); 
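/* As in the other *_impl_init stubs in this file: after nvtxInitOnce() the
 * dispatch pointer is re-read from nvtxGlobals into a local, and the call is
 * made through that local, presumably so the NULL check and the call observe
 * one consistent value even if a tool updates the table concurrently. */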
local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr; if (local) local(stream, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init)(nvtx_cudaEvent_t event, const char* name){ nvtxNameCudaEventA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init)(nvtx_cudaEvent_t event, const wchar_t* name){ nvtxNameCudaEventW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr; if (local) local(event, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init)(nvtx_cl_device_id device, const char* name){ nvtxNameClDeviceA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init)(nvtx_cl_device_id device, const wchar_t* name){ nvtxNameClDeviceW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr; if (local) local(device, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init)(nvtx_cl_context context, const char* name){ nvtxNameClContextA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init)(nvtx_cl_context context, const wchar_t* name){ nvtxNameClContextW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr; if (local) local(context, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init)(nvtx_cl_command_queue command_queue, const char* name){ nvtxNameClCommandQueueA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr; if (local) local(command_queue, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init)(nvtx_cl_command_queue command_queue, const wchar_t* name){ nvtxNameClCommandQueueW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr; if (local) local(command_queue, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init)(nvtx_cl_mem memobj, const char* name){ nvtxNameClMemObjectA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr; if (local) local(memobj, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init)(nvtx_cl_mem memobj, const wchar_t* name){ nvtxNameClMemObjectW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr; if (local) local(memobj, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init)(nvtx_cl_sampler sampler, const char* name){ nvtxNameClSamplerA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr; if (local) local(sampler, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init)(nvtx_cl_sampler sampler, const wchar_t* name){ nvtxNameClSamplerW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr; if (local) local(sampler, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init)(nvtx_cl_program program, const char* name){ nvtxNameClProgramA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr; if (local) local(program, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init)(nvtx_cl_program program, const wchar_t* name){ nvtxNameClProgramW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr; if (local) local(program, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init)(nvtx_cl_event evnt, const char* name){ nvtxNameClEventA_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr; if (local) local(evnt, name); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init)(nvtx_cl_event evnt, const wchar_t* name){ nvtxNameClEventW_fakeimpl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr; if (local) local(evnt, name); } NVTX_LINKONCE_DEFINE_FUNCTION nvtxSyncUser_t NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs){ nvtxDomainSyncUserCreate_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr; if (local) { return local(domain, attribs); } return (nvtxSyncUser_t)0; } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserDestroy_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserAcquireStart_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserAcquireFailed_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserAcquireSuccess_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init)(nvtxSyncUser_t handle){ nvtxDomainSyncUserReleasing_impl_fntype local; NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(); local = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr; if (local) local(handle); } NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops); NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxSetInitFunctionsToNoops)(int forceAllToNoops) { if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init) || forceAllToNoops) 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr == 
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init) || forceAllToNoops) 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init) || forceAllToNoops) 
NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr = NULL; if (NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr == NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init) || forceAllToNoops) NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr = NULL; } nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h000066400000000000000000000100261444201535000230700ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef __NVTX_LINKONCE_H__ #define __NVTX_LINKONCE_H__ /* This header defines macros to permit making definitions of global variables * and functions in C/C++ header files which may be included multiple times in * a translation unit or linkage unit. It allows authoring header-only libraries * which can be used by multiple other header-only libraries (either as the same * copy or multiple copies), and does not require any build changes, such as * adding another .c file, linking a static library, or deploying a dynamic * library. Globals defined with these macros have the property that they have * the same address, pointing to a single instance, for the entire linkage unit. * It is expected but not guaranteed that each linkage unit will have a separate * instance. * * In some situations it is desirable to declare a variable without initializing * it, refer to it in code or other variables' initializers, and then initialize * it later. Similarly, functions can be prototyped, have their address taken, * and then have their body defined later. In such cases, use the FWDDECL macros * when forward-declaring LINKONCE global variables without initializers and * function prototypes, and then use the DEFINE macros when later defining them. * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro, * following this pattern makes code maximally portable. 
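 *
 * A minimal usage sketch of that forward-declare/define split, with
 * illustrative names only:
 *
 *   NVTX_LINKONCE_FWDDECL_GLOBAL int myCounter;            forward declaration
 *   NVTX_LINKONCE_FWDDECL_FUNCTION void myHelper(void);    prototype
 *       (code here may already take &myCounter or call myHelper)
 *   NVTX_LINKONCE_DEFINE_GLOBAL int myCounter = 0;         definition
 *   NVTX_LINKONCE_DEFINE_FUNCTION void myHelper(void) { }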
*/ #if defined(__MINGW32__) /* MinGW */ #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline NVTX_LINKONCE_WEAK #else #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK #endif #elif defined(_MSC_VER) /* MSVC */ #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL extern "C" __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline #else #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION __inline #endif #elif defined(__CYGWIN__) && defined(__clang__) /* Clang on Cygwin */ #define NVTX_LINKONCE_WEAK __attribute__((section(".gnu.linkonce.0."))) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_WEAK #else #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK #endif #elif defined(__CYGWIN__) /* Assume GCC or compatible */ #define NVTX_LINKONCE_WEAK __attribute__((weak)) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL __declspec(selectany) #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" inline #else #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_WEAK #endif #else /* All others: Assume GCC, clang, or compatible */ #define NVTX_LINKONCE_WEAK __attribute__((weak)) #define NVTX_LINKONCE_HIDDEN __attribute__((visibility("hidden"))) #if defined(__cplusplus) #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION extern "C" NVTX_LINKONCE_HIDDEN inline #else #define NVTX_LINKONCE_DEFINE_GLOBAL NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK #define NVTX_LINKONCE_DEFINE_FUNCTION NVTX_LINKONCE_HIDDEN NVTX_LINKONCE_WEAK #endif #endif #define NVTX_LINKONCE_FWDDECL_GLOBAL NVTX_LINKONCE_DEFINE_GLOBAL extern #define NVTX_LINKONCE_FWDDECL_FUNCTION NVTX_LINKONCE_DEFINE_FUNCTION #endif /* __NVTX_LINKONCE_H__ */ nccl-2.18.3-1/src/include/nvtx3/nvtxDetail/nvtxTypes.h000066400000000000000000000366251444201535000225070ustar00rootroot00000000000000/* * Copyright 2009-2022 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /* This header defines types which are used by the internal implementation * of NVTX and callback subscribers. API clients do not use these types, * so they are defined here instead of in nvToolsExt.h to clarify they are * not part of the NVTX client API. */ #ifndef NVTX_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h. #endif /* ------ Dependency-free types binary-compatible with real types ------- */ /* In order to avoid having the NVTX core API headers depend on non-NVTX * headers like cuda.h, NVTX defines binary-compatible types to use for * safely making the initialization versions of all NVTX functions without * needing to have definitions for the real types. 
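 *
 * For example, nvtx_CUstream below is a void*, which is assumed to be
 * layout-compatible with the driver API's CUstream handle; that lets the
 * "fakeimpl" function-pointer types declared in this header share the same
 * global slots that later receive pointers to the real implementations
 * (whose types, using the true CUDA/OpenCL handles, are defined in the
 * nvtxImpl*_v3.h headers).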
*/ typedef int nvtx_CUdevice; typedef void* nvtx_CUcontext; typedef void* nvtx_CUstream; typedef void* nvtx_CUevent; typedef void* nvtx_cudaStream_t; typedef void* nvtx_cudaEvent_t; typedef void* nvtx_cl_platform_id; typedef void* nvtx_cl_device_id; typedef void* nvtx_cl_context; typedef void* nvtx_cl_command_queue; typedef void* nvtx_cl_mem; typedef void* nvtx_cl_program; typedef void* nvtx_cl_kernel; typedef void* nvtx_cl_event; typedef void* nvtx_cl_sampler; typedef struct nvtxSyncUser* nvtxSyncUser_t; struct nvtxSyncUserAttributes_v0; typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t; /* --------- Types for function pointers (with fake API types) ---------- */ typedef void (NVTX_API * nvtxMarkEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib); typedef void (NVTX_API * nvtxMarkA_impl_fntype)(const char* message); typedef void (NVTX_API * nvtxMarkW_impl_fntype)(const wchar_t* message); typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib); typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartA_impl_fntype)(const char* message); typedef nvtxRangeId_t (NVTX_API * nvtxRangeStartW_impl_fntype)(const wchar_t* message); typedef void (NVTX_API * nvtxRangeEnd_impl_fntype)(nvtxRangeId_t id); typedef int (NVTX_API * nvtxRangePushEx_impl_fntype)(const nvtxEventAttributes_t* eventAttrib); typedef int (NVTX_API * nvtxRangePushA_impl_fntype)(const char* message); typedef int (NVTX_API * nvtxRangePushW_impl_fntype)(const wchar_t* message); typedef int (NVTX_API * nvtxRangePop_impl_fntype)(void); typedef void (NVTX_API * nvtxNameCategoryA_impl_fntype)(uint32_t category, const char* name); typedef void (NVTX_API * nvtxNameCategoryW_impl_fntype)(uint32_t category, const wchar_t* name); typedef void (NVTX_API * nvtxNameOsThreadA_impl_fntype)(uint32_t threadId, const char* name); typedef void (NVTX_API * nvtxNameOsThreadW_impl_fntype)(uint32_t threadId, const wchar_t* name); /* Real impl types are defined in nvtxImplCuda_v3.h, where CUDA headers are included */ typedef void (NVTX_API * nvtxNameCuDeviceA_fakeimpl_fntype)(nvtx_CUdevice device, const char* name); typedef void (NVTX_API * nvtxNameCuDeviceW_fakeimpl_fntype)(nvtx_CUdevice device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuContextA_fakeimpl_fntype)(nvtx_CUcontext context, const char* name); typedef void (NVTX_API * nvtxNameCuContextW_fakeimpl_fntype)(nvtx_CUcontext context, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuStreamA_fakeimpl_fntype)(nvtx_CUstream stream, const char* name); typedef void (NVTX_API * nvtxNameCuStreamW_fakeimpl_fntype)(nvtx_CUstream stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCuEventA_fakeimpl_fntype)(nvtx_CUevent event, const char* name); typedef void (NVTX_API * nvtxNameCuEventW_fakeimpl_fntype)(nvtx_CUevent event, const wchar_t* name); /* Real impl types are defined in nvtxImplOpenCL_v3.h, where OPENCL headers are included */ typedef void (NVTX_API * nvtxNameClDeviceA_fakeimpl_fntype)(nvtx_cl_device_id device, const char* name); typedef void (NVTX_API * nvtxNameClDeviceW_fakeimpl_fntype)(nvtx_cl_device_id device, const wchar_t* name); typedef void (NVTX_API * nvtxNameClContextA_fakeimpl_fntype)(nvtx_cl_context context, const char* name); typedef void (NVTX_API * nvtxNameClContextW_fakeimpl_fntype)(nvtx_cl_context context, const wchar_t* name); typedef void (NVTX_API * nvtxNameClCommandQueueA_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const char* name); typedef void (NVTX_API * 
nvtxNameClCommandQueueW_fakeimpl_fntype)(nvtx_cl_command_queue command_queue, const wchar_t* name); typedef void (NVTX_API * nvtxNameClMemObjectA_fakeimpl_fntype)(nvtx_cl_mem memobj, const char* name); typedef void (NVTX_API * nvtxNameClMemObjectW_fakeimpl_fntype)(nvtx_cl_mem memobj, const wchar_t* name); typedef void (NVTX_API * nvtxNameClSamplerA_fakeimpl_fntype)(nvtx_cl_sampler sampler, const char* name); typedef void (NVTX_API * nvtxNameClSamplerW_fakeimpl_fntype)(nvtx_cl_sampler sampler, const wchar_t* name); typedef void (NVTX_API * nvtxNameClProgramA_fakeimpl_fntype)(nvtx_cl_program program, const char* name); typedef void (NVTX_API * nvtxNameClProgramW_fakeimpl_fntype)(nvtx_cl_program program, const wchar_t* name); typedef void (NVTX_API * nvtxNameClEventA_fakeimpl_fntype)(nvtx_cl_event evnt, const char* name); typedef void (NVTX_API * nvtxNameClEventW_fakeimpl_fntype)(nvtx_cl_event evnt, const wchar_t* name); /* Real impl types are defined in nvtxImplCudaRt_v3.h, where CUDART headers are included */ typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name); typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaStreamA_fakeimpl_fntype)(nvtx_cudaStream_t stream, const char* name); typedef void (NVTX_API * nvtxNameCudaStreamW_fakeimpl_fntype)(nvtx_cudaStream_t stream, const wchar_t* name); typedef void (NVTX_API * nvtxNameCudaEventA_fakeimpl_fntype)(nvtx_cudaEvent_t event, const char* name); typedef void (NVTX_API * nvtxNameCudaEventW_fakeimpl_fntype)(nvtx_cudaEvent_t event, const wchar_t* name); typedef void (NVTX_API * nvtxDomainMarkEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); typedef nvtxRangeId_t (NVTX_API * nvtxDomainRangeStartEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); typedef void (NVTX_API * nvtxDomainRangeEnd_impl_fntype)(nvtxDomainHandle_t domain, nvtxRangeId_t id); typedef int (NVTX_API * nvtxDomainRangePushEx_impl_fntype)(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib); typedef int (NVTX_API * nvtxDomainRangePop_impl_fntype)(nvtxDomainHandle_t domain); typedef nvtxResourceHandle_t (NVTX_API * nvtxDomainResourceCreate_impl_fntype)(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs); typedef void (NVTX_API * nvtxDomainResourceDestroy_impl_fntype)(nvtxResourceHandle_t resource); typedef void (NVTX_API * nvtxDomainNameCategoryA_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const char* name); typedef void (NVTX_API * nvtxDomainNameCategoryW_impl_fntype)(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name); typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringA_impl_fntype)(nvtxDomainHandle_t domain, const char* string); typedef nvtxStringHandle_t (NVTX_API * nvtxDomainRegisterStringW_impl_fntype)(nvtxDomainHandle_t domain, const wchar_t* string); typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateA_impl_fntype)(const char* message); typedef nvtxDomainHandle_t (NVTX_API * nvtxDomainCreateW_impl_fntype)(const wchar_t* message); typedef void (NVTX_API * nvtxDomainDestroy_impl_fntype)(nvtxDomainHandle_t domain); typedef void (NVTX_API * nvtxInitialize_impl_fntype)(const void* reserved); typedef nvtxSyncUser_t (NVTX_API * nvtxDomainSyncUserCreate_impl_fntype)(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs); typedef void (NVTX_API * nvtxDomainSyncUserDestroy_impl_fntype)(nvtxSyncUser_t handle); typedef void 
(NVTX_API * nvtxDomainSyncUserAcquireStart_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireFailed_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserAcquireSuccess_impl_fntype)(nvtxSyncUser_t handle); typedef void (NVTX_API * nvtxDomainSyncUserReleasing_impl_fntype)(nvtxSyncUser_t handle); /* ---------------- Types for callback subscription --------------------- */ typedef const void *(NVTX_API * NvtxGetExportTableFunc_t)(uint32_t exportTableId); typedef int (NVTX_API * NvtxInitializeInjectionNvtxFunc_t)(NvtxGetExportTableFunc_t exportTable); typedef enum NvtxCallbackModule { NVTX_CB_MODULE_INVALID = 0, NVTX_CB_MODULE_CORE = 1, NVTX_CB_MODULE_CUDA = 2, NVTX_CB_MODULE_OPENCL = 3, NVTX_CB_MODULE_CUDART = 4, NVTX_CB_MODULE_CORE2 = 5, NVTX_CB_MODULE_SYNC = 6, /* --- New constants must only be added directly above this line --- */ NVTX_CB_MODULE_SIZE, NVTX_CB_MODULE_FORCE_INT = 0x7fffffff } NvtxCallbackModule; typedef enum NvtxCallbackIdCore { NVTX_CBID_CORE_INVALID = 0, NVTX_CBID_CORE_MarkEx = 1, NVTX_CBID_CORE_MarkA = 2, NVTX_CBID_CORE_MarkW = 3, NVTX_CBID_CORE_RangeStartEx = 4, NVTX_CBID_CORE_RangeStartA = 5, NVTX_CBID_CORE_RangeStartW = 6, NVTX_CBID_CORE_RangeEnd = 7, NVTX_CBID_CORE_RangePushEx = 8, NVTX_CBID_CORE_RangePushA = 9, NVTX_CBID_CORE_RangePushW = 10, NVTX_CBID_CORE_RangePop = 11, NVTX_CBID_CORE_NameCategoryA = 12, NVTX_CBID_CORE_NameCategoryW = 13, NVTX_CBID_CORE_NameOsThreadA = 14, NVTX_CBID_CORE_NameOsThreadW = 15, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CORE_SIZE, NVTX_CBID_CORE_FORCE_INT = 0x7fffffff } NvtxCallbackIdCore; typedef enum NvtxCallbackIdCore2 { NVTX_CBID_CORE2_INVALID = 0, NVTX_CBID_CORE2_DomainMarkEx = 1, NVTX_CBID_CORE2_DomainRangeStartEx = 2, NVTX_CBID_CORE2_DomainRangeEnd = 3, NVTX_CBID_CORE2_DomainRangePushEx = 4, NVTX_CBID_CORE2_DomainRangePop = 5, NVTX_CBID_CORE2_DomainResourceCreate = 6, NVTX_CBID_CORE2_DomainResourceDestroy = 7, NVTX_CBID_CORE2_DomainNameCategoryA = 8, NVTX_CBID_CORE2_DomainNameCategoryW = 9, NVTX_CBID_CORE2_DomainRegisterStringA = 10, NVTX_CBID_CORE2_DomainRegisterStringW = 11, NVTX_CBID_CORE2_DomainCreateA = 12, NVTX_CBID_CORE2_DomainCreateW = 13, NVTX_CBID_CORE2_DomainDestroy = 14, NVTX_CBID_CORE2_Initialize = 15, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CORE2_SIZE, NVTX_CBID_CORE2_FORCE_INT = 0x7fffffff } NvtxCallbackIdCore2; typedef enum NvtxCallbackIdCuda { NVTX_CBID_CUDA_INVALID = 0, NVTX_CBID_CUDA_NameCuDeviceA = 1, NVTX_CBID_CUDA_NameCuDeviceW = 2, NVTX_CBID_CUDA_NameCuContextA = 3, NVTX_CBID_CUDA_NameCuContextW = 4, NVTX_CBID_CUDA_NameCuStreamA = 5, NVTX_CBID_CUDA_NameCuStreamW = 6, NVTX_CBID_CUDA_NameCuEventA = 7, NVTX_CBID_CUDA_NameCuEventW = 8, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CUDA_SIZE, NVTX_CBID_CUDA_FORCE_INT = 0x7fffffff } NvtxCallbackIdCuda; typedef enum NvtxCallbackIdCudaRt { NVTX_CBID_CUDART_INVALID = 0, NVTX_CBID_CUDART_NameCudaDeviceA = 1, NVTX_CBID_CUDART_NameCudaDeviceW = 2, NVTX_CBID_CUDART_NameCudaStreamA = 3, NVTX_CBID_CUDART_NameCudaStreamW = 4, NVTX_CBID_CUDART_NameCudaEventA = 5, NVTX_CBID_CUDART_NameCudaEventW = 6, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_CUDART_SIZE, NVTX_CBID_CUDART_FORCE_INT = 0x7fffffff } NvtxCallbackIdCudaRt; typedef enum NvtxCallbackIdOpenCL { NVTX_CBID_OPENCL_INVALID = 0, NVTX_CBID_OPENCL_NameClDeviceA = 1, NVTX_CBID_OPENCL_NameClDeviceW = 2, 
NVTX_CBID_OPENCL_NameClContextA = 3, NVTX_CBID_OPENCL_NameClContextW = 4, NVTX_CBID_OPENCL_NameClCommandQueueA = 5, NVTX_CBID_OPENCL_NameClCommandQueueW = 6, NVTX_CBID_OPENCL_NameClMemObjectA = 7, NVTX_CBID_OPENCL_NameClMemObjectW = 8, NVTX_CBID_OPENCL_NameClSamplerA = 9, NVTX_CBID_OPENCL_NameClSamplerW = 10, NVTX_CBID_OPENCL_NameClProgramA = 11, NVTX_CBID_OPENCL_NameClProgramW = 12, NVTX_CBID_OPENCL_NameClEventA = 13, NVTX_CBID_OPENCL_NameClEventW = 14, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_OPENCL_SIZE, NVTX_CBID_OPENCL_FORCE_INT = 0x7fffffff } NvtxCallbackIdOpenCL; typedef enum NvtxCallbackIdSync { NVTX_CBID_SYNC_INVALID = 0, NVTX_CBID_SYNC_DomainSyncUserCreate = 1, NVTX_CBID_SYNC_DomainSyncUserDestroy = 2, NVTX_CBID_SYNC_DomainSyncUserAcquireStart = 3, NVTX_CBID_SYNC_DomainSyncUserAcquireFailed = 4, NVTX_CBID_SYNC_DomainSyncUserAcquireSuccess = 5, NVTX_CBID_SYNC_DomainSyncUserReleasing = 6, /* --- New constants must only be added directly above this line --- */ NVTX_CBID_SYNC_SIZE, NVTX_CBID_SYNC_FORCE_INT = 0x7fffffff } NvtxCallbackIdSync; /* IDs for NVTX Export Tables */ typedef enum NvtxExportTableID { NVTX_ETID_INVALID = 0, NVTX_ETID_CALLBACKS = 1, NVTX_ETID_RESERVED0 = 2, NVTX_ETID_VERSIONINFO = 3, /* --- New constants must only be added directly above this line --- */ NVTX_ETID_SIZE, NVTX_ETID_FORCE_INT = 0x7fffffff } NvtxExportTableID; typedef void (* NvtxFunctionPointer)(void); /* generic uncallable function pointer, must be casted to appropriate function type */ typedef NvtxFunctionPointer** NvtxFunctionTable; /* double pointer because array(1) of pointers(2) to function pointers */ typedef struct NvtxExportTableCallbacks { size_t struct_size; /* returns an array of pointer to function pointers*/ int (NVTX_API *GetModuleFunctionTable)( NvtxCallbackModule module, NvtxFunctionTable* out_table, unsigned int* out_size); } NvtxExportTableCallbacks; typedef struct NvtxExportTableVersionInfo { /* sizeof(NvtxExportTableVersionInfo) */ size_t struct_size; /* The API version comes from the NVTX library linked to the app. The * injection library is can use this info to make some assumptions */ uint32_t version; /* Reserved for alignment, do not use */ uint32_t reserved0; /* This must be set by tools when attaching to provide applications * the ability to, in emergency situations, detect problematic tools * versions and modify the NVTX source to prevent attaching anything * that causes trouble in the app. Currently, this value is ignored. */ void (NVTX_API *SetInjectionNvtxVersion)( uint32_t version); } NvtxExportTableVersionInfo; nccl-2.18.3-1/src/include/nvtx3/nvtxExtDetail/000077500000000000000000000000001444201535000207575ustar00rootroot00000000000000nccl-2.18.3-1/src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h000066400000000000000000000042101444201535000234270ustar00rootroot00000000000000/* * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
#endif #ifndef NVTX_EXT_IMPL_H #define NVTX_EXT_IMPL_H /* ---- Include required platform headers ---- */ #if defined(_WIN32) #include #else #include #if defined(__ANDROID__) #include #endif #if defined(__linux__) || defined(__CYGWIN__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* ---- Define macros used in this file ---- */ #ifdef NVTX_DEBUG_PRINT #ifdef __ANDROID__ #include #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); #else #include #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) #endif #else /* !defined(NVTX_DEBUG_PRINT) */ #define NVTX_ERR(...) #define NVTX_INFO(...) #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ // #ifdef __GNUC__ // #pragma GCC visibility push(hidden) // #endif #define NVTX_EXTENSION_FRESH 0 #define NVTX_EXTENSION_DISABLED 1 #define NVTX_EXTENSION_STARTING 2 #define NVTX_EXTENSION_LOADED 3 NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0; #define NVTX_EXT_INIT_GUARD #include "nvtxExtInit.h" #undef NVTX_EXT_INIT_GUARD // #ifdef __GNUC__ // #pragma GCC visibility pop // #endif #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #endif /* NVTX_EXT_IMPL_H */nccl-2.18.3-1/src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h000066400000000000000000000062521444201535000253570ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). #endif #define NVTX_EXT_IMPL_GUARD #include "nvtxExtImpl.h" #undef NVTX_EXT_IMPL_GUARD #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \ NAME##_v##VERSION##_mem##COMPATID #define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \ NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) #define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \ NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD) /* * Function slots for the binary payload extension. First entry is the module * state, initialized to `0` (`NVTX_EXTENSION_FRESH`). 
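 *
 * Slot 0 therefore carries the extension's load state (see the
 * NVTX_EXTENSION_* constants in nvtxExtImpl.h above), while slots 1..N hold
 * one pointer per extension callback id. The NVTX_EXT_FN_IMPL wrappers below
 * read their slot, trigger the versioned nvtxExtPayloadInitOnce helper on
 * first use, and fall back to a cast -1 return value while the extension
 * remains unloaded or disabled.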
*/ NVTX_LINKONCE_DEFINE_GLOBAL intptr_t NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1] = {0}; NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)() { intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; nvtxExtModuleSegment_t segment = { 0, // unused (only one segment) NVTX3EXT_CBID_PAYLOAD_FN_NUM, fnSlots }; nvtxExtModuleInfo_t module = { NVTX_VERSION, sizeof(nvtxExtModuleInfo_t), NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD, 1, &segment, // number of segments, segments NULL, // no export function needed // bake type sizes and alignment information into program binary &nvtxExtPayloadTypeInfo }; NVTX_INFO( "%s\n", __FUNCTION__ ); NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module, NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); } #define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \ typedef ret_val ( * fn_name##_impl_fntype )signature; \ NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \ intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED) { \ if (slot) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } else { \ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \ slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \ if (slot != NVTX_EXTENSION_DISABLED && slot) { \ return (*(fn_name##_impl_fntype)slot) arg_names; \ } \ } \ } \ return ((ret_val)(intptr_t)-1); \ } NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr)) NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr)) #undef NVTX_EXT_FN_IMPL #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */nccl-2.18.3-1/src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h000066400000000000000000000350231444201535000234370ustar00rootroot00000000000000/* * Copyright 2009-2020 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_INIT_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). 
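/* Overview of what this header implements: it locates an injection library
 * (pre-injected in the process, named by NVTX_INJECTION32_PATH or
 * NVTX_INJECTION64_PATH, packaged with the app on Android, or statically
 * linked via InitializeInjectionNvtxExtension_fnptr), resolves its
 * InitializeInjectionNvtxExtension entry point, and returns that pointer to
 * the per-extension InitOnce routines, which hand it their module and
 * segment descriptions. If no injection library is found, the extension's
 * function slots are left as no-ops. */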
#endif

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/* ---- Platform-independent helper definitions and functions ---- */

/* Prefer macros over inline functions to reduce symbol resolution at link time */

#if defined(_WIN32)
#define NVTX_PATHCHAR wchar_t
#define NVTX_STR(x) L##x
#define NVTX_GETENV _wgetenv
#define NVTX_BUFSIZE MAX_PATH
#define NVTX_DLLHANDLE HMODULE
#define NVTX_DLLOPEN(x) LoadLibraryW(x)
#define NVTX_DLLFUNC GetProcAddress
#define NVTX_DLLCLOSE FreeLibrary
#define NVTX_YIELD() SwitchToThread()
#define NVTX_MEMBAR() MemoryBarrier()
#define NVTX_ATOMIC_WRITE_32(address, value) InterlockedExchange((volatile LONG*)address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
#define NVTX_ATOMIC_WRITE_PTR(address, value) InterlockedExchangePointer((volatile PVOID*)address, (PVOID)value)
#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) old = (intptr_t)InterlockedCompareExchangePointer((volatile PVOID*)address, (PVOID)exchange, (PVOID)comparand)
#elif defined(__GNUC__)
#define NVTX_PATHCHAR char
#define NVTX_STR(x) x
#define NVTX_GETENV getenv
#define NVTX_BUFSIZE PATH_MAX
#define NVTX_DLLHANDLE void*
#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
#define NVTX_DLLFUNC dlsym
#define NVTX_DLLCLOSE dlclose
#define NVTX_YIELD() sched_yield()
#define NVTX_MEMBAR() __sync_synchronize()
/* Ensure full memory barrier for atomics, to match Windows functions */
#define NVTX_ATOMIC_WRITE_32(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
#define NVTX_ATOMIC_WRITE_PTR(address, value) __sync_synchronize(); __sync_lock_test_and_set(address, value)
#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
#else
#error The library does not support your configuration!
#endif

/* Define this to 1 for platforms where pre-injected libraries can be discovered. */
#if defined(_WIN32) /* TODO */
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#else
#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
#endif

/* Define this to 1 for platforms that support environment variables */
/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
/* Try: #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
#define NVTX_SUPPORT_ENV_VARS 1

/* Define this to 1 for platforms that support dynamic/shared libraries */
#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1

/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked,
 * and this will override any dynamic injection. Useful for platforms where dynamic
 * injection is not available. Since weak symbols not explicitly marked extern are
 * guaranteed to be initialized to zero if no definitions are found by the linker, the
 * dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0.
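 *
 * A statically linked tool could therefore register itself with a non-weak
 * definition along these lines in one of its own translation units (sketch;
 * the handler name is illustrative and its prototype must match
 * NvtxExtInitializeInjectionFunc_t):
 *
 *   NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr =
 *       MyToolInitializeInjectionNvtxExtension;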
*/ #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1 /* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal * symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which * does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic * injection library. */ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr; #else #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0 #endif /* This function tries to find or load an NVTX injection library and get the * address of its InitializeInjectionExtension function. If such a function pointer * is found, it is called, and passed the address of this NVTX instance's * nvtxGetExportTable function, so the injection can attach to this instance. * If the initialization fails for any reason, any dynamic library loaded will * be freed, and all NVTX implementation functions will be set to no-ops. If * initialization succeeds, NVTX functions not attached to the tool will be set * to no-ops. This is implemented as one function instead of several small * functions to minimize the number of weak symbols the linker must resolve. * Order of search is: * - Pre-injected library exporting InitializeInjectionNvtxExtension * - Loadable library exporting InitializeInjectionNvtxExtension * - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64) * - On Android, libNvtxInjection??.so within the package (?? is 32 or 64) * - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr */ NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr); NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr) { const char* const initFuncName = "InitializeInjectionNvtxExtension"; NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0; if(out_init_fnptr){ *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0; } #if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY /* Use POSIX global symbol chain to query for init function from any module */ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName); #endif #if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY /* Try discovering dynamic injection library to load */ if (!init_fnptr) { #if NVTX_SUPPORT_ENV_VARS /* If env var NVTX_INJECTION64_PATH is set, it should contain the path * to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */ const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4) ? NVTX_STR("NVTX_INJECTION32_PATH") : NVTX_STR("NVTX_INJECTION64_PATH"); #endif /* NVTX_SUPPORT_ENV_VARS */ NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE]; const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0; /* Refer to this variable explicitly in case all references to it are #if'ed out */ (void)injectionLibraryPathBuf; #if NVTX_SUPPORT_ENV_VARS /* Disable the warning for getenv & _wgetenv -- this usage is safe because * these functions are not called again before using the returned value. 
*/ #if defined(_MSC_VER) #pragma warning( push ) #pragma warning( disable : 4996 ) #endif injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName); #if defined(_MSC_VER) #pragma warning( pop ) #endif #endif #if defined(__ANDROID__) if (!injectionLibraryPath) { const char *bits = (sizeof(void*) == 4) ? "32" : "64"; char cmdlineBuf[32]; char pkgName[PATH_MAX]; int count; int pid; FILE *fp; size_t bytesRead; size_t pos; pid = (int)getpid(); count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid); if (count <= 0 || count >= (int)sizeof(cmdlineBuf)) { NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid); return NVTX_ERR_INIT_ACCESS_LIBRARY; } fp = fopen(cmdlineBuf, "r"); if (!fp) { NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp); fclose(fp); if (bytesRead == 0) { NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } pkgName[bytesRead] = 0; /* String can contain colon as a process separator. In this case the package name is before the colon. */ pos = 0; while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0') { ++pos; } pkgName[pos] = 0; count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits); if (count <= 0 || count >= NVTX_BUFSIZE) { NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits); return NVTX_ERR_INIT_ACCESS_LIBRARY; } /* On Android, verify path is accessible due to aggressive file access restrictions. */ /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */ /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */ if (injectionLibraryPathBuf[0] == '/') { #if (__ANDROID_API__ < 21) int access_err = access(injectionLibraryPathBuf, F_OK | R_OK); #else int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0); #endif if (access_err != 0) { NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf); return NVTX_ERR_INIT_ACCESS_LIBRARY; } } injectionLibraryPath = injectionLibraryPathBuf; } #endif /* At this point, injectionLibraryPath is specified if a dynamic * injection library was specified by a tool. */ if (injectionLibraryPath) { /* Load the injection library */ injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath); if (!injectionLibraryHandle) { NVTX_ERR("Failed to load injection library\n"); return NVTX_ERR_INIT_LOAD_LIBRARY; } else { /* Attempt to get the injection library's entry-point */ init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName); if (!init_fnptr) { NVTX_DLLCLOSE(injectionLibraryHandle); NVTX_ERR("Failed to get address of function %s from injection library\n", initFuncName); return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT; } } } } #endif #if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY if (!init_fnptr) { /* Check weakly-defined function pointer. A statically-linked injection can define this as * a normal symbol and it will take precedence over a dynamic injection. */ if (InitializeInjectionNvtxExtension_fnptr) { init_fnptr = InitializeInjectionNvtxExtension_fnptr; } } #endif if(out_init_fnptr){ *out_init_fnptr = init_fnptr; } /* At this point, if init_fnptr is not set, then no tool has specified * an NVTX injection library -- return non-success result so all NVTX * API functions will be set to no-ops. 
*/ if (!init_fnptr) { return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE; } return NVTX_SUCCESS; } NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) ( nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState ) { intptr_t old; NVTX_INFO( "%s\n", __FUNCTION__ ); if( *moduleState == NVTX_EXTENSION_LOADED) { return; } NVTX_ATOMIC_CAS_PTR( old, moduleState, NVTX_EXTENSION_STARTING, NVTX_EXTENSION_FRESH); if (old == NVTX_EXTENSION_FRESH) { NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr); int entryPointStatus = 0; int forceAllToNoops = 0; /* Load & initialize injection library -- it will assign the function pointers */ if(init_fnptr == 0){ int result = 0; /* try to load vanilla NVTX first*/ nvtxInitialize(0); result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr); /*at this point init_fnptr will be either 0 or a real function*/ if(result == NVTX_SUCCESS) { NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr; } else { NVTX_ERR("Failed to load injection library\n"); } } if(init_fnptr != 0) { /* Invoke injection library's initialization function. If it returns * 0 (failure) and a dynamic injection was loaded, unload it. */ entryPointStatus = init_fnptr(moduleInfo); if (entryPointStatus == 0) { NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n"); } } /* Clean up any functions that are still uninitialized so that they are skipped. * Set all to null if injection init function failed as well. */ forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0); for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){ nvtxExtModuleSegment_t* segment = moduleInfo->segments+s; for(size_t i = 0; i < segment->slotCount; ++i){ if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){ segment->functionSlots[i] = NVTX_EXTENSION_DISABLED; } } } NVTX_MEMBAR(); /* Signal that initialization has finished, so now the assigned function pointers will be used */ NVTX_ATOMIC_WRITE_PTR( moduleState, NVTX_EXTENSION_LOADED); } else /* Spin-wait until initialization has finished */ { NVTX_MEMBAR(); while (*moduleState != NVTX_EXTENSION_LOADED) { NVTX_YIELD(); NVTX_MEMBAR(); } } } #ifdef __cplusplus } #endif /* __cplusplus */ nccl-2.18.3-1/src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h000066400000000000000000000134071444201535000256050ustar00rootroot00000000000000/* * Copyright 2021-2023 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined). #endif typedef void* pointer_type; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) #include #include #endif /* `alignof` is available as of C11 or C++11 */ #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L) #define nvtx_alignof(type) alignof(type) #define nvtx_alignof2(type,tname) alignof(type) #else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */ /* Create helper structs to determine type alignment. 
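 * (The trick: in a struct {char c; T d;}, the padding the compiler inserts before `d`
 *  makes offsetof(that struct, d) act as the alignment of T, typically 4 for a 4-byte
 *  int, which is what the MKTYPEDEF helpers below rely on when alignof is unavailable.)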
*/ #define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type #define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname MKTYPEDEF(char); MKTYPEDEF2(unsigned char, uchar); MKTYPEDEF(short); MKTYPEDEF2(unsigned short, ushort); MKTYPEDEF(int); MKTYPEDEF2(unsigned int, uint); MKTYPEDEF(long); MKTYPEDEF2(unsigned long, ulong); MKTYPEDEF2(long long, longlong); MKTYPEDEF2(unsigned long long, ulonglong); MKTYPEDEF(int8_t); MKTYPEDEF(uint8_t); MKTYPEDEF(int16_t); MKTYPEDEF(uint16_t); MKTYPEDEF(int32_t); MKTYPEDEF(uint32_t); MKTYPEDEF(int64_t); MKTYPEDEF(uint64_t); MKTYPEDEF(float); MKTYPEDEF(double); MKTYPEDEF2(long double, longdouble); MKTYPEDEF(size_t); MKTYPEDEF(pointer_type); MKTYPEDEF(wchar_t); /* `char8_t` is available as of C++20 or C23 */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) MKTYPEDEF(char8_t); #endif /* `char16_t` and `char32_t` are available as of C++11 or C11 */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) MKTYPEDEF(char16_t); MKTYPEDEF(char32_t); #endif /* C requires to include stddef.h to use `offsetof` */ #ifndef __cplusplus #include #endif #define nvtx_alignof(tname) offsetof(_nvtx_##tname, d) #define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d) #endif /* __STDC_VERSION__ >= 201112L */ #undef MKTYPEDEF #undef MKTYPEDEF2 /* * Helper array to get the alignment for each predefined C/C++ language type. * The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`. */ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] = { /* The first entry contains this array's length and the size of each entry in this array. 
*/ {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)}, /*** C integer types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR */ {sizeof(char), nvtx_alignof(char)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UCHAR */ {sizeof(unsigned char), nvtx_alignof2(unsigned char, uchar)}, /* NVTX_PAYLOAD_ENTRY_TYPE_SHORT */ {sizeof(short), nvtx_alignof(short)}, /* NVTX_PAYLOAD_ENTRY_TYPE_USHORT */ {sizeof(unsigned short), nvtx_alignof2(unsigned short, ushort)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT */ {sizeof(int), nvtx_alignof(int)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT */ {sizeof(unsigned int), nvtx_alignof2(unsigned int, uint)}, /* NVTX_PAYLOAD_ENTRY_TYPE_LONG */ {sizeof(long), nvtx_alignof(long)}, /* NVTX_PAYLOAD_ENTRY_TYPE_ULONG */ {sizeof(unsigned long), nvtx_alignof2(unsigned long, ulong)}, /* NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG */ {sizeof(long long), nvtx_alignof2(long long, longlong)}, /* NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG */ {sizeof(unsigned long long), nvtx_alignof2(unsigned long long,ulonglong)}, /*** Integer types with explicit size ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_INT8 */ {sizeof(int8_t), nvtx_alignof(int8_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT8 */ {sizeof(uint8_t), nvtx_alignof(uint8_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT16 */ {sizeof(int16_t), nvtx_alignof(int16_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT16 */ {sizeof(uint16_t), nvtx_alignof(uint16_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT32 */ {sizeof(int32_t), nvtx_alignof(int32_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT32 */ {sizeof(uint32_t), nvtx_alignof(uint32_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_INT64 */ {sizeof(int64_t), nvtx_alignof(int64_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_UINT64 */ {sizeof(uint64_t), nvtx_alignof(uint64_t)}, /*** C floating point types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_FLOAT */ {sizeof(float), nvtx_alignof(float)}, /* NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE */ {sizeof(double), nvtx_alignof(double)}, /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)}, /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */ {sizeof(size_t), nvtx_alignof(size_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)}, /*** Special character types ***/ /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)}, #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L) /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)}, #else /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0}, #endif #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L) /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)}, /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)} #else /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {0, 0}, /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {0, 0} #endif }; #undef nvtx_alignof #undef nvtx_alignof2 nccl-2.18.3-1/src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h000066400000000000000000000024711444201535000236410ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation. All rights reserved. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ /* This header defines types which are used by the internal implementation * of NVTX and callback subscribers. API clients do not use these types, * so they are defined here instead of in nvToolsExt.h to clarify they are * not part of the NVTX client API. 
*/ #ifndef NVTXEXTTYPES_H #define NVTXEXTTYPES_H #ifndef NVTX_EXT_TYPES_GUARD #error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h. #endif typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId); typedef struct nvtxExtModuleSegment_t { size_t segmentId; size_t slotCount; intptr_t* functionSlots; } nvtxExtModuleSegment_t; typedef struct nvtxExtModuleInfo_t { uint16_t nvtxVer; uint16_t structSize; uint16_t moduleId; uint16_t compatId; size_t segmentsCount; nvtxExtModuleSegment_t* segments; NvtxExtGetExportFunction_t getExportFunction; const void* extInfo; } nvtxExtModuleInfo_t; typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); #endif /* NVTXEXTTYPES_H */nccl-2.18.3-1/src/include/p2p.h000066400000000000000000000016201444201535000157430ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include #ifndef NCCL_P2P_H_ #define NCCL_P2P_H_ #define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR typedef struct { int data; // Currently only support an fd based descriptor } ncclCuDesc; typedef union { // Legacy CUDA IPC cudaIpcMemHandle_t devIpc; // cuMem API support ncclCuDesc cuDesc; } ncclIpcDesc; ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); #endif nccl-2.18.3-1/src/include/param.h000066400000000000000000000017251444201535000163500ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PARAM_H_ #define NCCL_PARAM_H_ #include const char* userHomeDir(); void setEnvFile(const char* fileName); void initEnv(); void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); #define NCCL_PARAM(name, env, deftVal) \ int64_t ncclParam##name() { \ constexpr int64_t uninitialized = INT64_MIN; \ static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ static int64_t cache = uninitialized; \ if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ } \ return cache; \ } #endif nccl-2.18.3-1/src/include/profiler.h000066400000000000000000000016131444201535000170660ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PROFILER_H_ #define NCCL_PROFILER_H_ #include "proxy.h" enum ncclProxyProfileState { ncclProxyProfileBegin = 0, ncclProxyProfileSendGPUWait = 1, ncclProxyProfileSendWait = 2, ncclProxyProfileRecvWait = 1, ncclProxyProfileRecvFlushWait = 2, ncclProxyProfileRecvGPUWait = 3, ncclProxyProfileEnd = 4, ncclProxyProfileSleep = 8, ncclProxyProfileWakeup = 9, ncclProxyProfileIdle = 16, ncclProxyProfileActive = 17, ncclProxyProfileAppend = 24, ncclProxyProfileAppendEnd = 25 }; ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); void ncclProfilingDump(); #endif nccl-2.18.3-1/src/include/proxy.h000066400000000000000000000174651444201535000164410ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_PROXY_H_ #define NCCL_PROXY_H_ #include "devcomm.h" #include "info.h" #include "socket.h" #include "ipcsocket.h" #include #include "shm.h" #include "p2p.h" enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); #define NCCL_PROXY_MAX_SUBS MAXCHANNELS static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); struct ncclProxyOp { struct ncclProxyConnection* connection; int channelId; int nsteps; ssize_t nbytes; int root; int next; uint64_t opCount; int sliceSteps; int chunkSteps; int chunkSize; uint8_t /*ncclDataType_t*/ dtype; uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; union { uint64_t unused; // For use by enqueue.cc struct ncclProxyOp *enqNext; }; }; static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); struct ncclProxySubArgs { struct ncclProxyConnection* connection; int channelId; int nsteps; ssize_t nbytes; int peer; int groupSize; // Number of consecutive sub operations sharing the same recvComm uint64_t base; uint64_t posted; uint64_t received; uint64_t flushed; uint64_t transmitted; uint64_t done; uint64_t end; void* requests[NCCL_STEPS]; void* profilingEvents[NCCL_STEPS]; }; struct ncclProxyArgs { struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; proxyProgressFunc_t progress; int nsubs; int done; uint64_t opCount; int sliceSteps; int chunkSteps; int chunkSize; uint8_t /*ncclDataType_t*/ dtype; uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; int idle; // Element linking struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; struct ncclProxyArgs** proxyAppendPtr; }; #define NCCL_MAX_NETDEVS 128 // ProxyOps are used to communicate between main thread and service thread // Make sure we have enough to store two full rounds of operations on all channels. // Otherwise we'd be unable to post half of them to free new elements. 
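// (Put differently, the definition below reserves 2 * MAXCHANNELS * NCCL_MAX_WORK_ELEMENTS_P2P
// ncclProxyOp slots per peer, so one full round can still be draining on every channel
// while the next round is being posted.)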
#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) #define NCCL_MAX_LOCAL_RANKS 64 struct ncclProxyOpsPool { struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; volatile int nextOps; volatile int nextOpsEnd; volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; pthread_mutex_t mutex; pthread_cond_t cond; }; struct ncclProxyOps { ncclProxyOpsPool* pool; ncclShmHandle_t handle; int count; int freeOp; int nextOps; int nextOpsEnd; }; struct ncclProxySharedP2p { int refcount; int size; char* cudaBuff; char* hostBuff; // CUDA IPC ncclIpcDesc ipcDesc; struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv }; struct ncclProxyPeer { struct ncclProxySharedP2p send; struct ncclProxySharedP2p recv; }; struct ncclSharedNetComms { void* sendComm[MAXCHANNELS]; void* recvComm[MAXCHANNELS]; int sendRefCount[MAXCHANNELS]; int recvRefCount[MAXCHANNELS]; }; struct ncclProxyPool; struct ncclProxyProgressState { // Used by main threads to send work to progress thread struct ncclProxyOpsPool* opsPool; ncclShmHandle_t handle; char opsPoolShmSuffix[6]; pthread_t thread; bool stop; struct ncclProxyPeer** localPeers; struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; struct ncclProxyArgs* active; struct ncclProxyArgs* pool; struct ncclProxyPool* pools; int nextOps; }; // Expected proxy response fifo struct ncclExpectedProxyResponse { void* opId; int respSize; bool done; void* respBuff; struct ncclExpectedProxyResponse* next; }; struct ncclProxyAsyncOp { int type; struct ncclProxyConnection* connection; int reqSize, respSize; char *reqBuff, *respBuff; void* opId; ncclProxyAsyncOp* next; }; struct ncclProxyLocalPeer { struct ncclSocket sock; int tpRank; int tpLocalRank; ncclProxyAsyncOp* asyncOps; int asyncOpCounter; }; struct ncclProxyState { int refCount; int tpRank; int tpnRanks; int tpLocalnRanks; int cudaDev; int p2pnChannels; int p2pChunkSize; int nChannels; int buffSizes[NCCL_NUM_PROTOCOLS]; bool allocP2pNetLLBuffers; bool dmaBufSupport; ncclNet_t* ncclNet; ncclCollNet_t* ncclCollNet; volatile uint32_t* abortFlag; // Service thread pthread_t thread; struct ncclSocket* listenSock; int stop; CUcontext cudaCtx; // Used by main thread union ncclSocketAddress* peerAddresses; struct ncclSocket* peerSocks; struct ncclProxyOps* proxyOps; void** sharedDevMems; struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS) // Progress thread struct ncclProxyProgressState progressState; // Queue of expected responses from the proxy struct ncclExpectedProxyResponse* expectedResponses; }; enum proxyConnectState { connUninitialized = 0, connInitialized = 1, connSharedInitialized = 2, connSetupDone = 3, connConnected = 4, numConnStates = 5 }; struct ncclProxyConnection { int send, transport, shared; int tpLocalRank, sameProcess; struct ncclSocket* sock; struct ncclTransportComm* tcomm; struct ncclProxyArgs *proxyAppend; struct ncclProxyArgs **proxyAppendPtr; void* transportResources; proxyConnectState state; struct ncclCollNetSharedRes* collNet; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); enum proxyMode { proxyRing = 0, proxyFrom = 1, proxyTo = 2 }; ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); ncclResult_t ncclProxyCreate(struct ncclComm* comm); ncclResult_t 
ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); enum ncclProxyMsgType { ncclProxyMsgInit = 1, ncclProxyMsgSharedInit = 2, ncclProxyMsgSetup = 3, ncclProxyMsgConnect = 4, ncclProxyMsgStart = 5, ncclProxyMsgClose = 6, ncclProxyMsgAbort = 7, ncclProxyMsgStop = 8, ncclProxyMsgConvertFd = 9, // cuMem API support (UDS) }; // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types // Call this function on the client, supplying a locally unique opId. Then, poll on the return value of // ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); // This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd); ncclResult_t ncclProxyStop(struct ncclComm* comm); ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); ncclResult_t ncclProxyDestroy(struct ncclComm* comm); #endif nccl-2.18.3-1/src/include/shm.h000066400000000000000000000011251444201535000160310ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_SHM_H_ #define NCCL_SHM_H_ #include "nccl.h" typedef void* ncclShmHandle_t; ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); ncclResult_t ncclShmClose(ncclShmHandle_t handle); ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); #endif nccl-2.18.3-1/src/include/socket.h000066400000000000000000000076151444201535000165440ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_SOCKET_H_ #define NCCL_SOCKET_H_ #include "nccl.h" #include #include #include #include #include #include #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 #define SLEEP_INT 1000 // connection retry sleep interval in usec #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL /* Common socket address storage structure for IPv4/IPv6 */ union ncclSocketAddress { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; enum ncclSocketState { ncclSocketStateNone = 0, ncclSocketStateInitialized = 1, ncclSocketStateAccepting = 2, ncclSocketStateAccepted = 3, ncclSocketStateConnecting = 4, ncclSocketStateConnectPolling = 5, ncclSocketStateConnected = 6, ncclSocketStateReady = 7, ncclSocketStateClosed = 8, ncclSocketStateError = 9, ncclSocketStateNum = 10 }; enum ncclSocketType { ncclSocketTypeUnknown = 0, ncclSocketTypeBootstrap = 1, ncclSocketTypeProxy = 2, ncclSocketTypeNetSocket = 3, ncclSocketTypeNetIb = 4 }; struct ncclSocket { int fd; int acceptFd; int timedOutRetries; int refusedRetries; union ncclSocketAddress addr; volatile uint32_t* abortFlag; int asyncFlag; enum ncclSocketState state; int salen; uint64_t magic; enum ncclSocketType type; }; const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); // Initialize a socket ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call ncclResult_t ncclSocketListen(struct ncclSocket* sock); ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); // Connect to sock->addr. sock->fd is set after a successful call. ncclResult_t ncclSocketConnect(struct ncclSocket* sock); // Return socket connection state. ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); // Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 
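// A minimal sketch of the listen/accept flow using only functions declared in this
// header (error handling omitted; `addr` stands for an already-filled
// union ncclSocketAddress and is not a real variable here):
//   struct ncclSocket listenSock, sock;
//   ncclSocketInit(&listenSock, &addr, NCCL_SOCKET_MAGIC, ncclSocketTypeBootstrap);
//   ncclSocketListen(&listenSock);         // listenSock.fd becomes a listening socket
//   ncclSocketAccept(&sock, &listenSock);  // sock.fd / sock.addr describe the peer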
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); ncclResult_t ncclSocketClose(struct ncclSocket* sock); #endif nccl-2.18.3-1/src/include/strongstream.h000066400000000000000000000117301444201535000177750ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_STRONGSTREAM_H_ #define NCCL_STRONGSTREAM_H_ #include "nccl.h" #include "checks.h" #include /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes * easily. */ struct ncclCudaGraph { #if CUDART_VERSION >= 11030 cudaGraph_t graph; unsigned long long graphId; #endif }; inline struct ncclCudaGraph ncclCudaGraphNone() { struct ncclCudaGraph tmp; #if CUDART_VERSION >= 11030 tmp.graph = nullptr; tmp.graphId = ULLONG_MAX; #endif return tmp; } inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { #if CUDART_VERSION >= 11030 return graph.graph != nullptr; #else return false; #endif } inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { #if CUDART_VERSION >= 11030 return a.graphId == b.graphId; #else return true; #endif } ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); /* ncclStrongStream: An abstraction over CUDA streams that do not lose their * identity while being captured. Regular streams have the deficiency that the * captured form of a stream in one graph launch has no relation to the * uncaptured stream or to the captured form in other graph launches. This makes * streams unfit for the use of serializing access to a persistent resource. * Strong streams have been introduced to address this need. * * - All updates to a strong stream must be enclosed by a Acquire/Release pair. * * - The Acquire, Release, and all updates take a ncclCudaGraph parameter * indicating the currently capturing graph (or none). This parameter must be * the same for the entire sequence of {Acquire; ...; Release}. * * - An {Acquire; ...; Release} sequence must not be concurrent with any * other operations against the strong stream including graph launches which * reference this stream. */ struct ncclStrongStream; ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); // Acquire-fence the strong stream. ncclResult_t ncclStrongStreamAcquire( struct ncclCudaGraph graph, struct ncclStrongStream* ss ); // Acquire-fence the strong stream assuming no graph is capturing. This permits // the caller to enqueue directly to the `ss->cudaStream` member using native CUDA // calls. 
Strong stream still must be released via: // ncclStrongStreamRelease(ncclCudaGraphNone(), ss); ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); // Release-fence of the strong stream. ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); // Add a host launch to the stream. ncclResult_t ncclStrongStreamLaunchHost( struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg ); // Add a kernel launch to the stream. ncclResult_t ncclStrongStreamLaunchKernel( struct ncclCudaGraph graph, struct ncclStrongStream* ss, void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes ); // Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. // `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus // we want to fast-forward `a` to be a clone of `b`. Knowing this permits the // implementation to induce few graph dependencies. ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false ); // `b` must be capturing within `graph`. ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false ); // `a` must be capturing within `graph`. ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false ); // Synchrnoization does not need the strong stream to be acquired. ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); //////////////////////////////////////////////////////////////////////////////// struct ncclStrongStreamGraph; // internal to ncclStrongStream struct ncclStrongStream { // Used when not graph capturing. cudaStream_t cudaStream; #if CUDART_VERSION >= 11030 // The event used to establish order between graphs and streams. During acquire // this event is waited on, during release it is recorded to. cudaEvent_t serialEvent; // This stream ever appeared in a graph capture. bool everCaptured; // Tracks whether serialEvent needs to be recorded to upon Release(). bool serialEventNeedsRecord; struct ncclStrongStreamGraph* graphHead; #else cudaEvent_t scratchEvent; #endif }; #endif nccl-2.18.3-1/src/include/timer.h000066400000000000000000000031131444201535000163610ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TIMER_H_ #define NCCL_TIMER_H_ #if ENABLE_TIMER #include #include #include static double freq = -1; static void calibrate() { struct timeval tv; gettimeofday(&tv, NULL); uint64_t timeCycles = __rdtsc(); double time = - tv.tv_sec*1E6 - tv.tv_usec; uint64_t total = 0ULL; for (int i=0; i<10000; i++) total += __rdtsc(); gettimeofday(&tv, NULL); timeCycles = __rdtsc() - timeCycles; time += tv.tv_sec*1E6 + tv.tv_usec; freq = timeCycles/time; } static inline double gettime() { if (freq == -1) calibrate(); return __rdtsc()/freq; } static uint64_t counts[8]; static double times[8]; static double startTimes[8]; #define TIME_START(index) do { \ counts[index]++; \ startTimes[index] = gettime(); \ } while (0); #define TIME_STOP(index) do { \ times[index] += gettime() - startTimes[index]; \ } while (0); #define TIME_CANCEL(index) do { \ counts[index]--; \ } while (0); #define TIME_PRINT(name) do { \ printf("%s stats", name); \ for (int i=0; i<8; i++) { \ if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ counts[i] = 0; \ } \ printf("\n"); \ } while (0); #else #define TIME_START(index) while(0); #define TIME_STOP(index) while(0); #define TIME_CANCEL(index) while(0); #define TIME_PRINT(name) #endif #endif nccl-2.18.3-1/src/include/transport.h000066400000000000000000000077201444201535000173050ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TRANSPORT_H_ #define NCCL_TRANSPORT_H_ #include "devcomm.h" #include "graph.h" #include "nvmlwrap.h" #include "core.h" #define NTRANSPORTS 4 #define TRANSPORT_P2P 0 #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 #define TRANSPORT_COLLNET 3 #include "proxy.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; extern struct ncclTransport netTransport; extern struct ncclTransport collNetTransport; extern struct ncclTransport* ncclTransports[]; // Forward declarations struct ncclRing; struct ncclConnector; struct ncclComm; struct ncclPeerInfo { int rank; int cudaDev; int nvmlDev; int gdrSupport; uint64_t hostHash; uint64_t pidHash; dev_t shmDev; int64_t busId; struct ncclComm* comm; int cudaCompCap; }; #define CONNECT_SIZE 128 struct ncclConnect { char data[CONNECT_SIZE]; }; #if CUDART_VERSION >= 12010 #define NVLS_HANDLE_SIZE 64 struct ncclNvlsSharedRes { int refCount; CUmulticastObjectProp properties; CUmemAccessDesc accessDesc; int dev; size_t size; size_t granularity; CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer char* mcBuff; // Multicast NVLS buffer address CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer char* ucBuff; // Unicast NVLS buffer address char shareableHandle[NVLS_HANDLE_SIZE]; int nChannels; }; #endif /* CUDART_VERSION >= 12010 */ struct ncclCollNetSharedRes { int refCount; int size; char* cudaBuff; char* hostBuff; struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; void* resources; int nChannels; size_t buffSize; }; struct ncclTransportComm { ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); ncclResult_t 
(*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); ncclResult_t (*free)(struct ncclConnector*); ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels); ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState); ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); }; struct ncclTransport { const char name[8]; ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); struct ncclTransportComm send; struct ncclTransportComm recv; }; ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); ncclResult_t ncclNvlsFree(struct ncclComm* comm); enum { collNetRecv=0, collNetSend=1 }; int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); #endif nccl-2.18.3-1/src/include/trees.h000066400000000000000000000010761444201535000163710ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_TREES_H_ #define NCCL_TREES_H_ ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); #endif nccl-2.18.3-1/src/include/utils.h000066400000000000000000000403671444201535000164150ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_UTILS_H_ #define NCCL_UTILS_H_ #include "nccl.h" #include "alloc.h" #include "checks.h" #include #include #include #include int ncclCudaCompCap(); // PCI Bus ID <-> int64 conversion functions ncclResult_t int64ToBusId(int64_t id, char* busId); ncclResult_t busIdToInt64(const char* busId, int64_t* id); ncclResult_t getBusId(int cudaDev, int64_t *busId); ncclResult_t getHostName(char* hostname, int maxlen, const char delim); uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); ncclResult_t getRandomData(void* buffer, size_t bytes); struct netIf { char prefix[64]; int port; }; int parseStringList(const char* string, struct netIf* ifList, int maxList); bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); static long log2i(long n) { long l = 0; while (n>>=1) l++; return l; } inline uint64_t clockNano() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; } /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else * return -1 */ inline ncclResult_t getRandomData(void* buffer, size_t bytes) { ncclResult_t ret = ncclSuccess; if (bytes > 0) { const size_t one = 1UL; FILE* fp = fopen("/dev/urandom", "r"); if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError; if (fp) fclose(fp); } return ret; } //////////////////////////////////////////////////////////////////////////////// template inline void ncclAtomicRefCountIncrement(Int* refs) { __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED); } template inline Int ncclAtomicRefCountDecrement(Int* refs) { return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); } //////////////////////////////////////////////////////////////////////////////// /* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that * granularity of LIFO is not per object, instead frames containing many objects * are pushed and popped. Therefor deallocation is extremely cheap since its * done at the frame granularity. * * The initial state of the stack is with one frame, the "nil" frame, which * cannot be popped. Therefor objects allocated in the nil frame cannot be * deallocated sooner than stack destruction. */ struct ncclMemoryStack; void ncclMemoryStackConstruct(struct ncclMemoryStack* me); void ncclMemoryStackDestruct(struct ncclMemoryStack* me); void ncclMemoryStackPush(struct ncclMemoryStack* me); void ncclMemoryStackPop(struct ncclMemoryStack* me); template T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); //////////////////////////////////////////////////////////////////////////////// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for * a pool instance to ever hold objects whose type have differing * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by * a backing `ncclMemoryStack` passed during Alloc(). If memory * backing any currently held object is deallocated then it is an error to do * anything other than reconstruct it, after which it is a valid empty pool. 
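 *
 * Illustrative usage (the object type is hypothetical; the pool borrows memory from a
 * caller-owned ncclMemoryStack):
 *
 *   struct Obj { int x; };
 *   struct ncclMemoryStack stack;  struct ncclMemoryPool pool;
 *   ncclMemoryStackConstruct(&stack);
 *   ncclMemoryPoolConstruct(&pool);
 *   struct Obj* o = ncclMemoryPoolAlloc<struct Obj>(&pool, &stack);  // zero-initialized
 *   ncclMemoryPoolFree(&pool, o);                                    // back on the free list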
*/ struct ncclMemoryPool; // Equivalent to zero-initialization void ncclMemoryPoolConstruct(struct ncclMemoryPool* me); template T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing); template void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj); void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from); //////////////////////////////////////////////////////////////////////////////// /* ncclIntruQueue: A singly-linked list queue where the per-object next pointer * field is given via the `next` template argument. * * Example: * struct Foo { * struct Foo *next1, *next2; // can be a member of two lists at once * }; * ncclIntruQueue list1; * ncclIntruQueue list2; */ template struct ncclIntruQueue; template void ncclIntruQueueConstruct(ncclIntruQueue *me); template bool ncclIntruQueueEmpty(ncclIntruQueue *me); template T* ncclIntruQueueHead(ncclIntruQueue *me); template void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x); template T* ncclIntruQueueDequeue(ncclIntruQueue *me); template T* ncclIntruQueueTryDequeue(ncclIntruQueue *me); template void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *memPool); //////////////////////////////////////////////////////////////////////////////// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" * and "cond" fields are part of the public interface. */ struct ncclThreadSignal { pthread_mutex_t mutex; pthread_cond_t cond; }; // returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER} constexpr ncclThreadSignal ncclThreadSignalStaticInitializer(); void ncclThreadSignalConstruct(struct ncclThreadSignal* me); void ncclThreadSignalDestruct(struct ncclThreadSignal* me); // A convenience instance per-thread. extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance; //////////////////////////////////////////////////////////////////////////////// template struct ncclIntruQueueMpsc; template void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me); template bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me); // Enqueue element. Returns true if queue is not abandoned. Even if queue is // abandoned the element enqueued, so the caller needs to make arrangements for // the queue to be tended. template bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc* me, T* x); // Dequeue all elements at a glance. If there aren't any and `waitSome` is // true then this call will wait until it can return a non empty list. template T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc* me, bool waitSome); // Dequeue all elements and set queue to abandoned state. 
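//
// Illustrative multi-producer/single-consumer sketch (the element type and variable
// names are hypothetical; the template arguments follow the same member-pointer
// convention as ncclIntruQueue above):
//   struct Task { struct Task* next; };
//   ncclIntruQueueMpsc<Task, &Task::next> q;
//   ncclIntruQueueMpscConstruct(&q);
//   ncclIntruQueueMpscEnqueue(&q, task);                                // any producer thread
//   Task* batch = ncclIntruQueueMpscDequeueAll(&q, /*waitSome=*/true);  // the single consumer
//   Task* leftovers = ncclIntruQueueMpscAbandon(&q);                    // e.g. at shutdown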
template T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); //////////////////////////////////////////////////////////////////////////////// struct ncclMemoryStack { struct Hunk { struct Hunk* above; // reverse stack pointer size_t size; // size of this allocation (including this header struct) }; struct Unhunk { // proxy header for objects allocated out-of-hunk struct Unhunk* next; void* obj; }; struct Frame { struct Hunk* hunk; // top of non-empty hunks uintptr_t bumper, end; // points into top hunk struct Unhunk* unhunks; struct Frame* below; }; static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); struct Hunk stub; struct Frame topFrame; }; inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { me->stub.above = nullptr; me->stub.size = 0; me->topFrame.hunk = &me->stub; me->topFrame.bumper = 0; me->topFrame.end = 0; me->topFrame.unhunks = nullptr; me->topFrame.below = nullptr; } inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); void* obj; if (__builtin_expect(o + size <= me->topFrame.end, true)) { me->topFrame.bumper = o + size; obj = reinterpret_cast(o); } else { obj = allocateSpilled(me, size, align); } return obj; } template inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); memset(obj, 0, n*sizeof(T)); return (T*)obj; } inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { using Frame = ncclMemoryStack::Frame; Frame tmp = me->topFrame; Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); *snapshot = tmp; // C++ struct assignment me->topFrame.unhunks = nullptr; me->topFrame.below = snapshot; } inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; while (un != nullptr) { free(un->obj); un = un->next; } me->topFrame = *me->topFrame.below; // C++ struct assignment } //////////////////////////////////////////////////////////////////////////////// struct ncclMemoryPool { struct Cell { Cell *next; }; template union CellSized { Cell cell; alignas(Align) char space[Size]; }; struct Cell* head; struct Cell* tail; // meaningful only when head != nullptr }; inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; } template inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) { using Cell = ncclMemoryPool::Cell; using CellSized = ncclMemoryPool::CellSized; Cell* cell; if (__builtin_expect(me->head != nullptr, true)) { cell = me->head; me->head = cell->next; } else { // Use the internal allocate() since it doesn't memset to 0 yet. 
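// Carve a fresh cell out of the backing stack; the memset below (shared with the
// recycled-cell path) then zeroes the object bytes.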
cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized)); } memset(cell, 0, sizeof(T)); return reinterpret_cast(cell); } template inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { using Cell = ncclMemoryPool::Cell; Cell* cell = reinterpret_cast(obj); cell->next = me->head; if (me->head == nullptr) me->tail = cell; me->head = cell; } inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { if (from->head != nullptr) { from->tail->next = me->head; if (me->head == nullptr) me->tail = from->tail; me->head = from->head; from->head = nullptr; } } //////////////////////////////////////////////////////////////////////////////// template struct ncclIntruQueue { T *head, *tail; }; template inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { me->head = nullptr; me->tail = nullptr; } template inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { return me->head == nullptr; } template inline T* ncclIntruQueueHead(ncclIntruQueue *me) { return me->head; } template inline T* ncclIntruQueueTail(ncclIntruQueue *me) { return me->tail; } template inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { x->*next = nullptr; (me->head ? me->tail->*next : me->head) = x; me->tail = x; } template inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { T *ans = me->head; me->head = ans->*next; if (me->head == nullptr) me->tail = nullptr; return ans; } template inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { T *ans = me->head; if (ans != nullptr) { me->head = ans->*next; if (me->head == nullptr) me->tail = nullptr; } return ans; } template void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { T *head = me->head; me->head = nullptr; me->tail = nullptr; while (head != nullptr) { T *tmp = head->*next; ncclMemoryPoolFree(pool, tmp); head = tmp; } } //////////////////////////////////////////////////////////////////////////////// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; } inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { pthread_mutex_init(&me->mutex, nullptr); pthread_cond_init(&me->cond, nullptr); } inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { pthread_mutex_destroy(&me->mutex); pthread_cond_destroy(&me->cond); } //////////////////////////////////////////////////////////////////////////////// template struct ncclIntruQueueMpsc { T* head; uintptr_t tail; struct ncclThreadSignal* waiting; }; template void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { me->head = nullptr; me->tail = 0x0; me->waiting = nullptr; } template bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; } template bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); T* prev = reinterpret_cast(utail); T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); if (utail == 0x1) { // waiting __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting // This lock/unlock is essential to ensure we don't race ahead of the consumer // and signal the cond before they begin waiting on it. 
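// (Taking and immediately releasing waiting->mutex below guarantees that the consumer,
// which set tail to 0x1 while holding that mutex, has reached pthread_cond_wait,
// so the broadcast cannot be lost.)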
struct ncclThreadSignal* waiting = me->waiting; pthread_mutex_lock(&waiting->mutex); pthread_mutex_unlock(&waiting->mutex); pthread_cond_broadcast(&waiting->cond); } return utail != 0x2; // not abandoned } template T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); if (head == nullptr) { if (!waitSome) return nullptr; uint64_t t0 = clockNano(); bool sleeping = false; do { if (clockNano()-t0 >= 10*1000) { // spin for first 10us struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; pthread_mutex_lock(&waitSignal->mutex); uintptr_t expected = sleeping ? 0x1 : 0x0; uintptr_t desired = 0x1; me->waiting = waitSignal; // release done by successful compare exchange if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { sleeping = true; pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); } pthread_mutex_unlock(&waitSignal->mutex); } head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); } while (head == nullptr); } __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); T *x = head; while (x != tail) { T *x1; int spins = 0; while (true) { x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); if (x1 != nullptr) break; if (++spins == 1024) { spins = 1024-1; sched_yield(); } } x = x1; } return head; } template T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { uintptr_t expected = 0x0; if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { return nullptr; } else { int spins = 0; T* head; while (true) { head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); if (head != nullptr) break; if (++spins == 1024) { spins = 1024-1; sched_yield(); } } __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); T *x = head; while (x != tail) { T *x1; spins = 0; while (true) { x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); if (x1 != nullptr) break; if (++spins == 1024) { spins = 1024-1; sched_yield(); } } x = x1; } return head; } } #endif nccl-2.18.3-1/src/init.cc000066400000000000000000002444301444201535000147300ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "channel.h" #include "nvmlwrap.h" #include "gdrwrap.h" #include "bootstrap.h" #include "transport.h" #include "group.h" #include "net.h" #include "coll_net.h" #include "enqueue.h" #include "graph.h" #include "argcheck.h" #include #include #include #include #include #include #include #include #define STR2(v) #v #define STR(v) STR2(v) #if CUDART_VERSION >= 9020 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream #endif const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT); static ncclResult_t commReclaim(ncclComm_t comm); static uint64_t hashUniqueId(ncclUniqueId const &id) { char const *bytes = (char const*)&id; uint64_t h = 0xdeadbeef; for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { h ^= h >> 32; h *= 0x8db3db47fa2994ad; h += bytes[i]; } return h; } // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); // GDRCOPY support gdr_t ncclGdrCopy = NULL; ncclResult_t initGdrCopy() { if (ncclParamGdrCopyEnable() == 1) { ncclGdrCopy = ncclGdrInit(); } return ncclSuccess; } pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; static ncclResult_t ncclInit() { if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return ncclSuccess; pthread_mutex_lock(&initLock); if (!initialized) { initEnv(); initGdrCopy(); // Always initialize bootstrap network NCCLCHECK(bootstrapNetInit()); NCCLCHECK(ncclNetPluginInit()); initNvtxRegisteredEnums(); __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); } pthread_mutex_unlock(&initLock); return ncclSuccess; } NCCL_API(ncclResult_t, ncclGetVersion, int* version); ncclResult_t ncclGetVersion(int* version) { if (version == NULL) return ncclInvalidArgument; *version = NCCL_VERSION_CODE; return ncclSuccess; } NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { NCCLCHECK(ncclInit()); NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out); TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); return res; } // Prevent compiler from optimizing out these operations #ifdef __clang__ #define NCCL_NO_OPTIMIZE __attribute__((optnone)) #else #define NCCL_NO_OPTIMIZE __attribute__((optimize("O0"))) #endif void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { // Important that this does not trash intraComm0. 
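// Poisoning rank/cudaDev/busId/nRanks with -1 makes accidental use of a
// destroyed communicator fail loudly instead of silently reading freed state;
// intraComm0 is left intact since the intra-process cleanup path may still
// follow it.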
comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1; } #undef NCCL_NO_OPTIMIZE static ncclResult_t ncclDestructorFnFree(struct ncclDestructor* dtor) { free(dtor->obj); return ncclSuccess; } void ncclCommPushFree(struct ncclComm* comm, void* obj) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnFree; dtor->obj = obj; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t ncclDestructorFnCudaFree(struct ncclDestructor* dtor) { NCCLCHECK(ncclCudaFree(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnCudaFree; dtor->obj = obj; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) { CUDACHECK(cudaFreeHost(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnCudaHostFree; dtor->obj = obj; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t ncclDestructorFnCudaGdrFree(struct ncclDestructor* dtor) { NCCLCHECK(ncclGdrCudaFree(dtor->obj)); return ncclSuccess; } void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle) { struct ncclDestructor* dtor = ncclMemoryStackAlloc(&comm->memPermanent); dtor->fn = ncclDestructorFnCudaGdrFree; dtor->obj = handle; dtor->next = comm->destructorHead; comm->destructorHead = dtor; } static ncclResult_t commFree(ncclComm_t comm) { /* commFree() should not involve any sync among ranks. */ if (comm == NULL) return ncclSuccess; /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will * free all intra-process communicators; therefore, we only need to focus on local * resource cleanup in commFree(). 
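 * Nothing here may synchronize with or wait on other ranks: by the time this
 * rank reaches commFree(), its peers may already have torn their side down.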
*/ if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) { pthread_join(comm->proxyState->thread, nullptr); } delete[] comm->userRedOps; free(comm->connectSend); free(comm->connectRecv); free(comm->peerInfo); if (comm->topo) ncclTopoFree(comm->topo); if (comm->nodeRanks) { for (int n=0; nnNodes; n++) free(comm->nodeRanks[n].localRankToRank); free(comm->nodeRanks); } free(comm->rankToNode); free(comm->rankToLocalRank); free(comm->collNetHeads); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); for (int channel=0; channelchannels+channel, comm->nRanks, 1, comm->localRanks)); if (comm->sharedRes) { if (ncclAtomicRefCountDecrement(&comm->sharedRes->refCount) == 0) { for (int c=0; csharedRes->peers[c]) free(comm->sharedRes->peers[c]); if (comm->sharedRes->devPeers[c]) ncclCudaFree(comm->sharedRes->devPeers[c]); } free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); NCCLCHECK(ncclProxyDestroy(comm)); free(comm->sharedRes); } } if (comm->nvlsSupport) NCCLCHECK(ncclNvlsFree(comm)); struct ncclDestructor* dtor = comm->destructorHead; while (dtor != nullptr) { NCCLCHECK(dtor->fn(dtor)); dtor = dtor->next; } ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) { NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag)); free(comm->abortFlagRefCount); } free((void*)comm->config.netName); free(comm->topParentRanks); free(comm->topParentLocalRanks); commPoison(comm); // poison comm before free to avoid comm reuse. free(comm); return ncclSuccess; } NCCL_PARAM(AggChannelSize, "AGG_CHANNEL_SIZE", -2); NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); NCCL_PARAM(WorkFifoDepth, "WORK_FIFO_DEPTH", 64<<10); enum ncclLaunchMode ncclParamLaunchMode; NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 1); // Detect DMA-BUF support static ncclResult_t dmaBufSupported(struct ncclComm* comm) { if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL || ncclCudaLibraryInit() != ncclSuccess) return ncclInternalError; #if CUDA_VERSION >= 11070 int flag = 0; CUdevice dev; int cudaDriverVersion; CUDACHECK(cudaDriverGetVersion(&cudaDriverVersion)); if (CUPFN(cuDeviceGet) == NULL || cudaDriverVersion < 11070) return ncclInternalError; CUCHECK(cuDeviceGet(&dev, comm->cudaDev)); // Query device to see if DMA-BUF support is available (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev)); if (flag == 0) return ncclInternalError; INFO(NCCL_INIT, "DMA-BUF is available on GPU device %d", comm->cudaDev); return ncclSuccess; #endif return ncclInternalError; } ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { /* comm must be ready, or error will be reported */ ncclResult_t ret = ncclSuccess; if (*comm->abortFlag) { ncclGroupJobAbort(); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); if (ret != ncclSuccess) { /* if ret is not ncclInProgress, we just keep it. 
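 * ncclInProgress, however, is turned into ncclInvalidArgument below: issuing
 * new work on a communicator whose nonblocking init has not completed yet is
 * a usage error rather than something the caller should keep retrying here.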
*/ WARN("Attempt to use communicator before the previous operation returned ncclSuccess"); if (ret == ncclInProgress) ret = ncclInvalidArgument; goto exit; } } exit: return ret; } static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, int ndev, int rank) { if (ndev < 1) { WARN("invalid device count (%d) requested", ndev); return ncclInvalidArgument; } if (rank >= ndev || rank < 0) { WARN("rank %d exceeds ndev=%d", rank, ndev); return ncclInvalidArgument; } ncclMemoryStackConstruct(&comm->memPermanent); ncclMemoryStackConstruct(&comm->memScoped); comm->destructorHead = nullptr; comm->rank = rank; comm->nRanks = ndev; NCCLCHECK(ncclNetInit(comm)); INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); if (parent && parent->config.splitShare) { if (parent->ncclNet != comm->ncclNet) { WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name); return ncclInvalidUsage; } } // Try to create a CUDA object right away. If there is something wrong with // the device we're on (failure cause #1) , better know it early. CUDACHECK(cudaGetDevice(&comm->cudaDev)); NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); nvmlDevice_t nvmlDev; char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; NCCLCHECK(int64ToBusId(comm->busId, busId)); NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev)); NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&comm->nvmlDev)); comm->compCap = ncclCudaCompCap(); TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false; comm->collNetSupport = 0; memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix)); ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList); comm->groupNext = reinterpret_cast(0x1); comm->preconnectNext = reinterpret_cast(0x1); comm->channelSize = ncclParamAggChannelSize(); static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks)); NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks)); // Mark channels as non initialized. for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1; if (parent == NULL || !parent->config.splitShare) { struct ncclSharedResources* sharedRes = NULL; NCCLCHECK(ncclCalloc(&sharedRes, 1)); /* most of attributes are assigned later in initTransportsRank(). 
*/ sharedRes->owner = comm; sharedRes->tpNRanks = comm->nRanks; NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); comm->sharedRes = sharedRes; sharedRes->refCount = 1; } else { comm->sharedRes = parent->sharedRes; ncclAtomicRefCountIncrement(&parent->sharedRes->refCount); } if (comm->topParentRanks == NULL) { NCCLCHECK(ncclCalloc(&comm->topParentRanks, comm->nRanks)); for (int i = 0; i < comm->nRanks; ++i) comm->topParentRanks[i] = i; } ncclIntruQueueMpscConstruct(&comm->callbackQueue); return ncclSuccess; } static ncclResult_t devCommSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; int nRanks = comm->nRanks; struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; tmpCommAndChans.comm.abortFlag = comm->abortFlag; for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; } tmpCommAndChans.comm.channels = &devCommAndChans->channels[0]; comm->workFifoDepth = ncclParamWorkFifoDepth(); if (0 != (comm->workFifoDepth & (comm->workFifoDepth-1))) { WARN("NCCL_WORK_FIFO_DEPTH=%d is being ignored because it is not a power of 2.", comm->workFifoDepth); comm->workFifoDepth = 64<<10; } tmpCommAndChans.comm.workFifoDepth = comm->workFifoDepth; if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { // The workFifoHeap lives in GDR mapped CUDA memory. NCCLCHECKGOTO(ncclGdrCudaCalloc(&comm->workFifoHeap, &comm->devWorkFifoHeap, comm->workFifoDepth, &comm->workFifoHeapGdrHandle), ret, fail); ncclCommPushCudaGdrFree(comm, comm->workFifoHeapGdrHandle); } else { // The workFifoHeap lives in cudaHost memory. 
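// Pinned host memory is the fallback when GDRCOPY is disabled or unavailable:
// the GPU then fetches work items straight from system memory instead of from
// the GDR-mapped CUDA buffer allocated in the branch above.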
comm->workFifoHeapGdrHandle = nullptr; NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoHeap, comm->workFifoDepth), ret, fail); ncclCommPushCudaHostFree(comm, comm->workFifoHeap); comm->devWorkFifoHeap = comm->workFifoHeap; } tmpCommAndChans.comm.workFifoHeap = comm->devWorkFifoHeap; NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoDone, MAXCHANNELS), ret, fail); ncclCommPushCudaHostFree(comm, comm->workFifoDone); comm->workFifoSent = 0; comm->workFifoAckdMin = 0; for (int c=0; c < MAXCHANNELS; c++) { tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers; tmpCommAndChans.channels[c].ring = comm->channels[c].ring; tmpCommAndChans.channels[c].ring.userRanks = comm->channels[c].devRingUserRanks; tmpCommAndChans.channels[c].tree = comm->channels[c].tree; tmpCommAndChans.channels[c].collnetChain = comm->channels[c].collnetChain; tmpCommAndChans.channels[c].collnetDirect = comm->channels[c].collnetDirect; tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; tmpCommAndChans.channels[c].workFifoDone = &comm->workFifoDone[c]; if (comm->channels[c].ring.userRanks != nullptr) { NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); } } NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); exit: CUDACHECK(cudaStreamSynchronize(comm->sharedRes->deviceStream.cudaStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; } // Pre-process the string so that running "strings" on the lib can quickly reveal the version. #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR) static void showVersion() { static int shown = 0; if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { printf("%s\n", VERSION_STRING); fflush(stdout); if (ncclDebugFile != stdout) INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files shown = 1; } } static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { info->rank = comm->rank; info->cudaDev = comm->cudaDev; info->nvmlDev = comm->nvmlDev; info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; // Get the device MAJOR:MINOR of /dev/shm so we can use that // information to decide whether we can use SHM for inter-process // communication in a container environment struct stat statbuf; SYSCHECK(stat("/dev/shm", &statbuf), "stat"); info->shmDev = statbuf.st_dev; info->busId = comm->busId; NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport)); info->comm = comm; info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap; return ncclSuccess; } static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); NCCLCHECK(initChannel(comm, channelId)); struct ncclRing* ring = &comm->channels[channelId].ring; // Find our ring-distance from rank zero and reorganize ranks to start with rank. 
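// Example: ringRanks = {2,0,3,1} and rank 3 gives ixZero=1, ixRank=2, so
// index=1 and userRanks = {3,1,2,0}, i.e. the same ring rotated to start at
// this rank.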
int ixZero=0, ixRank=0; for (int i=0; i < nranks; i++) { if (ringRanks[i] == 0) ixZero = i; if (ringRanks[i] == rank) ixRank = i; } ring->index = (ixRank-ixZero + nranks)%nranks; for (int i=0; iuserRanks[i] = ringRanks[(i+ixRank)%nranks]; } return ncclSuccess; } #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine)) #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t)) #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */ #define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */ NCCL_PARAM(BuffSize, "BUFFSIZE", -2); NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2); NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2); NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { int cpuArch, cpuVendor, cpuModel; NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() }; int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE }; if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM; for (int p=0; pbuffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; } if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize(); else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); else comm->p2pChunkSize = ncclParamP2pPciChunkSize(); if (comm->sharedRes->owner != comm) { /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */ comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize); } else { comm->sharedRes->tpP2pChunkSize = comm->p2pChunkSize; } INFO(NCCL_INIT, "P2P Chunksize set to %d", comm->p2pChunkSize); return ncclSuccess; } NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* collNetGraph) { ncclResult_t ret = ncclSuccess; int* heads = NULL; int rank = comm->rank; int collNetSetupFail = 0; int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P }; // Find all head ranks int nHeads = collNetGraph->nChannels; int highestTransportType0, highestTransportType1; char line[1024]; bool share; struct collnetShareInfo { int headPosition; int isMaster; }; struct collnetShareInfo* infos = NULL; NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail); // Head GPU index is always 0 for (int c = 0; c < nHeads; c++) { heads[c] = collNetGraph->intra[c * comm->localRanks + 0]; } comm->collNetHeads = heads; comm->collNetHeadsNum = nHeads; if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) { NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail); /* check whether child can share collnet resources of parent. Since parent builds each collnet communicator * based on heads with the same head position in each node, as long as the collnet heads of child comm * can match parent's heads, we can let child communicator share parent's collnet resources. 
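 * The match is verified collectively below: each head rank publishes the
 * position of its counterpart in the parent's head list, and sharing is only
 * enabled when every head found a match and all of them report the same
 * position.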
*/ for (int h = 0; h < nHeads; ++h) { int prev = INT_MIN; struct collnetShareInfo* myinfo; share = true; myinfo = infos + comm->rank; memset(myinfo, 0, sizeof(struct collnetShareInfo)); /* find the child head position in parent collnet heads. */ if (heads[h] == comm->rank) { myinfo->headPosition = -1; myinfo->isMaster = 1; for (int th = 0; th < parent->collNetHeadsNum; ++th) if (parent->topParentRanks[parent->collNetHeads[th]] == comm->topParentRanks[comm->rank]) { myinfo->headPosition = th; break; } } NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, infos, sizeof(struct collnetShareInfo)), ret, fail); for (int i = 0; i < comm->nRanks; ++i) { if (infos[i].isMaster) { if (prev == INT_MIN) prev = infos[i].headPosition; if (infos[i].headPosition == -1 || prev != infos[i].headPosition) { share = false; break; } } } if (share) { if (myinfo->isMaster) { comm->collNetSharedRes = parent->collNetSharedRes; comm->collNetChannels = std::min(std::max(comm->nChannels, comm->nvlsChannels), parent->collNetSharedRes->nChannels); for (int c = 0; c < comm->collNetChannels; ++c) NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail); } } else { /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be * lifted by sharp plugin/IB hardware in the future. */ collNetSetupFail = 1; if (comm->rank == 0) { WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); } goto fail; } } share = true; } else { /* this allocated buffer will be freed on proxy side */ NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1)); /* TODO: min or max? */ comm->collNetChannels = comm->collNetSharedRes->nChannels = std::max(comm->nChannels, comm->nvlsChannels); comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; for (int c = 0; c < comm->collNetChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); for (int h = 0; h < nHeads; h++) { const int head = heads[h]; collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv); if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend); } // Verify CollNet setup across ranks after trying the first channel if (c == 0) { NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); } } share = false; } if (share) { memcpy(comm->collNetSupportMatrix, parent->collNetSupportMatrix, sizeof(comm->collNetSupportMatrix)); } else { do { /* Initialize all entries in collNetSupportMatrix[redop][type]. Since some ranks don't connect to sharp we enable a (redop,type) if any rank claims support. */ const ncclRedOp_t redops[] = {ncclSum, ncclProd, ncclMin, ncclMax}; uint8_t(*matrix)[4][ncclNumTypes]; bool isHead = false; matrix = nullptr; NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); for (int h = 0; h < nHeads; h++) isHead |= (heads[h] == comm->rank); if (isHead) { for (int ty=0; ty < ncclNumTypes; ty++) { for (int i=0; i < 4; i++) { int support = 0; NCCLCHECKGOTO(collNetReduceSupport(comm, (ncclDataType_t)ty, redops[i], &support), ret, matrix_end); // bit 0 = not supported, bit 1 = supported matrix[rank][redops[i]][ty] = 1<<(support ? 
1 : 0); } } } NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, matrix, sizeof(*matrix)), ret, matrix_end); for (int ty=0; ty < ncclNumTypes; ty++) { for (int i=0; i < 4; i++) { int op = redops[i]; uint8_t accum = 0; for (int r=0; r < comm->nRanks; r++) accum |= matrix[r][op][ty]; // We support (redop, type) if some rank supports it and no rank doesn't support it comm->collNetSupportMatrix[op][ty] = (accum == (1<<1)); } } matrix_end: free(matrix); if (ret != ncclSuccess) goto fail; } while (0); } // Verify CollNet setup across ranks after trying all channels NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail); TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); line[0] = '\0'; for (int c = 0; c < comm->nChannels; c++) { struct ncclTree* chain = &comm->channels[c].collnetChain; snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d", c, chain->down[0], rank, chain->up); } line[1023] = '\0'; INFO(NCCL_INIT, "Collnet Chains %s", line); // Connect Collnet + chain for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0), ret, fail); for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1), ret, fail); INFO(NCCL_INIT, "Connected collnet + chain"); // Connect intra-node CollNet + Direct for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channelRecv = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0, &highestTransportType0), ret, fail); for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channelSend = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1, &highestTransportType1), ret, fail); // Exchange highest intra-node transport type among ranks // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; if (share) { comm->intraHighestTransportType = std::max(comm->intraHighestTransportType, parent->intraHighestTransportType); } NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail); for (int i = 0; i < comm->localRanks; i++) { if (highestTypes[i] > comm->intraHighestTransportType) comm->intraHighestTransportType = highestTypes[i]; } INFO(NCCL_INIT, "rank %d Connected CollNet", rank); exit: free(infos); return ret; fail: ncclTransportCollNetFree(comm); comm->collNetSupport = 0; goto exit; } static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) { // We use 2 AllGathers // 1. { peerInfo, comm, compCap} // 2. 
{ nChannels, graphInfo, topoRanks } ncclResult_t ret = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; cpu_set_t affinitySave; struct ncclTopoGraph ringGraph; struct ncclTopoGraph treeGraph; struct ncclTopoGraph collNetGraph; struct ncclTopoGraph nvlsGraph; struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; struct graphInfo { int pattern; int nChannels; int sameChannels; float bwIntra; float bwInter; int typeIntra; int typeInter; }; struct allGatherInfo { struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS]; struct ncclTopoRanks topoRanks; }; int nChannelsOrig; struct allGatherInfo *allGather3Data = NULL; struct ncclTopoRanks** allTopoRanks = NULL; int *nodesFirstRank = NULL, *nodesTreePatterns = NULL; int *rings = NULL; int* nvbPeers = NULL; struct ncclProxyConnector proxyConn; int* pxnPeers = NULL; int *topParentLocalRanks = NULL; int tpProxyRank; // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); for (int i = 0; i < nranks; i++) { if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); ret = ncclInvalidUsage; goto fail; } } // AllGather1 - end do { // Compute intra-process ranks int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap); for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap); for (int i = 0; i < nranks; i++) { if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { // Rank is in same process if (intraProcRanks == 0) intraProcRank0 = i; if (i == rank) intraProcRank = intraProcRanks; intraProcRanks++; if (intraProcRank0 == rank && rank != i) { comm->peerInfo[i].comm->intraNext = comm->intraNext; comm->intraNext = comm->peerInfo[i].comm; } } } TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) { WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); ret = ncclInternalError; goto fail; } struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm; assert(intraProcRank==0 ? 
comm==comm0 : true); comm->intraComm0 = comm0; comm->intraRank = intraProcRank; comm->intraRanks = intraProcRanks; comm->intraBarrierPhase = 0; comm->intraBarrierCounter = 0; comm->intraBarrierGate = 0; } while(0); // Topo detection / System graph creation NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail); // Compute paths between GPUs and NICs NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail); // Remove inaccessible GPUs and unused NICs NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail); // Recompute paths after trimming NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail); // Init search NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail); // Print final topology NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail); // Set Affinity to a CPU local the our GPU, so that all memory we allocate // on the host is local. NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail); if (CPU_COUNT(&comm->cpuAffinity)) { sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); } // Determine local CollNet support if (collNetSupport(comm)) { char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); if (collNetEnable != NULL) { INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); if (strcmp(collNetEnable, "1") == 0) { comm->collNetSupport = 1; } } } // Determine local Nvls support NCCLCHECK(ncclNvlsInit(comm)); // Get rings and trees ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; ringGraph.collNet = 0; ringGraph.minChannels = 1; ringGraph.maxChannels = MAXCHANNELS/2; NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail); treeGraph.id = 1; treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; treeGraph.collNet = 0; treeGraph.minChannels = ringGraph.nChannels; treeGraph.maxChannels = ringGraph.nChannels; NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail); collNetGraph.id = 2; collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; if (comm->collNetSupport) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); } else { collNetGraph.nChannels = 0; } nvlsGraph.id = 3; nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS; nvlsGraph.collNet = 0; nvlsGraph.minChannels = 1; nvlsGraph.maxChannels = MAXCHANNELS; if (comm->nvlsSupport) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail); } else { nvlsGraph.nChannels = 0; } // Initialize num P2P LL buffers for this communicator comm->allocP2pNetLLBuffers = ncclParamAllocP2pNetLLBuffers() == 1; if (comm->rank == ncclParamGraphDumpFileRank()) { struct ncclTopoGraph* dumpGraphs[4] = { &ringGraph, &treeGraph, &collNetGraph, &nvlsGraph }; NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 4, dumpGraphs), ret, fail); } // AllGather3 - begin NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail); for (int a=0; apattern; allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels; allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels; allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra; allGather3Data[rank].graphInfo[a].bwInter = 
graphs[a]->bwInter; allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra; allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter; } comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail); // Determine nNodes, firstRanks, ... NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail); for (int r=0; rnNodes && nodesFirstRank[node] != firstRank; node++); if (node == comm->nNodes) { comm->nNodes++; nodesFirstRank[node] = firstRank; // Record tree pattern of each node as they can be different depending on sm arch nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern; } comm->rankToNode[r] = node; } // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail); NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail); for (int r=0; rnRanks; r++) { int node = comm->rankToNode[r]; comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks; comm->nodeRanks[node].localRanks++; } // Allocate ranks arrays for each node for (int n=0; nnNodes; n++) { NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail); comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks); comm->nodeRanks[n].localRanks = 0; } // And fill the ranks arrays for (int r=0; rnRanks; r++) { int node = comm->rankToNode[r]; comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r; } comm->node = comm->rankToNode[rank]; comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank; comm->localRank = comm->rankToLocalRank[rank]; comm->localRanks = comm->nodeRanks[comm->node].localRanks; TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); ret = ncclInternalError; goto fail; } nChannelsOrig = comm->nChannels; NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail); for (int i=0; inChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels); graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels); graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra); graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter); graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra); graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); } if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = 0; } comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, 
ringGraph.nChannels); if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. for (int i=0; inChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } // Determine CollNet support after all-gather now that we know nNodes and each node localRanks if (comm->collNetSupport == 1) { int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); if (comm->nNodes < collNetNodeThreshold) { INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); comm->collNetSupport = 0; } for (int n=0; nnNodes; n++) { if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) { WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1); comm->collNetSupport = 0; break; } } } NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs), ret, fail); // AllGather3 - end TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); char line[1024]; line[0]='\0'; for (int c=0; cnChannels; c++) { struct ncclTree* tree = &comm->channels[c].tree; snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d", c, tree->down[0], tree->down[1], tree->down[2], rank, tree->up); INFO(NCCL_GRAPH, "Ring %02d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next); } line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail); // Compute nChannels per peer for p2p NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail); /* until now, all info of comm should be known. We can initialize shared resources and * map localRanks to top parent local ranks. NOTE: this shareRes init must be put before * all proxy operations. */ if (comm->sharedRes->owner == comm) { comm->sharedRes->tpNLocalRanks = comm->localRanks; comm->sharedRes->magic = comm->magic; comm->sharedRes->tpNChannels = comm->nChannels; comm->sharedRes->tpP2pNChannels = comm->p2pnChannels; memcpy(comm->sharedRes->tpRankToLocalRank, comm->rankToLocalRank, sizeof(int) * comm->nRanks); } NCCLCHECKGOTO(ncclCalloc(&topParentLocalRanks, comm->localRanks), ret, fail); for (int i = 0; i < comm->localRanks; ++i) { int tpRank = comm->topParentRanks[comm->localRankToRank[i]]; topParentLocalRanks[i] = comm->sharedRes->tpRankToLocalRank[tpRank]; } comm->topParentLocalRanks = topParentLocalRanks; // Launch proxy service thread, after this, the proxy calls can be used. 
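// The transport connection setup below (rings, trees, NVLS, CollNet) relies
// on proxy operations, so the proxy service must exist before any of it runs.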
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail); // Connect with prev/next for each ring for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); if (comm->nRanks == 1) continue; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail); INFO(NCCL_INIT, "Connected all rings"); // Connect Trees for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (comm->nRanks == 1) continue; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail); INFO(NCCL_INIT, "Connected all trees"); // Setup NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); // And NVLS trees if needed if (comm->nvlsSupport && comm->localRanks > 1) { for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &nvlsGraph, 0), ret, fail); INFO(NCCL_INIT, "Connected NVLS tree"); } // Check if we can setup CollNet if (comm->collNetSupport > 0) collNetTrySetup(comm, parent, &collNetGraph); TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); do { // Setup p2p structures in comm->tasks struct ncclTasks* tasks = &comm->tasks; int node = comm->node; int nNodes = comm->nNodes; struct ncclNodeRanks *nodeRanks = comm->nodeRanks; int localRank = comm->localRank; // We want to fuse along node boundaries. Make sure nsteps is a multiple or divides 8. int steps = ALIGN_POWER(comm->maxLocalRanks, NCCL_MAX_WORK_ELEMENTS_P2P/2); tasks->p2pOrderSteps = comm->nNodes * steps; tasks->peers = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); tasks->p2pSendOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); tasks->p2pRecvOrder = ncclMemoryStackAlloc(&comm->memPermanent, tasks->p2pOrderSteps); int i=0; // schedule delta 0, +1, -1, +2, -2, ... // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. for (int d=0; d <= nNodes/4; d++) { int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; int index = 0; int delta = deltas[index]; sched_delta: int recvNode = (node+nNodes-delta)%nNodes; int sendNode = (node+delta)%nNodes; for (int step=0; step < steps; step++) { int recvIndex = (localRank-step+steps)%steps; int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? 
nodeRanks[recvNode].localRankToRank[recvIndex] : -1; tasks->p2pRecvOrder[i] = recvRank; int sendIndex = (localRank+step)%steps; int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1; tasks->p2pSendOrder[i] = sendRank; i++; } index++; if (index == 1 && deltas[1] == deltas[0]) index++; if (index == 2 && deltas[2] == deltas[0]) index++; if (index == 3 && deltas[3] == deltas[2]) index++; if (index == 3 && deltas[3] == deltas[1]) index++; if (index < 4) { delta = deltas[index]; goto sched_delta; } } assert(i == tasks->p2pOrderSteps); } while (0); if (ncclParamNvbPreconnect()) { // Connect p2p when using NVB path int nvbNpeers; NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail); for (int r=0; rp2pnChannelsPerPeer; c++) { NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail); if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { comm->connectSend[peer] |= (1UL<p2pnChannelsPerPeer; c++) { NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail); if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { comm->connectRecv[peer] |= (1UL<topParentRanks[comm->rank]; NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); // Then to remote ones when using PXN if (ncclPxnDisable(comm) == 0) { int nranks; NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail); for (int r=0; rtopParentRanks[pxnPeers[r]]; NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail); } } if (comm->intraRank == 0) { // Load ncclParamLaunchMode char* str = getenv("NCCL_LAUNCH_MODE"); enum ncclLaunchMode mode, modeOld; if (str && strcasecmp(str, "GROUP") == 0) { mode = ncclLaunchModeGroup; } else { mode = ncclLaunchModeParallel; } // In theory we could be racing with other communicators not associated with // this one if the user is connecting to multiple ncclUniqueId's concurrently. modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED); if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') { INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP"); } } // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. NCCLCHECKGOTO(devCommSetup(comm), ret, fail); /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); // We should have allocated all buffers, collective fifos, ... we can // restore the affinity. TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); exit: if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be * properly cleaned up. 
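 * (If the pool is backed by a POSIX shared-memory segment, unlinking only
 * removes its name; mappings that already exist stay valid until they are
 * unmapped.)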
*/ if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); free(allGather3Data); free(rings); free(nvbPeers); free(pxnPeers); return ret; fail: goto exit; } NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT); // Match config max/minCTAs NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT); #define NCCL_MAX_CGA_CLUSTER_SIZE 8 struct ncclCommInitRankAsyncJob { struct ncclAsyncJob base; struct ncclComm* comm; struct ncclComm** newcomm; int cudaDev; // For ncclCommInitRank int nranks, myrank; ncclUniqueId commId; // for ncclCommSplit struct ncclComm* parent; int color, key; }; struct ncclCommFinalizeAsyncJob { struct ncclAsyncJob base; ncclComm_t comm; }; NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { int* colors = NULL; int* keys = NULL; int nRanks = 0, myRank = 0; ncclResult_t ret = ncclSuccess; NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail); // Compute nRanks, my rank and the ranks (of the original comm) before and after me colors[parent->rank] = color; keys[parent->rank] = key; NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail); // Negative color does not create a new comm. Return now. if (color == NCCL_SPLIT_NOCOLOR) goto exit; memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks); for (int i = 0; i < parent->nRanks; i++) { if (colors[i] != color) continue; // Find where to insert this rank int insert = 0; while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++; // Shift ranks by one after insert for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1]; // Insert our rank parentRanksRet[insert] = i; nRanks++; } for (int i = 0; i < nRanks; i++) { if (parentRanksRet[i] == parent->rank) myRank = i; } *nRanksRet = nRanks; *myRankRet = myRank; exit: free(colors); free(keys); return ret; fail: goto exit; } static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclComm_t comm = job->comm; ncclResult_t res = ncclSuccess; int archMajor, archMinor; size_t maxLocalSizeBytes = 0; int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 10*archMinor; NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. 
NVSHMEM issue) if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); } if (job->parent) { NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail); } else { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail); } comm->cudaArch = cudaArch; comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail); // update communicator state comm->initState = ncclSuccess; // Trace this call for replay tool if (job->parent) { /* unlink child abort flag. */ __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE); TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks); } else { TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev); } INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId)); exit: if (job->newcomm) { /* assign it to user pointer. */ __atomic_store_n(job->newcomm, comm, __ATOMIC_RELEASE); } free(parentRanks); return res; fail: comm->initState = res; goto exit; } #define NCCL_CONFIG_DEFAULT(config, field, undef, defvalue, fieldStr, format) \ if (config->field == undef) { \ config->field = defvalue; \ } else { \ INFO(NCCL_ENV, "Comm config " fieldStr " set to " format, config->field); \ } static ncclResult_t envConfigOverride(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; const char* tmpNetName = comm->config.netName; const char* envNetName; int blockingEnv; int cgaClusterSizeEnv; int minCTAsEnv; int maxCTAsEnv; int splitShareEnv; /* override configuration from env variable. */ blockingEnv = ncclParamCommBlocking(); if (blockingEnv == 0 || blockingEnv == 1) comm->config.blocking = blockingEnv; cgaClusterSizeEnv = ncclParamCGAClusterSize(); if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { comm->config.cgaClusterSize = cgaClusterSizeEnv; } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. 
Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; } minCTAsEnv = ncclParamMinCTAs(); if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.minCTAs = minCTAsEnv; } maxCTAsEnv = ncclParamMaxCTAs(); if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.maxCTAs = maxCTAsEnv; } envNetName = getenv("NCCL_NET"); if (envNetName) tmpNetName = envNetName; if (tmpNetName != NULL) { int netNameLen = strlen(tmpNetName) + 1; comm->config.netName = (char*)malloc(netNameLen); memcpy((void*)comm->config.netName, tmpNetName, netNameLen); } else { comm->config.netName = NULL; } splitShareEnv = ncclParamCommSplitShareResources(); if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.splitShare = splitShareEnv; } /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.minCTAs = MAXCHANNELS; } if (comm->config.maxCTAs > MAXCHANNELS) { WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.maxCTAs = MAXCHANNELS; } if (comm->config.minCTAs > comm->config.maxCTAs) { WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); comm->config.minCTAs = comm->config.maxCTAs; } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { WARN("splitShare %d is not a valid value 0/1, set it to 0\n", comm->config.splitShare); comm->config.splitShare = 0; } return ret; } static ncclResult_t copyCommConfig(ncclComm_t childComm, ncclComm_t parnet) { memcpy(&childComm->config, &parnet->config, sizeof(ncclConfig_t)); NCCLCHECK(envConfigOverride(childComm)); return ncclSuccess; } static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { ncclResult_t ret = ncclSuccess; /* config must not be NULL in this function */ ncclConfig_t defaultConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr; size_t realSize; internalConfigPtr = &internalConfig; if (config) { memcpy((void*)&realSize, (void*)config, sizeof(size_t)); realSize = realSize > sizeof(ncclConfig_t) ? sizeof(ncclConfig_t) : realSize; memcpy((void*)internalConfigPtr, (void*)config, realSize); if (internalConfigPtr->magic != 0xcafebeef) { WARN("ncclConfig_t argument not initialized via NCCL_CONFIG_INITIALIZER"); ret = ncclInvalidArgument; goto fail; } /* check version. */ if (internalConfigPtr->version < NCCL_VERSION(2, 14, 0)) { internalConfigPtr->blocking = defaultConfig.blocking; } if (internalConfigPtr->version < NCCL_VERSION(2, 17, 0)) { internalConfigPtr->cgaClusterSize = defaultConfig.cgaClusterSize; internalConfigPtr->minCTAs = defaultConfig.minCTAs; internalConfigPtr->maxCTAs = defaultConfig.maxCTAs; internalConfigPtr->netName = defaultConfig.netName; } } /* check input config attributes, -1 means user-undefined and we should use default value from NCCL. 
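 * Fields left at NCCL_CONFIG_UNDEF_INT / NCCL_CONFIG_UNDEF_PTR skip the range
 * checks below and receive their platform defaults from the
 * NCCL_CONFIG_DEFAULT() expansions further down.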
*/ if (internalConfigPtr->blocking != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->blocking != 0 && internalConfigPtr->blocking != 1) { WARN("Invalid config blocking attribute value %d", internalConfigPtr->blocking); ret = ncclInvalidArgument; goto fail; } if (internalConfigPtr->cgaClusterSize != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->cgaClusterSize < 0) { WARN("Invalid config cgaClusterSize attribute value %d", internalConfigPtr->cgaClusterSize); ret = ncclInvalidArgument; goto fail; } if ((internalConfigPtr->minCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->minCTAs <= 0) || (internalConfigPtr->maxCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->maxCTAs <= 0) || (internalConfigPtr->minCTAs > internalConfigPtr->maxCTAs)) { WARN("Invalid config min/max channels attribute value %d/%d", internalConfigPtr->minCTAs, internalConfigPtr->maxCTAs); ret = ncclInvalidArgument; goto fail; } if (internalConfigPtr->splitShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->splitShare != 0 && internalConfigPtr->splitShare != 1) { WARN("Invalid config splitShare attribute value %d", internalConfigPtr->splitShare); ret = ncclInvalidArgument; goto fail; } /* default config value can be tuned on different platform. */ NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, minCTAs, NCCL_CONFIG_UNDEF_INT, 1, "Min CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; comm->config.cgaClusterSize = internalConfigPtr->cgaClusterSize; comm->config.minCTAs = internalConfigPtr->minCTAs; comm->config.maxCTAs = internalConfigPtr->maxCTAs; comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); exit: return ret; fail: goto exit; } static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) { ncclResult_t res = ncclSuccess; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob *job = NULL; char* env = getenv("NCCL_COMM_ID"); if (env && myrank == 0) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail); } NCCLCHECKGOTO(ncclInit(), res, fail); if (myrank == 0) showVersion(); // Make sure the CUDA runtime is initialized. 
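// cudaFree(NULL) is a harmless no-op whose only purpose is to force lazy
// creation of the CUDA context on the current device before we start
// allocating resources.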
CUDACHECKGOTO(cudaFree(NULL), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, fail); NCCLCHECKGOTO(PtrCheck(config, "CommInitRank", "config"), res, fail); if (nranks < 1 || myrank < 0 || myrank >= nranks) { WARN("Invalid rank requested : %d/%d", myrank, nranks); res = ncclInvalidArgument; goto fail; } NCCLCHECKGOTO(ncclCalloc(&comm, 1), res, fail); NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1), res, fail); NCCLCHECKGOTO(ncclCalloc((uint32_t**)&comm->abortFlagRefCount, 1), res, fail); *comm->abortFlagRefCount = 1; NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ comm->initState = ncclInternalError; *newcomm = comm; NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = comm; job->nranks = nranks; job->commId = commId; // C++ struct assignment job->myrank = myrank; job->cudaDev = cudaDev; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: if (comm) { if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag); if (comm->abortFlagRefCount) free(comm->abortFlagRefCount); free(comm); } if (newcomm) *newcomm = NULL; goto exit; } struct NvtxParamsCommInitRank { int rank; int nranks; int cudaDev; }; constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)}, {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)}, }; NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); int cudaDev; ncclConfig_t config = NCCL_CONFIG_INITIALIZER; CUDACHECK(cudaGetDevice(&cudaDev)); NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config)); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclResult_t ret = ncclSuccess; int totalnDev; int *gpuFlags = NULL; ncclConfig_t config = NCCL_CONFIG_INITIALIZER; constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} }; NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev) // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail); if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); ret = ncclInvalidArgument; goto fail; } CUDACHECKGOTO(cudaGetDeviceCount(&totalnDev), ret, fail); if (devlist) { NCCLCHECKGOTO(ncclCalloc(&gpuFlags, totalnDev), ret, fail); for (int i = 0; i < ndev; ++i) { /* invalid device check. */ if (devlist[i] < 0 || devlist[i] >= totalnDev) { ret = ncclUnhandledCudaError; goto fail; } /* duplicate device check. 
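Each CUDA device may appear at most once in devlist: a repeated entry is rejected below with ncclInvalidUsage rather than silently creating two ranks on the same GPU.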
*/ if (gpuFlags[devlist[i]] != 0) { ret = ncclInvalidUsage; goto fail; } gpuFlags[devlist[i]] = 1; } free(gpuFlags); gpuFlags = nullptr; } ncclUniqueId uniqueId; NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); NCCLCHECKGOTO(ncclGroupStart(), ret, fail); for (int i=0; i<ndev; i++) {
    // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
    ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
  }
  NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
fail:
  free(gpuFlags);
  return ret;
}
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) { if (nextState < 0 || nextState >= ncclNumResults || comm == NULL) { WARN("ncclCommSetAsyncError: error comm %p sets state %d", comm, nextState); return ncclInvalidArgument; } __atomic_store_n(&comm->asyncResult, nextState, __ATOMIC_RELEASE); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config); ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) { NVTX3_FUNC_RANGE_IN(nccl_domain); int cudaDev; ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr = NULL; NCCLCHECK(ncclGroupStartInternal()); (void)ncclCudaLibraryInit(); CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail); if (config == NULL) internalConfigPtr = &internalConfig; else internalConfigPtr = config; NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail); exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); goto exit; } static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; ncclComm_t comm = job->comm; int savedDevice; int commDevice = comm->cudaDev; ncclResult_t ret = ncclSuccess; CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail); if (savedDevice != commDevice) { CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail); } TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); if (comm->initState == ncclSuccess) { NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail); NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail); } NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die.
 while (comm->persistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); } if (savedDevice != commDevice) { CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); } comm->finalizeCalled = true; exit: return ret; fail: goto exit; } static ncclResult_t commCleanup(ncclComm_t comm) { int savedDevice; int commDevice = comm->cudaDev; CUDACHECK(cudaGetDevice(&savedDevice)); if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(commDevice)); } NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(savedDevice)); } return ncclSuccess; } static ncclResult_t commFinalize(ncclComm_t comm, bool userCalled) { ncclResult_t ret = ncclSuccess; struct ncclCommFinalizeAsyncJob *job = NULL; /* launch async thread to finalize comm.
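When userCalled is true the job is handed to ncclAsyncLaunch() so a nonblocking communicator does not stall the caller; on the internal destroy/abort path commDestroySync() runs inline and the job is freed immediately.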
*/ NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); job->comm = comm; if (userCalled) { NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail); } else { NCCLCHECKGOTO(commDestroySync(&job->base), ret, fail); free(job); } exit: return ncclGroupErrCheck(ret); fail: goto exit; } NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret = ncclSuccess; NCCLCHECK(ncclGroupStartInternal()); if (comm == NULL) goto exit; /* wait comm ready before finalize. */ NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); /* prevent double finalize. */ if (comm->finalizeCalled) { ret = ncclInvalidArgument; goto fail; } /* finalize comm. */ ret = commFinalize(comm, true); exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) }; return ret; fail: if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } static ncclResult_t commReclaim(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; ncclResult_t state; int curRank; /* Debug info */ NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail); TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state); if (state == ncclSuccess && *comm->abortFlag == 0 && comm->finalizeCalled == false) { /* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy * should be nonblocking until last call of ncclCommDestroy. */ NCCLCHECKGOTO(commFinalize(comm, false), ret, fail); } if (comm->intraComm0 != NULL) { int curRankCnt; int intraRanks = comm->intraRanks; ncclComm_t intracomm0 = comm->intraComm0; int *finalizeRankCnt = &intracomm0->finalizeRankCnt; assert(intracomm0 != NULL && finalizeRankCnt != NULL); curRankCnt = __atomic_add_fetch(finalizeRankCnt, 1, __ATOMIC_ACQ_REL); if (curRankCnt == intraRanks) { ncclComm_t curIntraComm; ncclComm_t nextIntraComm = intracomm0; /* this is the last call to ncclCommDestroy/Abort, we need to make sure all comms * in the process have been finalized before we free local resources. */ while (nextIntraComm) { curIntraComm = nextIntraComm; curRank = curIntraComm->rank; nextIntraComm = nextIntraComm->intraNext; if (curIntraComm->finalizeCalled == false) { struct ncclCommFinalizeAsyncJob job; job.comm = curIntraComm; /* every comm aborts, commDestroySync should not be blocked. */ if ((ret = commDestroySync((struct ncclAsyncJob*) &job)) != ncclSuccess) WARN("commReclaim: comm %p (rank = %d) in abort, error %d", curIntraComm, curRank, ret); } } /* ncclProxyStop() loop must be put after commDestroySync() loop. Namely, you cannot do: * while(...) { * commDestroySync(...); * ncclProxyStop(...); * } * Considering one process multi-gpu case, we must guarantee all kernels are complete before * we free proxy resources; otherwise, we will face invalid memory issues where proxy connection * and related intermediate memory from one rank are freed but other ranks are still using it. * This is not a problem for multi-process case, since intermediate memory is opened by CUDA IPC * or mmap where memory free is guarded by CUDA driver and operating system, so we will not have * invalid memory access issue. */ nextIntraComm = intracomm0; while (nextIntraComm) { curIntraComm = nextIntraComm; curRank = curIntraComm->rank; nextIntraComm = nextIntraComm->intraNext; /* free intraprocess proxy resources. 
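As explained above, ncclProxyStop() may only run once every intra-process communicator has completed commDestroySync(), guaranteeing that no kernel still uses the proxy's intermediate buffers.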
*/ if ((ret = ncclProxyStop(curIntraComm)) != ncclSuccess) { WARN("commReclaim: comm %p (rank = %d) destroys proxy resource error %d", curIntraComm, curRank, ret); } } /* free local resources. */ nextIntraComm = intracomm0; while (nextIntraComm) { curIntraComm = nextIntraComm; curRank = curIntraComm->rank; nextIntraComm = nextIntraComm->intraNext; if ((ret = commCleanup(curIntraComm)) != ncclSuccess) { WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret); } } } } exit: return ret; fail: goto exit; } NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) int64_t busId = comm->busId; TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); return ncclInvalidArgument; } /* init thread must be joined before we destroy the comm. */ NCCLCHECK(ncclCommEnsureReady(comm)); NCCLCHECK(commReclaim(comm)); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } volatile uint32_t* childAbortFlag; int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) int64_t busId = comm->busId; TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); // Ask anything that might still be running on the device to quit childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE); if (childAbortFlag != NULL) { *childAbortFlag = 1; } *comm->abortFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. */ ncclCommEnsureReady(comm); (void) commReclaim(comm); INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; NCCLCHECK(ncclGroupStartInternal()); NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail); NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail); /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. 
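The caller-visible handle is only filled in once ncclCommInitRankFunc, launched asynchronously below, has finished initializing the child communicator.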
*/ *newcomm = NCCL_COMM_NULL; if (color == NCCL_SPLIT_NOCOLOR) { INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank); } else { NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail); if (comm->config.splitShare) { childComm->abortFlag = comm->abortFlag; childComm->abortFlagRefCount = comm->abortFlagRefCount; comm->childAbortFlag = NULL; ncclAtomicRefCountIncrement(comm->abortFlagRefCount); } else { NCCLCHECKGOTO(ncclCudaHostCalloc((uint32_t**)&childComm->abortFlag, 1), res, fail); NCCLCHECKGOTO(ncclCalloc((uint32_t**)&childComm->abortFlagRefCount, 1), res, fail); /* temporarily used to abort everything during child comm init. */ comm->childAbortFlag = childComm->abortFlag; *childComm->abortFlagRefCount = 1; } if (config == NULL) { NCCLCHECKGOTO(copyCommConfig(childComm, comm), res, fail); } else { NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail); } /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ childComm->initState = ncclInternalError; } NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = childComm; job->newcomm = newcomm; job->parent = comm; job->color = color; job->key = key; job->cudaDev = comm->cudaDev; NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); return res; fail: if (childComm) { if (comm && !comm->config.splitShare) { if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag); if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount); } free(childComm); } if (newcomm) *newcomm = NULL; goto exit; } NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); const char* ncclGetErrorString(ncclResult_t code) { switch (code) { case ncclSuccess : return "no error"; case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for details)"; case ncclInternalError : return "internal error - please report this issue to the NCCL developers"; case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)"; case ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)"; case ncclRemoteError : return "remote process exited or there was a network error"; case ncclInProgress : return "NCCL operation in progress"; default : return "unknown result code"; } } /* Returns a human-readable message of the last error that occurred. * comm is currently unused and can be set to NULL */ NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm); const char* ncclGetLastError(ncclComm_t comm) { return ncclLastError; } NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); NCCLCHECK(PtrCheck(count, "CommCount", "count")); /* init thread must be joined before we access the attributes of comm. 
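ncclCommEnsureReady() below provides that synchronization before nRanks is read.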
*/ NCCLCHECK(ncclCommEnsureReady(comm)); *count = comm->nRanks; return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); NCCLCHECK(ncclCommEnsureReady(comm)); *devid = comm->cudaDev; return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { NVTX3_FUNC_RANGE_IN(nccl_domain); NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); NCCLCHECK(ncclCommEnsureReady(comm)); *rank = comm->rank; return ncclSuccess; } nccl-2.18.3-1/src/init_nvtx.cc000066400000000000000000000015001444201535000157740ustar00rootroot00000000000000#include "nccl.h" #include "nvtx.h" static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { {"Sum", ncclSum}, {"Product", ncclProd}, {"Max", ncclMax}, {"Min", ncclMin}, {"Avg", ncclAvg} }; // Must be called before the first call to any reduction operation. void initNvtxRegisteredEnums() { // Register schemas and strings constexpr const nvtxPayloadEnumAttr_t eAttr { .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, .name = NULL, .entries = NvtxEnumRedSchema, .numEntries = std::extent::value, .sizeOfEnum = sizeof(ncclRedOp_t), .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP }; nvtxPayloadEnumRegister(nvtx3::domain::get(), &eAttr); } nccl-2.18.3-1/src/misc/000077500000000000000000000000001444201535000144025ustar00rootroot00000000000000nccl-2.18.3-1/src/misc/argcheck.cc000066400000000000000000000056731444201535000164730ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "argcheck.h" #include "comm.h" static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { cudaPointerAttributes attr; cudaError_t err = cudaPointerGetAttributes(&attr, pointer); if (err != cudaSuccess || attr.devicePointer == NULL) { WARN("%s : %s %p is not a valid pointer", opname, ptrname, pointer); return ncclInvalidArgument; } #if CUDART_VERSION >= 10000 if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { #else if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { #endif WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); return ncclInvalidArgument; } return ncclSuccess; } ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { if (ptr == NULL) { WARN("%s : %s argument is NULL", opname, ptrname); return ncclInvalidArgument; } return ncclSuccess; } ncclResult_t ArgsCheck(struct ncclInfo* info) { // First, the easy ones if (info->root < 0 || info->root >= info->comm->nRanks) { WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); return ncclInvalidArgument; } if (info->datatype < 0 || info->datatype >= ncclNumTypes) { WARN("%s : invalid type %d", info->opName, info->datatype); return ncclInvalidArgument; } // Type is OK, compute nbytes. 
Convert Allgather/Broadcast/P2P calls to chars. NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks)); if (info->op < 0 || ncclMaxRedOp < info->op) { WARN("%s : invalid reduction operation %d", info->opName, info->op); return ncclInvalidArgument; } int opIx = int(ncclUserRedOpMangle(info->comm, info->op)) - int(ncclNumOps); if (ncclNumOps <= info->op && (info->comm->userRedOpCapacity <= opIx || info->comm->userRedOps[opIx].freeNext != -1)) { WARN("%s : reduction operation %d unknown to this communicator", info->opName, info->op); return ncclInvalidArgument; } if (info->comm->checkPointers) { if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv)) { if (info->count >0) NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); } else { // Check CUDA device pointers if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); } if (info->coll != ncclFuncReduce || info->comm->rank == info->root) { NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); } } } return ncclSuccess; } nccl-2.18.3-1/src/misc/cudawrap.cc000066400000000000000000000176321444201535000165300ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "debug.h" #include "param.h" #include "cudawrap.h" #include // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0); static int ncclCuMemSupported = 0; // Determine whether CUMEM & VMM RDMA is supported on this platform int ncclIsCuMemSupported() { #if CUDART_VERSION < 11030 return 0; #else CUdevice currentDev; int cudaDev; int cudaDriverVersion; int flag = 0; ncclResult_t ret = ncclSuccess; CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error); if (cudaDriverVersion < 12000) return 0; // Need CUDA_VISIBLE_DEVICES support CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error); if (CUPFN(cuMemCreate) == NULL) return 0; CUCHECKGOTO(cuDeviceGet(¤tDev, cudaDev), ret, error); // Query device to see if CUMEM VMM support is available CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error); if (!flag) return 0; // Query device to see if CUMEM RDMA support is available CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error); if (!flag) return 0; error: return (ret == ncclSuccess); #endif } int ncclCuMemEnable() { return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable()); } #define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ DECLARE_CUDA_PFN(cuDeviceGet, 2000); DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000); DECLARE_CUDA_PFN(cuGetErrorString, 6000); DECLARE_CUDA_PFN(cuGetErrorName, 6000); /* enqueue.cc */ DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020); /* proxy.cc */ DECLARE_CUDA_PFN(cuCtxCreate, 3020); DECLARE_CUDA_PFN(cuCtxDestroy, 4000); DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000); DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000); DECLARE_CUDA_PFN(cuCtxGetDevice, 2000); /* cuMem API support */ DECLARE_CUDA_PFN(cuMemAddressReserve, 10020); 
DECLARE_CUDA_PFN(cuMemAddressFree, 10020); DECLARE_CUDA_PFN(cuMemCreate, 10020); DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020); DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020); DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020); DECLARE_CUDA_PFN(cuMemMap, 10020); DECLARE_CUDA_PFN(cuMemRelease, 10020); DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000); DECLARE_CUDA_PFN(cuMemSetAccess, 10020); DECLARE_CUDA_PFN(cuMemUnmap, 10020); #if CUDA_VERSION >= 11070 /* transport/collNet.cc/net.cc*/ DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010); DECLARE_CUDA_PFN(cuMulticastBindMem, 12010); DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010); DECLARE_CUDA_PFN(cuMulticastCreate, 12010); DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010); DECLARE_CUDA_PFN(cuMulticastUnbind, 12010); #endif #endif /* CUDA Driver functions loaded with dlsym() */ DECLARE_CUDA_PFN(cuInit, 2000); DECLARE_CUDA_PFN(cuDriverGetVersion, 2020); DECLARE_CUDA_PFN(cuGetProcAddress, 11030); #define CUDA_DRIVER_MIN_VERSION 11030 static void *cudaLib; int ncclCudaDriverVersionCache = -1; bool ncclCudaLaunchBlocking = false; #if CUDART_VERSION >= 11030 /* Load the CUDA symbols */ static ncclResult_t cudaPfnFuncLoader(void) { CUresult res; #define LOAD_SYM(symbol, version, ignore) do { \ res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), version, 0); \ if (res != 0) { \ if (!ignore) { \ WARN("Retrieve %s version %d failed with %d", #symbol, version, res); \ return ncclSystemError; } \ } } while(0) LOAD_SYM(cuGetErrorString, 6000, 0); LOAD_SYM(cuGetErrorName, 6000, 0); LOAD_SYM(cuDeviceGet, 2000, 0); LOAD_SYM(cuDeviceGetAttribute, 2000, 0); LOAD_SYM(cuMemGetAddressRange, 3020, 1); LOAD_SYM(cuCtxCreate, 3020, 1); LOAD_SYM(cuCtxDestroy, 4000, 1); LOAD_SYM(cuCtxGetCurrent, 4000, 1); LOAD_SYM(cuCtxSetCurrent, 4000, 1); LOAD_SYM(cuCtxGetDevice, 2000, 1); /* cuMem API support */ LOAD_SYM(cuMemAddressReserve, 10020, 1); LOAD_SYM(cuMemAddressFree, 10020, 1); LOAD_SYM(cuMemCreate, 10020, 1); LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1); LOAD_SYM(cuMemExportToShareableHandle, 10020, 1); LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1); LOAD_SYM(cuMemMap, 10020, 1); LOAD_SYM(cuMemRelease, 10020, 1); LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1); LOAD_SYM(cuMemSetAccess, 10020, 1); LOAD_SYM(cuMemUnmap, 10020, 1); #if CUDA_VERSION >= 11070 LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ LOAD_SYM(cuMulticastAddDevice, 12010, 1); LOAD_SYM(cuMulticastBindMem, 12010, 1); LOAD_SYM(cuMulticastBindAddr, 12010, 1); LOAD_SYM(cuMulticastCreate, 12010, 1); LOAD_SYM(cuMulticastGetGranularity, 12010, 1); LOAD_SYM(cuMulticastUnbind, 12010, 1); #endif return ncclSuccess; } #endif static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t initResult; static void initOnceFunc() { do { char* val = getenv("CUDA_LAUNCH_BLOCKING"); ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0); } while (0); CUresult res; /* * Load CUDA driver library */ char path[1024]; char *ncclCudaPath = getenv("NCCL_CUDA_PATH"); if (ncclCudaPath == NULL) snprintf(path, 1024, "%s", "libcuda.so"); else snprintf(path, 1024, "%s/%s", ncclCudaPath, "libcuda.so"); (void) dlerror(); // Clear any previous errors cudaLib = dlopen(path, RTLD_LAZY); if (cudaLib == NULL) { WARN("Failed to find CUDA 
library %s (NCCL_CUDA_PATH='%s') : %s", path, ncclCudaPath ? ncclCudaPath : "", dlerror()); goto error; } /* * Load initial CUDA functions */ pfn_cuInit = (PFN_cuInit_v2000) dlsym(cudaLib, "cuInit"); if (pfn_cuInit == NULL) { WARN("Failed to load CUDA missing symbol cuInit"); goto error; } pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion_v2020) dlsym(cudaLib, "cuDriverGetVersion"); if (pfn_cuDriverGetVersion == NULL) { WARN("Failed to load CUDA missing symbol cuDriverGetVersion"); goto error; } int driverVersion; res = pfn_cuDriverGetVersion(&driverVersion); if (res != 0) { WARN("cuDriverGetVersion failed with %d", res); goto error; } INFO(NCCL_INIT, "cudaDriverVersion %d", driverVersion); if (driverVersion < CUDA_DRIVER_MIN_VERSION) { // WARN("CUDA Driver version found is %d. Minimum requirement is %d", driverVersion, CUDA_DRIVER_MIN_VERSION); // Silently ignore version check mismatch for backwards compatibility goto error; } pfn_cuGetProcAddress = (PFN_cuGetProcAddress_v11030) dlsym(cudaLib, "cuGetProcAddress"); if (pfn_cuGetProcAddress == NULL) { WARN("Failed to load CUDA missing symbol cuGetProcAddress"); goto error; } /* * Required to initialize the CUDA Driver. * Multiple calls of cuInit() will return immediately * without making any relevant change */ pfn_cuInit(0); #if CUDART_VERSION >= 11030 if (cudaPfnFuncLoader()) { WARN("CUDA some PFN functions not found in the library"); goto error; } #endif // Determine whether we support the cuMem APIs or not ncclCuMemSupported = ncclIsCuMemSupported(); initResult = ncclSuccess; return; error: initResult = ncclSystemError; return; } ncclResult_t ncclCudaLibraryInit() { pthread_once(&initOnceControl, initOnceFunc); return initResult; } nccl-2.18.3-1/src/misc/gdrwrap.cc000066400000000000000000000203331444201535000163600ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "gdrwrap.h" #ifndef GDR_DIRECT #include "core.h" /* Function pointers assigned from dlopen() */ static gdr_t (*gdr_internal_open)(void); static int (*gdr_internal_close)(gdr_t g); static int (*gdr_internal_pin_buffer)(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); static int (*gdr_internal_unpin_buffer)(gdr_t g, gdr_mh_t handle); static int (*gdr_internal_get_info)(gdr_t g, gdr_mh_t handle, gdr_info_t *info); static int (*gdr_internal_map)(gdr_t g, gdr_mh_t handle, void **va, size_t size); static int (*gdr_internal_unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size); static void (*gdr_internal_runtime_get_version)(int *major, int *minor); static void (*gdr_internal_driver_get_version)(gdr_t g, int *major, int *minor); static int (*gdr_internal_copy_to_mapping)(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); static int (*gdr_internal_copy_from_mapping)(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); // Used to make the GDR library calls thread safe pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; #define GDRAPI_LIBNAME "libgdrapi.so" #define LOAD_SYM(handle, symbol, funcptr) do { \ cast = (void**)&funcptr; \ tmp = dlsym(handle, symbol); \ if (tmp == NULL) { \ WARN("dlsym failed on %s - %s", symbol, dlerror());\ goto teardown; \ } \ *cast = tmp; \ } while (0) #define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\ cast = (void**)&funcptr; \ tmp = dlsym(handle, symbol); \ if (tmp == NULL) { \ INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \ } \ *cast = tmp; \ } while (0) static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t initResult; static void initOnceFunc(void) { static void* gdrhandle = NULL; void* tmp; void** cast; gdrhandle=dlopen(GDRAPI_LIBNAME, RTLD_NOW); if (!gdrhandle) { WARN("Failed to open %s", GDRAPI_LIBNAME); goto teardown; } /* Load the function pointers from the DL library image */ LOAD_SYM(gdrhandle, "gdr_open", gdr_internal_open); LOAD_SYM(gdrhandle, "gdr_close", gdr_internal_close); LOAD_SYM(gdrhandle, "gdr_pin_buffer", gdr_internal_pin_buffer); LOAD_SYM(gdrhandle, "gdr_unpin_buffer", gdr_internal_unpin_buffer); LOAD_SYM(gdrhandle, "gdr_get_info", gdr_internal_get_info); LOAD_SYM(gdrhandle, "gdr_map", gdr_internal_map); LOAD_SYM(gdrhandle, "gdr_unmap", gdr_internal_unmap); LOAD_SYM(gdrhandle, "gdr_runtime_get_version", gdr_internal_runtime_get_version); LOAD_SYM(gdrhandle, "gdr_driver_get_version", gdr_internal_driver_get_version); LOAD_SYM(gdrhandle, "gdr_copy_to_mapping", gdr_internal_copy_to_mapping); LOAD_SYM(gdrhandle, "gdr_copy_from_mapping", gdr_internal_copy_from_mapping); initResult = ncclSuccess; return; teardown: gdr_internal_open = NULL; gdr_internal_close = NULL; gdr_internal_pin_buffer = NULL; gdr_internal_unpin_buffer = NULL; gdr_internal_get_info = NULL; gdr_internal_map = NULL; gdr_internal_unmap = NULL; gdr_internal_runtime_get_version = NULL; gdr_internal_driver_get_version = NULL; gdr_internal_copy_to_mapping = NULL; gdr_internal_copy_from_mapping = NULL; if (gdrhandle != NULL) dlclose(gdrhandle); initResult = ncclSystemError; return; } ncclResult_t wrap_gdr_symbols(void) { pthread_once(&initOnceControl, initOnceFunc); return initResult; } gdr_t wrap_gdr_open(void) { if (gdr_internal_open == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return NULL; } return gdr_internal_open(); } 
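/*
 * Usage sketch (hypothetical caller; error handling omitted, gpuAddr/size/hostSrc are
 * placeholder names). After wrap_gdr_symbols() succeeds, the wrappers below are
 * typically chained like this to obtain a CPU-visible mapping of pinned GPU memory:
 *
 *   gdr_t g = wrap_gdr_open();
 *   gdr_mh_t mh;  wrap_gdr_pin_buffer(g, gpuAddr, size, 0, 0, &mh);
 *   void* va = NULL;  wrap_gdr_map(g, mh, &va, size);   // wrap_gdr_get_info() is usually
 *                                                       // consulted to adjust for the mapping offset
 *   wrap_gdr_copy_to_mapping(mh, va, hostSrc, size);    // host -> GPU through the BAR mapping
 *   wrap_gdr_unmap(g, mh, va, size);  wrap_gdr_unpin_buffer(g, mh);  wrap_gdr_close(g);
 */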
ncclResult_t wrap_gdr_close(gdr_t g) { if (gdr_internal_close == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret = gdr_internal_close(g); if (ret != 0) { WARN("gdr_close() failed: %d", ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { if (gdr_internal_pin_buffer == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret); if (ret != 0) { WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { if (gdr_internal_unpin_buffer == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_unpin_buffer(g, handle), ret); if (ret != 0) { WARN("gdr_unpin_buffer(handle %lx) failed: %d", handle.h, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { if (gdr_internal_get_info == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_get_info(g, handle, info), ret); if (ret != 0) { WARN("gdr_get_info(handle %lx) failed: %d", handle.h, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { if (gdr_internal_map == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret); if (ret != 0) { WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { if (gdr_internal_unmap == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret); if (ret != 0) { WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) { if (gdr_internal_runtime_get_version == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } gdr_internal_runtime_get_version(major, minor); return ncclSuccess; } ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { if (gdr_internal_driver_get_version == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } gdr_internal_driver_get_version(g, major, minor); return ncclSuccess; } ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { if (gdr_internal_copy_to_mapping == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; } int ret; GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret); if (ret != 0) { WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret); return ncclSystemError; } return ncclSuccess; } ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { if (gdr_internal_copy_from_mapping == NULL) { WARN("GDRCOPY lib wrapper not initialized."); return ncclInternalError; 
} int ret; GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret); if (ret != 0) { WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret); return ncclSystemError; } return ncclSuccess; } #endif /* !GDR_DIRECT */ nccl-2.18.3-1/src/misc/ibvsymbols.cc000066400000000000000000000157661444201535000171210ustar00rootroot00000000000000#include #include #include "ibvsymbols.h" #ifdef NCCL_BUILD_RDMA_CORE /* RDMA-core linking mode. Symbols are pointers to linked IB Verbs */ #define ASSIGN_SYM(container, symbol, name) container->name= &symbol; // Passthrough function for ibv_reg_mr macro in verbs.h struct ibv_mr* ibv_internal_reg_mr( struct ibv_pd* pd, void* addr, size_t length, int access) { return ibv_reg_mr(pd, addr, length, access); } // Passthrough function for ibv_internal_query_port macro in verbs.h int ibv_internal_query_port( struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) { return ibv_query_port(context, port_num, port_attr); } ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { ASSIGN_SYM(ibvSymbols, ibv_get_device_list, ibv_internal_get_device_list); ASSIGN_SYM(ibvSymbols, ibv_free_device_list, ibv_internal_free_device_list); ASSIGN_SYM(ibvSymbols, ibv_get_device_name, ibv_internal_get_device_name); ASSIGN_SYM(ibvSymbols, ibv_open_device, ibv_internal_open_device); ASSIGN_SYM(ibvSymbols, ibv_close_device, ibv_internal_close_device); ASSIGN_SYM(ibvSymbols, ibv_get_async_event, ibv_internal_get_async_event); ASSIGN_SYM(ibvSymbols, ibv_ack_async_event, ibv_internal_ack_async_event); ASSIGN_SYM(ibvSymbols, ibv_query_device, ibv_internal_query_device); ASSIGN_SYM(ibvSymbols, ibv_query_gid, ibv_internal_query_gid); ASSIGN_SYM(ibvSymbols, ibv_query_qp, ibv_internal_query_qp); ASSIGN_SYM(ibvSymbols, ibv_alloc_pd, ibv_internal_alloc_pd); ASSIGN_SYM(ibvSymbols, ibv_dealloc_pd, ibv_internal_dealloc_pd); ASSIGN_SYM(ibvSymbols, ibv_reg_mr_iova2, ibv_internal_reg_mr_iova2); ASSIGN_SYM(ibvSymbols, ibv_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr); ASSIGN_SYM(ibvSymbols, ibv_dereg_mr, ibv_internal_dereg_mr); ASSIGN_SYM(ibvSymbols, ibv_create_cq, ibv_internal_create_cq); ASSIGN_SYM(ibvSymbols, ibv_destroy_cq, ibv_internal_destroy_cq); ASSIGN_SYM(ibvSymbols, ibv_create_qp, ibv_internal_create_qp); ASSIGN_SYM(ibvSymbols, ibv_modify_qp, ibv_internal_modify_qp); ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp); ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init); ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str); ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr; ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port; return ncclSuccess; } #else /* RDMA-core dynamic loading mode. Symbols are loaded from shared objects. 
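Core entry points are resolved with dlvsym() against the versioned IBVERBS_1.1 symbols; newer APIs such as ibv_reg_mr_iova2 and ibv_reg_dmabuf_mr are cherry-picked from later symbol versions and may legitimately be absent.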
*/ #include #include "core.h" // IBVERBS Library versioning #define IBVERBS_VERSION "IBVERBS_1.1" ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) { static void* ibvhandle = NULL; void* tmp; void** cast; ibvhandle=dlopen("libibverbs.so", RTLD_NOW); if (!ibvhandle) { ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW); if (!ibvhandle) { INFO(NCCL_INIT, "Failed to open libibverbs.so[.1]"); goto teardown; } } #define LOAD_SYM(handle, symbol, funcptr) do { \ cast = (void**)&funcptr; \ tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \ if (tmp == NULL) { \ WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \ goto teardown; \ } \ *cast = tmp; \ } while (0) // Attempt to load a specific symbol version - fail silently #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ cast = (void**)&funcptr; \ *cast = dlvsym(handle, symbol, version); \ } while (0) LOAD_SYM(ibvhandle, "ibv_get_device_list", ibvSymbols->ibv_internal_get_device_list); LOAD_SYM(ibvhandle, "ibv_free_device_list", ibvSymbols->ibv_internal_free_device_list); LOAD_SYM(ibvhandle, "ibv_get_device_name", ibvSymbols->ibv_internal_get_device_name); LOAD_SYM(ibvhandle, "ibv_open_device", ibvSymbols->ibv_internal_open_device); LOAD_SYM(ibvhandle, "ibv_close_device", ibvSymbols->ibv_internal_close_device); LOAD_SYM(ibvhandle, "ibv_get_async_event", ibvSymbols->ibv_internal_get_async_event); LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibvSymbols->ibv_internal_ack_async_event); LOAD_SYM(ibvhandle, "ibv_query_device", ibvSymbols->ibv_internal_query_device); LOAD_SYM(ibvhandle, "ibv_query_port", ibvSymbols->ibv_internal_query_port); LOAD_SYM(ibvhandle, "ibv_query_gid", ibvSymbols->ibv_internal_query_gid); LOAD_SYM(ibvhandle, "ibv_query_qp", ibvSymbols->ibv_internal_query_qp); LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibvSymbols->ibv_internal_alloc_pd); LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibvSymbols->ibv_internal_dealloc_pd); LOAD_SYM(ibvhandle, "ibv_reg_mr", ibvSymbols->ibv_internal_reg_mr); // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibvSymbols->ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); // Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12 LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibvSymbols->ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12"); LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibvSymbols->ibv_internal_dereg_mr); LOAD_SYM(ibvhandle, "ibv_create_cq", ibvSymbols->ibv_internal_create_cq); LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibvSymbols->ibv_internal_destroy_cq); LOAD_SYM(ibvhandle, "ibv_create_qp", ibvSymbols->ibv_internal_create_qp); LOAD_SYM(ibvhandle, "ibv_modify_qp", ibvSymbols->ibv_internal_modify_qp); LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibvSymbols->ibv_internal_destroy_qp); LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init); LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str); return ncclSuccess; teardown: ibvSymbols->ibv_internal_get_device_list = NULL; ibvSymbols->ibv_internal_free_device_list = NULL; ibvSymbols->ibv_internal_get_device_name = NULL; ibvSymbols->ibv_internal_open_device = NULL; ibvSymbols->ibv_internal_close_device = NULL; ibvSymbols->ibv_internal_get_async_event = NULL; ibvSymbols->ibv_internal_ack_async_event = NULL; ibvSymbols->ibv_internal_query_device = NULL; ibvSymbols->ibv_internal_query_port = NULL; ibvSymbols->ibv_internal_query_gid = NULL; ibvSymbols->ibv_internal_query_qp = NULL; ibvSymbols->ibv_internal_alloc_pd = NULL; 
ibvSymbols->ibv_internal_dealloc_pd = NULL; ibvSymbols->ibv_internal_reg_mr = NULL; ibvSymbols->ibv_internal_reg_mr_iova2 = NULL; ibvSymbols->ibv_internal_reg_dmabuf_mr = NULL; ibvSymbols->ibv_internal_dereg_mr = NULL; ibvSymbols->ibv_internal_create_cq = NULL; ibvSymbols->ibv_internal_destroy_cq = NULL; ibvSymbols->ibv_internal_create_qp = NULL; ibvSymbols->ibv_internal_modify_qp = NULL; ibvSymbols->ibv_internal_destroy_qp = NULL; ibvSymbols->ibv_internal_fork_init = NULL; ibvSymbols->ibv_internal_event_type_str = NULL; if (ibvhandle != NULL) dlclose(ibvhandle); return ncclSystemError; } #endif nccl-2.18.3-1/src/misc/ibvwrap.cc000066400000000000000000000213651444201535000163720ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "ibvwrap.h" #include #include #include "ibvsymbols.h" static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t initResult; struct ncclIbvSymbols ibvSymbols; ncclResult_t wrap_ibv_symbols(void) { pthread_once(&initOnceControl, [](){ initResult = buildIbvSymbols(&ibvSymbols); }); return initResult; } /* CHECK_NOT_NULL: helper macro to check for NULL symbol */ #define CHECK_NOT_NULL(container, internal_name) \ if (container.internal_name == NULL) { \ WARN("lib wrapper not initialized."); \ return ncclInternalError; \ } #define IBV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ retval = container.call; \ if (retval == error_retval) { \ WARN("Call to " name " failed with error %s", strerror(errno)); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_PTR_CHECK(container, internal_name, call, retval, error_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ retval = container.call; \ if (retval == error_retval) { \ WARN("Call to " name " failed"); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ int ret = container.call; \ if (ret != success_retval) { \ WARN("Call to " name " failed with error %s", strerror(ret)); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_INT_CHECK(container, internal_name, call, error_retval, name) \ CHECK_NOT_NULL(container, internal_name); \ int ret = container.call; \ if (ret == error_retval) { \ WARN("Call to " name " failed"); \ return ncclSystemError; \ } \ return ncclSuccess; #define IBV_PASSTHRU(container, internal_name, call) \ CHECK_NOT_NULL(container, internal_name); \ container.call; \ return ncclSuccess; ncclResult_t wrap_ibv_fork_init() { IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init"); } ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) { *ret = ibvSymbols.ibv_internal_get_device_list(num_devices); if (*ret == NULL) *num_devices = 0; return ncclSuccess; } ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) { IBV_PASSTHRU(ibvSymbols, ibv_internal_free_device_list, ibv_internal_free_device_list(list)); } const char *wrap_ibv_get_device_name(struct ibv_device *device) { if (ibvSymbols.ibv_internal_get_device_name == NULL) { WARN("lib wrapper not initialized."); exit(-1); } return ibvSymbols.ibv_internal_get_device_name(device); } 
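/*
 * Usage sketch (hypothetical caller; error handling via NCCLCHECK only, buf/len are
 * placeholder names). A caller typically chains the wrappers below as:
 *
 *   NCCLCHECK(wrap_ibv_symbols());
 *   struct ibv_device** devs;  int ndevs;
 *   NCCLCHECK(wrap_ibv_get_device_list(&devs, &ndevs));
 *   struct ibv_context* ctx;   NCCLCHECK(wrap_ibv_open_device(&ctx, devs[0]));
 *   struct ibv_pd* pd;         NCCLCHECK(wrap_ibv_alloc_pd(&pd, ctx));
 *   struct ibv_mr* mr;         NCCLCHECK(wrap_ibv_reg_mr(&mr, pd, buf, len,
 *                                        IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
 */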
ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/ IBV_PTR_CHECK(ibvSymbols, ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device"); } ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/ IBV_INT_CHECK(ibvSymbols, ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device"); } ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/ IBV_INT_CHECK(ibvSymbols, ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event"); } ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) { IBV_PASSTHRU(ibvSymbols, ibv_internal_ack_async_event, ibv_internal_ack_async_event(event)); } ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device"); } ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); } ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid"); } ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp"); } ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd"); } ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd"); } ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr"); } struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { if (ibvSymbols.ibv_internal_reg_mr == NULL) { WARN("lib wrapper not initialized."); return NULL; } return ibvSymbols.ibv_internal_reg_mr(pd, addr, length, access); } ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { if (ibvSymbols.ibv_internal_reg_mr_iova2 == NULL) { return ncclInternalError; } if (ret == NULL) { return ncclSuccess; } // Assume dummy call IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); } /* DMA-BUF support */ ncclResult_t 
wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr"); } struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) { if (ibvSymbols.ibv_internal_reg_dmabuf_mr == NULL) { errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set return NULL; } return ibvSymbols.ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access); } ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr"); } ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq"); } ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq"); } ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) { IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp"); } ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); } ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp"); } ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) { *ret = (char *) ibvSymbols.ibv_internal_event_type_str(event); return ncclSuccess; } nccl-2.18.3-1/src/misc/ipcsocket.cc000066400000000000000000000125131444201535000166770ustar00rootroot00000000000000/* * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. * * See COPYRIGHT for license information */ #include "ipcsocket.h" #include "utils.h" #include #include #include // Enable Linux abstract socket naming #define USE_ABSTRACT_SOCKET #define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx" /* * Create a Unix Domain Socket */ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) { int fd = -1; struct sockaddr_un cliaddr; char temp[NCCL_IPC_SOCKNAME_LEN] = ""; if (handle == NULL) { return ncclInternalError; } handle->fd = -1; handle->socketName[0] = '\0'; if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) { WARN("UDS: Socket creation error : %d", errno); return ncclSystemError; } bzero(&cliaddr, sizeof(cliaddr)); cliaddr.sun_family = AF_UNIX; // Create unique name for the socket. int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); if (len > (sizeof(cliaddr.sun_path) - 1)) { WARN("UDS: Cannot bind provided name to socket. 
Name too large"); return ncclInternalError; } #ifndef USE_ABSTRACT_SOCKET unlink(temp); #endif TRACE(NCCL_INIT, "UDS: Creating socket %s", temp); strncpy(cliaddr.sun_path, temp, len); #ifdef USE_ABSTRACT_SOCKET cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick #endif if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) { WARN("UDS: Binding to socket %s failed : %d", temp, errno); close(fd); return ncclSystemError; } handle->fd = fd; strcpy(handle->socketName, temp); handle->abortFlag = abortFlag; // Mark socket as non-blocking if (handle->abortFlag) { int flags; EQCHECK(flags = fcntl(fd, F_GETFL), -1); SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); } return ncclSuccess; } ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { if (handle == NULL) { return ncclInternalError; } if (handle->fd <= 0) { return ncclSuccess; } #ifndef USE_ABSTRACT_SOCKET if (handle->socketName[0] != '\0') { unlink(handle->socketName); } #endif close(handle->fd); return ncclSuccess; } ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; struct iovec iov[1]; // Union to guarantee alignment requirements for control array union { struct cmsghdr cm; char control[CMSG_SPACE(sizeof(int))]; } control_un; struct cmsghdr *cmptr; char dummy_buffer[1]; int ret; msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); iov[0].iov_base = (void *)dummy_buffer; iov[0].iov_len = sizeof(dummy_buffer); msg.msg_iov = iov; msg.msg_iovlen = 1; while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) { if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { WARN("UDS: Receiving data over socket failed : %d", errno); return ncclSystemError; } if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; } if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { WARN("UDS: Receiving data over socket failed"); return ncclSystemError; } memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); } else { WARN("UDS: Receiving data over socket %s failed", handle->socketName); return ncclSystemError; } TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); return ncclSuccess; } ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { struct msghdr msg; struct iovec iov[1]; char temp[NCCL_IPC_SOCKNAME_LEN]; union { struct cmsghdr cm; char control[CMSG_SPACE(sizeof(int))]; } control_un; struct cmsghdr *cmptr; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to bzero(&cliaddr, sizeof(cliaddr)); cliaddr.sun_family = AF_UNIX; int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); if (len > (sizeof(cliaddr.sun_path) - 1)) { WARN("UDS: Cannot connect to provided name for socket. 
Name too large"); return ncclInternalError; } (void) strncpy(cliaddr.sun_path, temp, len); TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); #ifdef USE_ABSTRACT_SOCKET cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick #endif msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); cmptr = CMSG_FIRSTHDR(&msg); cmptr->cmsg_len = CMSG_LEN(sizeof(int)); cmptr->cmsg_level = SOL_SOCKET; cmptr->cmsg_type = SCM_RIGHTS; memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); msg.msg_name = (void *)&cliaddr; msg.msg_namelen = sizeof(struct sockaddr_un); iov[0].iov_base = (void *)""; iov[0].iov_len = 1; msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_flags = 0; ssize_t sendResult; while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) { if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { WARN("UDS: Sending data over socket %s failed : %d", temp, errno); return ncclSystemError; } if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; } return ncclSuccess; } nccl-2.18.3-1/src/misc/nvmlwrap.cc000066400000000000000000000251751444201535000165710ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "nvmlwrap.h" #include "checks.h" #include "debug.h" #include #include #include int ncclNvmlDeviceCount = 0; ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; #if NCCL_NVML_DIRECT #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name; #else #include #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr; #endif namespace { NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ()) NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ()) NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ()) NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*)) NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*)) NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device)) NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device)) NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index)) NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r)) NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive)) NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci)) NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult)) NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor)) NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus)) NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values)) std::mutex lock; // NVML has had some thread safety bugs bool initialized = false; thread_local bool threadInitialized = false; ncclResult_t initResult; } ncclResult_t ncclNvmlEnsureInitialized() { // Optimization to avoid repeatedly 
grabbing the lock when we only want to // read from the global tables. if (threadInitialized) return initResult; threadInitialized = true; std::lock_guard locked(lock); if (initialized) return initResult; initialized = true; #if !NCCL_NVML_DIRECT if (pfn_nvmlInit == nullptr) { void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW); if (libhandle == nullptr) { WARN("Failed to open libnvidia-ml.so.1"); initResult = ncclSystemError; return initResult; } struct Symbol { void **ppfn; char const *name; }; std::initializer_list symbols = { {(void**)&pfn_nvmlInit, "nvmlInit"}, {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"}, {(void**)&pfn_nvmlShutdown, "nvmlShutdown"}, {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"}, {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"}, {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"}, {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"}, {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"}, {(void**)&pfn_nvmlErrorString, "nvmlErrorString"}, {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"}, {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"}, {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"}, {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"}, {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"}, {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"} }; for(Symbol sym: symbols) { *sym.ppfn = dlsym(libhandle, sym.name); } } #endif #if NCCL_NVML_DIRECT bool have_v2 = true; #else bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null #endif nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)(); if (res1 != NVML_SUCCESS) { WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } unsigned int ndev; res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? 
"_v2" :"", pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } ncclNvmlDeviceCount = int(ndev); if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) { WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices); initResult = ncclInternalError; return initResult; } for(int a=0; a < ncclNvmlDeviceCount; a++) { res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } } for(int a=0; a < ncclNvmlDeviceCount; a++) { for(int b=0; b < ncclNvmlDeviceCount; b++) { nvmlDevice_t da = ncclNvmlDevices[a].handle; nvmlDevice_t db = ncclNvmlDevices[b].handle; res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite); if (res1 != NVML_SUCCESS) { WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1)); initResult = ncclSystemError; return initResult; } } } initResult = ncclSuccess; return initResult; } #define NVMLCHECK(name, ...) do { \ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ if (e44241808 != NVML_SUCCESS) { \ WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ return ncclSystemError; \ } \ } while(0) #define NVMLTRY(name, ...) 
do { \ if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \ return ncclInternalError; /* missing symbol is not a warned error */ \ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ if (e44241808 != NVML_SUCCESS) { \ if (e44241808 != NVML_ERROR_NOT_SUPPORTED) \ INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ return ncclSystemError; \ } \ } while(0) ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { NCCLCHECK(ncclNvmlEnsureInitialized()); *device = ncclNvmlDevices[index].handle; return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { NCCLCHECK(ncclNvmlEnsureInitialized()); for (int d=0; d < ncclNvmlDeviceCount; d++) { if (ncclNvmlDevices[d].handle == device) { *index = d; return ncclSuccess; } } return ncclInvalidArgument; } ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetNvLinkCapability( nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult ) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult); return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { NCCLCHECK(ncclNvmlEnsureInitialized()); for(int d=0; d < ncclNvmlDeviceCount; d++) { if(device == ncclNvmlDevices[d].handle) { *major = ncclNvmlDevices[d].computeCapabilityMajor; *minor = ncclNvmlDevices[d].computeCapabilityMinor; return ncclSuccess; } } return ncclInvalidArgument; } ncclResult_t ncclNvmlDeviceGetP2PStatus( nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus ) { NCCLCHECK(ncclNvmlEnsureInitialized()); if (p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) { int a = -1, b = -1; for(int d=0; d < ncclNvmlDeviceCount; d++) { if(device1 == ncclNvmlDevices[d].handle) a = d; if(device2 == ncclNvmlDevices[d].handle) b = d; } if (a == -1 || b == -1) return ncclInvalidArgument; if (p2pIndex == NVML_P2P_CAPS_INDEX_READ) *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusRead; else *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusWrite; } else { std::lock_guard locked(lock); NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus); } return ncclSuccess; } ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values); return ncclSuccess; } nccl-2.18.3-1/src/misc/param.cc000066400000000000000000000044171444201535000160170ustar00rootroot00000000000000/************************************************************************* * 
Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "param.h" #include "debug.h" #include #include #include #include #include #include #include #include #include const char* userHomeDir() { struct passwd *pwUser = getpwuid(getuid()); return pwUser == NULL ? NULL : pwUser->pw_dir; } void setEnvFile(const char* fileName) { FILE * file = fopen(fileName, "r"); if (file == NULL) return; char *line = NULL; char envVar[1024]; char envValue[1024]; size_t n = 0; ssize_t read; while ((read = getline(&line, &n, file)) != -1) { if (line[read-1] == '\n') line[read-1] = '\0'; int s=0; // Env Var Size while (line[s] != '\0' && line[s] != '=') s++; if (line[s] == '\0') continue; strncpy(envVar, line, std::min(1023,s)); envVar[s] = '\0'; s++; strncpy(envValue, line+s, 1023); envValue[1023]='\0'; setenv(envVar, envValue, 0); //printf("%s : %s->%s\n", fileName, envVar, envValue); } if (line) free(line); fclose(file); } void initEnv() { char confFilePath[1024]; const char * userDir = userHomeDir(); if (userDir) { sprintf(confFilePath, "%s/.nccl.conf", userDir); setEnvFile(confFilePath); } sprintf(confFilePath, "/etc/nccl.conf"); setEnvFile(confFilePath); } void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&mutex); if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { char* str = getenv(env); int64_t value = deftVal; if (str && strlen(str) > 0) { errno = 0; value = strtoll(str, nullptr, 0); if (errno) { value = deftVal; INFO(NCCL_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); } else { INFO(NCCL_ENV,"%s set by environment to %lld.", env, (long long)value); } } __atomic_store_n(cache, value, __ATOMIC_RELAXED); } pthread_mutex_unlock(&mutex); } nccl-2.18.3-1/src/misc/profiler.cc000066400000000000000000000112241444201535000165330ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "profiler.h" //#define PROFILE_PROXY 1 #ifdef PROFILE_PROXY #include "timer.h" #include "alloc.h" static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" }; static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" }; static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" }; struct ncclProxyProfileEvent { double timestamp[6]; uint64_t opCount; int peer; int step; uint16_t channel; uint8_t type; // send / recv uint8_t opIndex; }; struct ncclProxyProfileEvent* profilingEvents = NULL; int profilingIndex = 0; double profilingStart = 0; #define MAX_EVENTS 200000 ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { if (profilingEvents == NULL) { NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS)); profilingStart = gettime(); } struct ncclProxyProfileEvent* event = NULL; if (state%8 == 0) { if (profilingIndex == MAX_EVENTS) return ncclSuccess; args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++; if (state == ncclProxyProfileBegin) { // Proxy operation information event->opCount = args->opCount; event->channel = args->subs[sub].channelId; event->peer = args->subs[sub].peer; event->type = args->pattern; event->step = step; event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256; } else event->peer = -state; } else { event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL; if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount; } // Timestamp event->timestamp[state%8] = gettime()-profilingStart; return ncclSuccess; } void ncclProfilingDump() { static int dumpDone = 0; if (dumpDone) return; dumpDone = 1; const char* str = getenv("NCCL_PROXY_PROFILE"); if (!str) { free(profilingEvents); return; } FILE* f = fopen(str, "w"); fprintf(f, "[\n"); for (int i=0; ipeer >= 0; const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") : profilingEventStr[-(e->peer/8)]; if (sendrecv) { int state = ncclProxyProfileBegin; const char** stateStr = e->type == ncclPatternSend ? 
profilingStateSendStr : profilingStateRecvStr; fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n", typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex); while (statetimestamp[state]) { const char* name = stateStr[state]; fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", name, i, e->channel, e->timestamp[state]); state++; while (e->timestamp[state] == 0) state++; fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", name, i, e->channel, e->timestamp[state]); } } fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]); } else { if (e->peer == -ncclProxyProfileAppend) { fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n", typeStr, i, e->timestamp[0], e->opCount); } else { fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", typeStr, i, e->timestamp[0]); } fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", typeStr, i, e->timestamp[1]); } } fprintf(f, "{} ]\n"); fclose(f); free(profilingEvents); } #else ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } void ncclProfilingDump() {} #endif nccl-2.18.3-1/src/misc/shmutils.cc000066400000000000000000000130021444201535000165550ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "shm.h" #include "checks.h" #include #include #include #include #include #include #include #include #include struct shmHandleInternal { int fd; char* shmPath; char* shmPtr; void* devShmPtr; size_t shmSize; size_t realShmSize; int* refcount; }; static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmSize, char* hptr, void* dptr, bool create, struct shmHandleInternal* handle) { handle->fd = fd; handle->shmPtr = hptr; handle->devShmPtr = dptr; handle->shmSize = shmSize; handle->realShmSize = realShmSize; handle->refcount = (hptr != NULL) ? (int*)(hptr + shmSize) : NULL; if (create) { int slen = strlen(shmPath); handle->shmPath = (char*)malloc(slen + 1); memcpy(handle->shmPath, shmPath, slen + 1); if (hptr) memset(hptr, 0, shmSize); } else { handle->shmPath = NULL; } return; } ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) { int fd = -1; char* hptr = NULL; void* dptr = NULL; ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle; bool create = refcount > 0 ? true : false; const size_t refSize = sizeof(int); /* extra sizeof(int) bytes for reference count */ const size_t realShmSize = shmSize + refSize; *handle = *shmPtr = NULL; /* assume shmPtr and handle always set correctly by users. 
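 * The mapping is laid out as [shmSize user bytes][int refcount]; the refcount lives at
 * hptr + shmSize, which is why realShmSize = shmSize + sizeof(int) below.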
*/ EQCHECKGOTO(tmphandle = (struct shmHandleInternal*)calloc(1, sizeof(struct shmHandleInternal)), NULL, ret, fail); if (create) { /* refcount > 0 means the caller tries to allocate a shared memory. This shared memory segment will have * refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it * goes down to 0, unlink should be called in order to delete shared memory file. */ if (shmPath[0] == '\0') { sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); fd = mkstemp(shmPath); } else { SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail); } if (ftruncate(fd, realShmSize) != 0) { WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize); ret = ncclSystemError; goto fail; } INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath); } else { SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail); } hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (hptr == MAP_FAILED) { WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno)); ret = ncclSystemError; hptr = NULL; goto fail; } if (create) { *(int*)(hptr + shmSize) = refcount; } else { int remref = ncclAtomicRefCountDecrement((int*)(hptr + shmSize)); if (remref == 0) { /* the last peer has completed attachment, it should unlink the shm mem file. */ if (unlink(shmPath) != 0) { WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno)); } } } if (devShmPtr) { CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail); CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail); } shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); exit: *shmPtr = hptr; if (devShmPtr) *devShmPtr = dptr; *handle = (ncclShmHandle_t)tmphandle; return ret; fail: WARN("Error while %s shared memory segment %s (size %ld)", create ? 
"creating" : "attaching to", shmPath, shmSize); if (tmphandle) { shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); ncclShmClose((ncclShmHandle_t)tmphandle); tmphandle = NULL; } hptr = NULL; dptr = NULL; goto exit; } ncclResult_t ncclShmClose(ncclShmHandle_t handle) { ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle; if (tmphandle) { if (tmphandle->fd >= 0) { close(tmphandle->fd); if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) { if (unlink(tmphandle->shmPath) != 0) { WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno)); ret = ncclSystemError; } } free(tmphandle->shmPath); } if (tmphandle->shmPtr) { if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr)); if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) { WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno)); ret = ncclSystemError; } } free(tmphandle); } return ret; } ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { ncclResult_t ret = ncclSuccess; struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle; if (tmphandle) { if (tmphandle->shmPath != NULL) { if (unlink(tmphandle->shmPath) != 0) { WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno)); ret = ncclSystemError; } free(tmphandle->shmPath); tmphandle->shmPath = NULL; } } return ret; } nccl-2.18.3-1/src/misc/socket.cc000066400000000000000000000734351444201535000162150ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "socket.h" #include "utils.h" #include #include #include #include static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { int bytes = 0; *closed = 0; char* data = (char*)ptr; char line[SOCKET_NAME_MAXLEN+1]; do { if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 
MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL); if (op == NCCL_SOCKET_RECV && bytes == 0) { *closed = 1; return ncclSuccess; } if (bytes == -1) { if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclRemoteError; } else { bytes = 0; } } (*offset) += bytes; if (sock->abortFlag && *sock->abortFlag != 0) { INFO(NCCL_NET, "socketProgressOpt: abort called"); return ncclInternalError; } } while (bytes > 0 && (*offset) < size); return ncclSuccess; } static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { int closed; NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed)); if (closed) { char line[SOCKET_NAME_MAXLEN+1]; WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); return ncclRemoteError; } return ncclSuccess; } static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { while (*offset < size) NCCLCHECK(socketProgress(op, sock, ptr, size, offset)); return ncclSuccess; } /* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo() * * Output: "IPv4/IPv6 address" */ const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { if (buf == NULL || addr == NULL) return NULL; struct sockaddr *saddr = &addr->sa; if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } char host[NI_MAXHOST], service[NI_MAXSERV]; /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. * (When not set, this will still happen in case the node's name cannot be determined.) */ int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0); (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag); sprintf(buf, "%s<%s>", host, service); return buf; } static uint16_t socketToPort(union ncclSocketAddress *addr) { struct sockaddr *saddr = &addr->sa; return ntohs(saddr->sa_family == AF_INET ? 
addr->sin.sin_port : addr->sin6.sin6_port); } /* Allow the user to force the IPv4/IPv6 interface selection */ static int envSocketFamily(void) { int family = -1; // Family selection is not forced, will use first one found char* env = getenv("NCCL_SOCKET_FAMILY"); if (env == NULL) return family; INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env); if (strcmp(env, "AF_INET") == 0) family = AF_INET; // IPv4 else if (strcmp(env, "AF_INET6") == 0) family = AF_INET6; // IPv6 return family; } static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; if (searchNot) prefixList++; bool searchExact = prefixList && prefixList[0] == '='; if (searchExact) prefixList++; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ int family = interface->ifa_addr->sa_family; if (family != AF_INET && family != AF_INET6) continue; TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line)); /* Allow the caller to force the socket family type */ if (sock_family != -1 && family != sock_family) continue; /* We also need to skip IPv6 loopback interfaces */ if (family == AF_INET6) { struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr); if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue; } // check against user specified interfaces if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } // Check that this interface has not already been saved // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link bool duplicate = false; for (int i = 0; i < found; i++) { if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } } if (!duplicate) { // Store the interface name strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); memcpy(addrs+found, interface->ifa_addr, salen); found++; } } freeifaddrs(interfaces); return found; } static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) { /* Check family first */ int family = local_if.ifa_addr->sa_family; if (family != remote->sa.sa_family) { return false; } if (family == AF_INET) { struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask); struct sockaddr_in& remote_addr = remote->sin; struct in_addr local_subnet, remote_subnet; local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; return (local_subnet.s_addr ^ remote_subnet.s_addr) ? 
false : true; } else if (family == AF_INET6) { struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); struct sockaddr_in6& remote_addr = remote->sin6; struct in6_addr& local_in6 = local_addr->sin6_addr; struct in6_addr& mask_in6 = mask->sin6_addr; struct in6_addr& remote_in6 = remote_addr.sin6_addr; bool same = true; int len = 16; //IPv6 address is 16 unsigned char for (int c = 0; c < len; c++) { //Network byte order is big-endian char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c]; char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c]; if (c1 ^ c2) { same = false; break; } } // At last, we need to compare scope id // Two Link-type addresses can have the same subnet address even though they are not in the same scope // For Global type, this field is 0, so a comparison wouldn't matter same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); return same; } else { WARN("Net : Unsupported address family type"); return false; } } int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif char line_a[SOCKET_NAME_MAXLEN+1]; int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); for (interface = interfaces; interface && !found; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ int family = interface->ifa_addr->sa_family; if (family != AF_INET && family != AF_INET6) continue; // check against user specified interfaces if (!matchSubnet(*interface, remoteAddr)) { continue; } // Store the local IP address int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); memcpy(localAddrs+found, interface->ifa_addr, salen); // Store the interface name strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a)); found++; if (found == maxIfs) break; } if (found == 0) { WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a)); } freeifaddrs(interfaces); return found; } ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) { if (!(ip_port_pair && strlen(ip_port_pair) > 1)) { WARN("Net : string is null"); return ncclInvalidArgument; } bool ipv6 = ip_port_pair[0] == '['; /* Construct the sockaddress structure */ if (!ipv6) { struct netIf ni; // parse : string, expect one pair if (parseStringList(ip_port_pair, &ni, 1) != 1) { WARN("Net : No valid : pair found"); return ncclInvalidArgument; } struct addrinfo hints, *p; int rv; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) { WARN("Net : error encountered when getting address info : %s", gai_strerror(rv)); return ncclInvalidArgument; } // use the first if (p->ai_family == AF_INET) { struct sockaddr_in& sin = ua->sin; memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in)); sin.sin_family = AF_INET; // IPv4 //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address sin.sin_port = htons(ni.port); // port } else if (p->ai_family == AF_INET6) { struct sockaddr_in6& sin6 = ua->sin6; memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6)); sin6.sin6_family = AF_INET6; // IPv6 sin6.sin6_port = htons(ni.port); // port sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete sin6.sin6_scope_id = 0; // should be global scope, set to 0 } else { WARN("Net : unsupported IP family"); return ncclInvalidArgument; } freeaddrinfo(p); // all done with this structure } else { int i, j = -1, len = strlen(ip_port_pair); for (i = 1; i < len; i++) { if (ip_port_pair[i] == '%') j = i; if (ip_port_pair[i] == ']') break; } if (i == len) { WARN("Net : No valid [IPv6]:port pair found"); return ncclInvalidArgument; } bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ]; memset(ip_str, '\0', sizeof(ip_str)); memset(port_str, '\0', sizeof(port_str)); memset(if_name, '\0', sizeof(if_name)); strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1); strncpy(port_str, ip_port_pair+i+2, len-i-1); int port = atoi(port_str); if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name struct sockaddr_in6& sin6 = ua->sin6; sin6.sin6_family = AF_INET6; // IPv6 inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address sin6.sin6_port = htons(port); // port sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete sin6.sin6_scope_id = global_scope ? 
0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope } return ncclSuccess; } int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { static int shownIfName = 0; int nIfs = 0; // Allow user to force the INET socket family selection int sock_family = envSocketFamily(); // User specified interface char* env = getenv("NCCL_SOCKET_IFNAME"); if (env && strlen(env) > 1) { INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); // Specified by user : find or fail if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); } else { // Try to automatically pick the right one // Start with IB nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); // else see if we can get some hint from COMM ID if (nIfs == 0) { char* commId = getenv("NCCL_COMM_ID"); if (commId && strlen(commId) > 1) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); // Try to find interface that is in the same subnet as the IP in comm id union ncclSocketAddress idAddr; ncclSocketGetAddrFromString(&idAddr, commId); nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); } } // Then look for anything else (but not docker or lo) if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); // Finally look for docker, then lo. if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); } return nIfs; } ncclResult_t ncclSocketListen(struct ncclSocket* sock) { if (sock == NULL) { WARN("ncclSocketListen: pass NULL socket"); return ncclInvalidArgument; } if (sock->fd == -1) { WARN("ncclSocketListen: file descriptor is -1"); return ncclInvalidArgument; } if (socketToPort(&sock->addr)) { // Port is forced by env. Make sure we get the port. 
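  /* SO_REUSEADDR (plus SO_REUSEPORT where available) lets the listen socket rebind to a
   * user-forced port even if an earlier run left that port in TIME_WAIT; without it a
   * restart on the same port could fail with EADDRINUSE. */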
int opt = 1; #if defined(SO_REUSEPORT) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); #else SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); #endif } // addr port should be 0 (Any port) SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind"); /* Get the assigned Port */ socklen_t size = sock->salen; SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname"); #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line)); #endif /* Put the socket in listen mode * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn */ SYSCHECK(listen(sock->fd, 16384), "listen"); sock->state = ncclSocketStateReady; return ncclSuccess; } ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) { if (sock == NULL) { WARN("ncclSocketGetAddr: pass NULL socket"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateReady) return ncclInternalError; memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress)); return ncclSuccess; } static ncclResult_t socketTryAccept(struct ncclSocket* sock) { socklen_t socklen = sizeof(union ncclSocketAddress); sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen); if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; } else if (errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: Accept failed: %s", strerror(errno)); return ncclSystemError; } return ncclSuccess; } static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { uint64_t magic; enum ncclSocketType type; int received = 0; const int one = 1; SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); if (received == 0) return ncclSuccess; NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); if (magic != sock->magic) { WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic); close(sock->fd); sock->fd = -1; // Ignore spurious connection and accept again sock->state = ncclSocketStateAccepting; return ncclSuccess; } else { received = 0; NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received)); if (type != sock->type) { WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); sock->state = ncclSocketStateError; close(sock->fd); sock->fd = -1; return ncclInternalError; } else { sock->state = ncclSocketStateReady; } } return ncclSuccess; } static ncclResult_t socketStartConnect(struct ncclSocket* sock) { /* blocking/non-blocking connect() is determined by asyncFlag. 
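   * With a non-blocking fd, connect() normally returns -1 with errno EINPROGRESS; the
   * state machine then moves to ncclSocketStateConnectPolling and completion is detected
   * later by poll()ing for POLLOUT in socketPollConnect().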
*/ int ret = connect(sock->fd, &sock->addr.sa, sock->salen); if (ret == 0) { sock->state = ncclSocketStateConnected; return ncclSuccess; } else if (errno == EINPROGRESS) { sock->state = ncclSocketStateConnectPolling; return ncclSuccess; } else if (errno == ECONNREFUSED) { if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { sock->state = ncclSocketStateError; WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries); return ncclRemoteError; } usleep(SLEEP_INT); if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); return ncclSuccess; } else if (errno == ETIMEDOUT) { if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { sock->state = ncclSocketStateError; WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries); return ncclRemoteError; } usleep(SLEEP_INT); return ncclSuccess; } else { char line[SOCKET_NAME_MAXLEN+1]; sock->state = ncclSocketStateError; WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclSystemError; } } static ncclResult_t socketPollConnect(struct ncclSocket* sock) { struct pollfd pfd; int timeout = 1, ret; socklen_t rlen = sizeof(int); memset(&pfd, 0, sizeof(struct pollfd)); pfd.fd = sock->fd; pfd.events = POLLOUT; ret = poll(&pfd, 1, timeout); if (ret == 0 || (ret < 0 && errno == EINTR)) { return ncclSuccess; } else if (ret < 0) { WARN("socketPollConnect poll() failed with error %s", strerror(errno)); return ncclRemoteError; } else { EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0); } /* check socket status */ SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); if (ret == 0) { sock->state = ncclSocketStateConnected; } else if (ret == ECONNREFUSED) { if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { sock->state = ncclSocketStateError; WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries); return ncclRemoteError; } if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); usleep(SLEEP_INT); sock->state = ncclSocketStateConnecting; } else if (ret == ETIMEDOUT) { if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { sock->state = ncclSocketStateError; WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries); return ncclRemoteError; } usleep(SLEEP_INT); sock->state = ncclSocketStateConnecting; } else if (ret != EINPROGRESS) { sock->state = ncclSocketStateError; return ncclSystemError; } return ncclSuccess; } ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) { if (sock == NULL) { WARN("ncclSocketPollConnect: pass NULL socket"); return ncclInvalidArgument; } NCCLCHECK(socketPollConnect(sock)); return ncclSuccess; } static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) { int sent = 0; NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); if (sent == 0) return ncclSuccess; NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); sent = 0; NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); sock->state = ncclSocketStateReady; return ncclSuccess; } static ncclResult_t socketProgressState(struct ncclSocket* sock) { if (sock->state == ncclSocketStateAccepting) { NCCLCHECK(socketTryAccept(sock)); } if (sock->state == ncclSocketStateAccepted) { NCCLCHECK(socketFinalizeAccept(sock)); } if (sock->state == ncclSocketStateConnecting) { NCCLCHECK(socketStartConnect(sock)); } if 
(sock->state == ncclSocketStateConnectPolling) { NCCLCHECK(socketPollConnect(sock)); } if (sock->state == ncclSocketStateConnected) { NCCLCHECK(socketFinalizeConnect(sock)); } return ncclSuccess; } ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) { if (sock == NULL) { *running = 0; return ncclSuccess; } if (sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed) { WARN("ncclSocketReady: unexpected socket state %d", sock->state); return ncclRemoteError; } *running = (sock->state == ncclSocketStateReady) ? 1 : 0; if (*running == 0) { NCCLCHECK(socketProgressState(sock)); *running = (sock->state == ncclSocketStateReady) ? 1 : 0; } return ncclSuccess; } ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif const int one = 1; if (sock == NULL) { WARN("ncclSocketConnect: pass NULL socket"); return ncclInvalidArgument; } if (sock->fd == -1) { WARN("ncclSocketConnect: file descriptor is -1"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateInitialized) { WARN("ncclSocketConnect: wrong socket state %d", sock->state); if (sock->state == ncclSocketStateError) return ncclRemoteError; return ncclInternalError; } TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line)); SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); sock->state = ncclSocketStateConnecting; do { NCCLCHECK(socketProgressState(sock)); } while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || *sock->abortFlag == 0) && (sock->state == ncclSocketStateConnecting || sock->state == ncclSocketStateConnectPolling || sock->state == ncclSocketStateConnected)); if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError; switch (sock->state) { case ncclSocketStateConnecting: case ncclSocketStateConnectPolling: case ncclSocketStateConnected: case ncclSocketStateReady: return ncclSuccess; case ncclSocketStateError: return ncclSystemError; default: WARN("ncclSocketConnect: wrong socket state %d", sock->state); return ncclInternalError; } } ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) { ncclResult_t ret = ncclSuccess; if (listenSock == NULL || sock == NULL) { WARN("ncclSocketAccept: pass NULL socket"); ret = ncclInvalidArgument; goto exit; } if (listenSock->state != ncclSocketStateReady) { WARN("ncclSocketAccept: wrong socket state %d", listenSock->state); if (listenSock->state == ncclSocketStateError) ret = ncclSystemError; else ret = ncclInternalError; goto exit; } if (sock->acceptFd == -1) { memcpy(sock, listenSock, sizeof(struct ncclSocket)); sock->acceptFd = listenSock->fd; sock->state = ncclSocketStateAccepting; } do { NCCLCHECKGOTO(socketProgressState(sock), ret, exit); } while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || *sock->abortFlag == 0) && (sock->state == ncclSocketStateAccepting || sock->state == ncclSocketStateAccepted)); if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError; switch (sock->state) { case ncclSocketStateAccepting: case ncclSocketStateAccepted: case ncclSocketStateReady: ret = ncclSuccess; break; case ncclSocketStateError: ret = ncclSystemError; break; default: WARN("ncclSocketAccept: wrong socket state %d", sock->state); ret = ncclInternalError; break; } exit: return ret; } ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) { 
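  /* Only sets up the socket state machine: the fd is created here (when an address is
   * provided) but no connect()/accept() is performed; those are driven later by
   * ncclSocketConnect()/ncclSocketAccept() through socketProgressState(). */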
ncclResult_t ret = ncclSuccess; if (sock == NULL) goto exit; sock->timedOutRetries = 0; sock->refusedRetries = 0; sock->abortFlag = abortFlag; sock->asyncFlag = asyncFlag; sock->state = ncclSocketStateInitialized; sock->magic = magic; sock->type = type; sock->fd = -1; sock->acceptFd = -1; if (addr) { /* IPv4/IPv6 support */ int family; memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress)); family = sock->addr.sa.sa_family; if (family != AF_INET && family != AF_INET6) { char line[SOCKET_NAME_MAXLEN+1]; WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6); ret = ncclInternalError; goto fail; } sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* Connect to a hostname / port */ sock->fd = socket(family, SOCK_STREAM, 0); if (sock->fd == -1) { WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno)); ret = ncclSystemError; goto fail; } } else { memset(&sock->addr, 0, sizeof(union ncclSocketAddress)); } /* Set socket as non-blocking if async or if we need to be able to abort */ if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) { int flags; EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail); SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail); } exit: return ret; fail: goto exit; } ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { if (sock == NULL) { WARN("ncclSocketProgress: pass NULL socket"); return ncclInvalidArgument; } NCCLCHECK(socketProgress(op, sock, ptr, size, offset)); return ncclSuccess; } ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { if (sock == NULL) { WARN("ncclSocketWait: pass NULL socket"); return ncclInvalidArgument; } NCCLCHECK(socketWait(op, sock, ptr, size, offset)); return ncclSuccess; } ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) { int offset = 0; if (sock == NULL) { WARN("ncclSocketSend: pass NULL socket"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateReady) { WARN("ncclSocketSend: socket state (%d) is not ready", sock->state); return ncclInternalError; } NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset)); return ncclSuccess; } ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) { int offset = 0; if (sock == NULL) { WARN("ncclSocketRecv: pass NULL socket"); return ncclInvalidArgument; } if (sock->state != ncclSocketStateReady) { WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state); return ncclInternalError; } NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset)); return ncclSuccess; } // Receive or detect connection closed ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) { int offset = 0; if (sock == NULL) { WARN("ncclSocketTryRecv: pass NULL socket"); return ncclInvalidArgument; } *closed = 0; // Block until connection closes or nbytes received if (blocking) { while (offset < size) { NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); if (*closed) return ncclSuccess; } } else { NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); if (*closed) return ncclSuccess; // If any bytes were received, block waiting for the rest if (offset > 0) { while (offset < size) { NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 
0, closed)); if (*closed) return ncclSuccess; } // No bytes were received, return ncclInProgress } else { return ncclInProgress; } } return ncclSuccess; } ncclResult_t ncclSocketClose(struct ncclSocket* sock) { if (sock != NULL) { if (sock->fd >= 0) { /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful * connection close here. */ shutdown(sock->fd, SHUT_RDWR); close(sock->fd); } sock->state = ncclSocketStateClosed; sock->fd = -1; } return ncclSuccess; } ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) { if (sock == NULL) { WARN("ncclSocketGetFd: pass NULL socket"); return ncclInvalidArgument; } if (fd) *fd = sock->fd; return ncclSuccess; } ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) { if (sock == NULL) { WARN("ncclSocketGetFd: pass NULL socket"); return ncclInvalidArgument; } sock->fd = fd; return ncclSuccess; } nccl-2.18.3-1/src/misc/strongstream.cc000066400000000000000000000341101444201535000174400ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "strongstream.h" #include "cudawrap.h" #include "checks.h" #include "param.h" // Tracks the chain of graph nodes for a given graph captured identified by // its graph id. This state has to live for as long as captured work is being // submitted. CUDA doesn't have mechanism to inform us when the user ends capture // so the best we can do is get notified when the graph is destroyed. struct ncclStrongStreamGraph { struct ncclStrongStreamGraph* next; // Atomically exchanged to false by both the main thread or the graph destructor // callback. The last to arrive deletes the node. bool alive; unsigned long long graphId; // For each graph we track the "tip" of the chain of graph nodes. A linear // chain would always have just one node at its tip, but since we have to merge // in chains from other streams (via ncclStrongStreamWaitStream) some spots // in the chain can be wider than a single node and thus need a list, so we // maintain a dynamically sized array of tip nodes. 
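  /* Invariant: tipNodes[0..tipCount) is the current frontier; tipCapacity is the
   * allocated length and only ever grows (via realloc in ensureTips), never shrinks. */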
int tipCount, tipCapacity; cudaGraphNode_t* tipNodes; }; static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) { free(g->tipNodes); free(g); } //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclCudaGetCapturingGraph( struct ncclCudaGraph* graph, cudaStream_t stream ) { #if CUDART_VERSION >= 10000 // cudaStreamGetCaptureInfo int driver; NCCLCHECK(ncclCudaDriverVersion(&driver)); if (CUDART_VERSION < 11030 || driver < 11030) { cudaStreamCaptureStatus status; unsigned long long gid; CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); #if CUDART_VERSION >= 11030 graph->graph = nullptr; graph->graphId = ULLONG_MAX; #endif if (status != cudaStreamCaptureStatusNone) { WARN("NCCL cannot be captured in a graph if either it wasn't built with CUDA runtime >= 11.3 or if the installed CUDA driver < R465."); return ncclInvalidUsage; } } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; unsigned long long gid; CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); if (status != cudaStreamCaptureStatusActive) { graph->graph = nullptr; gid = ULLONG_MAX; } graph->graphId = gid; #endif } #endif return ncclSuccess; } ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg) { #if CUDART_VERSION >= 11030 cudaUserObject_t object; CUDACHECK(cudaUserObjectCreate( &object, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync )); // Hand over ownership to CUDA Graph CUDACHECK(cudaGraphRetainUserObject(graph.graph, object, 1, cudaGraphUserObjectMove)); return ncclSuccess; #else return ncclInvalidUsage; #endif } //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking)); #if CUDART_VERSION >= 11030 CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); ss->everCaptured = false; ss->serialEventNeedsRecord = false; ss->graphHead = nullptr; #else CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming)); #endif return ncclSuccess; } static void graphDestructor(void* arg) { struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg; if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { // Last to arrive deletes list node. ncclStrongStreamGraphDelete(g); } } ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { CUDACHECK(cudaStreamDestroy(ss->cudaStream)); #if CUDART_VERSION >= 11030 CUDACHECK(cudaEventDestroy(ss->serialEvent)); // Delete list of per-graph chains. struct ncclStrongStreamGraph* g = ss->graphHead; while (g != nullptr) { struct ncclStrongStreamGraph* next = g->next; if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { // Last to arrive deletes list node. 
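      /* (The other contender for this exchange is the graphDestructor callback that was
       * registered with ncclCudaGraphAddDestructor in ncclStrongStreamAcquire.) */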
ncclStrongStreamGraphDelete(g); } g = next; } #else CUDACHECK(cudaEventDestroy(ss->scratchEvent)); #endif return ncclSuccess; } NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) static void ensureTips(struct ncclStrongStreamGraph* g, int n) { if (g->tipCapacity < n) { g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t)); g->tipCapacity = n; } } ncclResult_t ncclStrongStreamAcquire( struct ncclCudaGraph graph, struct ncclStrongStream* ss ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); if (graph.graph == nullptr) { if (mixing && ss->everCaptured) { CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); ss->serialEventNeedsRecord = false; } } else { ss->everCaptured = true; // Find the current graph in our list of graphs if it exists. struct ncclStrongStreamGraph** pg = &ss->graphHead; struct ncclStrongStreamGraph* g; while (*pg != nullptr) { g = *pg; if (g->graphId == graph.graphId) { // Move to front of list so that operations after acquire don't have to search the list. *pg = g->next; g->next = ss->graphHead; ss->graphHead = g; return ncclSuccess; } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) { // Unrelated graph that has been destroyed. Remove and delete. *pg = g->next; ncclStrongStreamGraphDelete(g); } else { pg = &g->next; } } // This is a new graph so add to the list. g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph)); g->graphId = graph.graphId; g->tipNodes = nullptr; g->tipCapacity = 0; g->tipCount = 0; g->next = ss->graphHead; ss->graphHead = g; g->alive = true; NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g)); if (mixing && ss->serialEventNeedsRecord) { // Can only be here if previous release was for uncaptured work that // elided updating the event because no capture had yet occurred. CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); } ss->serialEventNeedsRecord = false; // First node in the chain must be a wait on the serialEvent. if (mixing) { ensureTips(g, 1); CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent)); g->tipCount = 1; } else { g->tipCount = 0; } } #endif return ncclSuccess; } ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); if (mixing && ss->everCaptured) { CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); } ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream. 
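  /* The serialEvent itself is recorded lazily in ncclStrongStreamRelease(), and only once
   * a capture has ever been observed, so the common never-captured path pays no per-op
   * event-record cost. */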
#endif return ncclSuccess; } static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) { if (g == nullptr || g->graphId != id) { WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id); return ncclInternalError; } return ncclSuccess; } ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); if (mixing && ss->serialEventNeedsRecord) { if (graph.graph == nullptr) { if (ss->everCaptured) { CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); ss->serialEventNeedsRecord = false; } } else { struct ncclStrongStreamGraph* g = ss->graphHead; NCCLCHECK(checkGraphId(g, graph.graphId)); ensureTips(g, 1); CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent)); g->tipCount = 1; ss->serialEventNeedsRecord = false; } } #endif return ncclSuccess; } ncclResult_t ncclStrongStreamLaunchHost( struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); } else { cudaHostNodeParams p; p.fn = fn; p.userData = arg; struct ncclStrongStreamGraph* g = ss->graphHead; NCCLCHECK(checkGraphId(g, graph.graphId)); ensureTips(g, 1); CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); g->tipCount = 1; } ss->serialEventNeedsRecord = true; #else CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamLaunchKernel( struct ncclCudaGraph graph, struct ncclStrongStream* ss, void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); } else { cudaKernelNodeParams p; p.func = fn; p.gridDim = grid; p.blockDim = block; p.kernelParams = args; p.sharedMemBytes = sharedMemBytes; p.extra = nullptr; struct ncclStrongStreamGraph* g = ss->graphHead; NCCLCHECK(checkGraphId(g, graph.graphId)); ensureTips(g, 1); CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); g->tipCount = 1; } ss->serialEventNeedsRecord = true; #else CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); #endif return ncclSuccess; } // Merge node list `b` into list `a` but don't add duplicates. 
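// The duplicate scan is O(an*bn), which is acceptable because tip lists are expected to
// stay very small (a single node for a linear chain, a few when streams are merged).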
static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) { int an = a->tipCount; ensureTips(a, an + bn); for (int bi=0; bi < bn; bi++) { for (int ai=0; ai < an; ai++) { if (a->tipNodes[ai] == bNodes[bi]) goto next_b; } a->tipNodes[a->tipCount++] = bNodes[bi]; next_b:; } } ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { if (b->serialEventNeedsRecord) { b->serialEventNeedsRecord = false; CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); } CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0)); } else { struct ncclStrongStreamGraph* ag = a->graphHead; NCCLCHECK(checkGraphId(ag, graph.graphId)); struct ncclStrongStreamGraph* bg = b->graphHead; NCCLCHECK(checkGraphId(bg, graph.graphId)); if (b_subsumes_a) ag->tipCount = 0; mergeTips(ag, bg->tipNodes, bg->tipCount); } a->serialEventNeedsRecord = true; #else CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { // It is ok to use a->serialEvent to record b since we'll be setting // a->serialEventNeedsRecord so the event won't be considered accurate // until re-recorded. CUDACHECK(cudaEventRecord(a->serialEvent, b)); CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0)); } else { cudaStreamCaptureStatus status; unsigned long long bGraphId; cudaGraphNode_t const* bNodes; size_t bCount = 0; CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount)); if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) { WARN("Stream is not being captured by the expected graph."); return ncclInvalidUsage; } struct ncclStrongStreamGraph* ag = a->graphHead; NCCLCHECK(checkGraphId(ag, graph.graphId)); if (b_subsumes_a) ag->tipCount = 0; mergeTips(ag, bNodes, bCount); } a->serialEventNeedsRecord = true; #else CUDACHECK(cudaEventRecord(a->scratchEvent, b)); CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamWaitStream( struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a ) { #if CUDART_VERSION >= 11030 if (graph.graph == nullptr) { if (b->serialEventNeedsRecord) { b->serialEventNeedsRecord = false; CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); } CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0)); } else { struct ncclStrongStreamGraph* bg = b->graphHead; NCCLCHECK(checkGraphId(bg, graph.graphId)); CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, b_subsumes_a ? 
cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies )); } #else CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0)); #endif return ncclSuccess; } ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); ss->serialEventNeedsRecord = false; #endif CUDACHECK(cudaStreamSynchronize(ss->cudaStream)); return ncclSuccess; } nccl-2.18.3-1/src/misc/utils.cc000066400000000000000000000226251444201535000160600ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "utils.h" #include "core.h" #include "nvmlwrap.h" #include // Get current Compute Capability int ncclCudaCompCap() { int cudaDev; if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; int ccMajor, ccMinor; if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0; return ccMajor*10+ccMinor; } ncclResult_t int64ToBusId(int64_t id, char* busId) { sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); return ncclSuccess; } ncclResult_t busIdToInt64(const char* busId, int64_t* id) { char hexStr[17]; // Longest possible int64 hex string + null terminator. int hexOffset = 0; for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) { char c = busId[i]; if (c == '.' || c == ':') continue; if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) { hexStr[hexOffset++] = busId[i]; } else break; } hexStr[hexOffset] = '\0'; *id = strtol(hexStr, NULL, 16); return ncclSuccess; } // Convert a logical cudaDev index to the NVML device minor number ncclResult_t getBusId(int cudaDev, int64_t *busId) { // On most systems, the PCI bus ID comes back as in the 0000:00:00.0 // format. Still need to allocate proper space in case PCI domain goes // higher. char busIdStr[] = "00000000:00:00.0"; CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev)); NCCLCHECK(busIdToInt64(busIdStr, busId)); return ncclSuccess; } ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { if (gethostname(hostname, maxlen) != 0) { strncpy(hostname, "unknown", maxlen); return ncclSystemError; } int i = 0; while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++; hostname[i] = '\0'; return ncclSuccess; } uint64_t getHash(const char* string, int n) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (int c = 0; c < n; c++) { result = ((result << 5) + result) ^ string[c]; } return result; } /* Generate a hash of the unique identifying string for this host * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * * $(hostname)$(cat /proc/sys/kernel/random/boot_id) * * This string can be overridden by using the NCCL_HOSTID env var. 
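 * For example, ranks that export the same NCCL_HOSTID string (say NCCL_HOSTID=node0, an illustrative value) compute identical host hashes and are treated as running on the same host.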
*/ #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" uint64_t getHostHash(void) { char hostHash[1024]; char *hostId; // Fall back is the full hostname if something fails (void) getHostName(hostHash, sizeof(hostHash), '\0'); int offset = strlen(hostHash); if ((hostId = getenv("NCCL_HOSTID")) != NULL) { INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId); strncpy(hostHash, hostId, sizeof(hostHash)); } else { FILE *file = fopen(HOSTID_FILE, "r"); if (file != NULL) { char *p; if (fscanf(file, "%ms", &p) == 1) { strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); free(p); } } fclose(file); } // Make sure the string is terminated hostHash[sizeof(hostHash)-1]='\0'; TRACE(NCCL_INIT,"unique hostname '%s'", hostHash); return getHash(hostHash, strlen(hostHash)); } /* Generate a hash of the unique identifying string for this process * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * * $$ $(readlink /proc/self/ns/pid) */ uint64_t getPidHash(void) { char pname[1024]; // Start off with our pid ($$) sprintf(pname, "%ld", (long) getpid()); int plen = strlen(pname); int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen); if (len < 0) len = 0; pname[plen+len]='\0'; TRACE(NCCL_INIT,"unique PID '%s'", pname); return getHash(pname, strlen(pname)); } int parseStringList(const char* string, struct netIf* ifList, int maxList) { if (!string) return 0; const char* ptr = string; int ifNum = 0; int ifC = 0; char c; do { c = *ptr; if (c == ':') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; ifList[ifNum].port = atoi(ptr+1); ifNum++; ifC = 0; } while (c != ',' && c != '\0') c = *(++ptr); } else if (c == ',' || c == '\0') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; ifList[ifNum].port = -1; ifNum++; ifC = 0; } } else { ifList[ifNum].prefix[ifC] = c; ifC++; } ptr++; } while (ifNum < maxList && c); return ifNum; } static bool matchIf(const char* string, const char* ref, bool matchExact) { // Make sure to include '\0' in the exact case int matchLen = matchExact ? strlen(string) + 1 : strlen(ref); return strncmp(string, ref, matchLen) == 0; } static bool matchPort(const int port1, const int port2) { if (port1 == -1) return true; if (port2 == -1) return true; if (port1 == port2) return true; return false; } bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { // Make an exception for the case where no user list is defined if (listSize == 0) return true; for (int i=0; ihunks` points to the top of the stack non-empty hunks. Hunks above // this (reachable via `->above`) are empty. struct Hunk* top = me->topFrame.hunk; size_t mallocSize = 0; // If we have lots of space left in hunk but that wasn't enough then we'll // allocate the object unhunked. if (me->topFrame.end - me->topFrame.bumper >= 8<<10) goto unhunked; // If we have another hunk (which must be empty) waiting above this one and // the object fits then use that. if (top && top->above) { struct Hunk* top1 = top->above; uintptr_t uobj = (reinterpret_cast(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align); if (uobj + size <= reinterpret_cast(top1) + top1->size) { me->topFrame.hunk = top1; me->topFrame.bumper = uobj + size; me->topFrame.end = reinterpret_cast(top1) + top1->size; return reinterpret_cast(uobj); } } { // If the next hunk we're going to allocate wouldn't be big enough but the // Unhunk proxy fits in the current hunk then go allocate as unhunked. size_t nextSize = (top ? 
top->size : 0) + (64<<10); constexpr size_t maxAlign = 64; if (nextSize < sizeof(struct Hunk) + maxAlign + size) { uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end) goto unhunked; } // At this point we must need another hunk, either to fit the object // itself or its Unhunk proxy. mallocSize = nextSize; INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); struct Hunk *top1 = (struct Hunk*)malloc(mallocSize); if (top1 == nullptr) goto malloc_exhausted; top1->size = nextSize; top1->above = nullptr; if (top) top->above = top1; top = top1; me->topFrame.hunk = top; me->topFrame.end = reinterpret_cast(top) + nextSize; me->topFrame.bumper = reinterpret_cast(top) + sizeof(struct Hunk); } { // Try to fit object in the new top hunk. uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align); if (uobj + size <= me->topFrame.end) { me->topFrame.bumper = uobj + size; return reinterpret_cast(uobj); } } unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band // to keep track of it. uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); Unhunk* proxy = reinterpret_cast(uproxy); me->topFrame.bumper = uproxy + sizeof(Unhunk); proxy->next = me->topFrame.unhunks; me->topFrame.unhunks = proxy; mallocSize = size; proxy->obj = malloc(mallocSize); INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); if (proxy->obj == nullptr) goto malloc_exhausted; return proxy->obj; } malloc_exhausted: WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize); abort(); } void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { // Free unhunks first because both the frames and unhunk proxies lie within the hunks. struct ncclMemoryStack::Frame* f = &me->topFrame; while (f != nullptr) { struct ncclMemoryStack::Unhunk* u = f->unhunks; while (u != nullptr) { free(u->obj); u = u->next; } f = f->below; } // Free hunks struct ncclMemoryStack::Hunk* h = me->stub.above; while (h != nullptr) { struct ncclMemoryStack::Hunk *h1 = h->above; free(h); h = h1; } } nccl-2.18.3-1/src/nccl.h.in000066400000000000000000000427561444201535000151620ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_H_ #define NCCL_H_ #include #include #if CUDART_VERSION >= 11000 #include #endif #define NCCL_MAJOR ${nccl:Major} #define NCCL_MINOR ${nccl:Minor} #define NCCL_PATCH ${nccl:Patch} #define NCCL_SUFFIX "${nccl:Suffix}" #define NCCL_VERSION_CODE ${nccl:Version} #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? 
(X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #ifdef __cplusplus extern "C" { #endif #include /* Opaque handle to communicator */ typedef struct ncclComm* ncclComm_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; /* Error type */ typedef enum { ncclSuccess = 0, ncclUnhandledCudaError = 1, ncclSystemError = 2, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, ncclRemoteError = 6, ncclInProgress = 7, ncclNumResults = 8 } ncclResult_t; #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL #define NCCL_SPLIT_NOCOLOR -1 /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ typedef struct ncclConfig_v21700 { /* attributes that users should never touch. */ size_t size; unsigned int magic; unsigned int version; /* attributes that users are able to customize. */ int blocking; int cgaClusterSize; int minCTAs; int maxCTAs; const char *netName; int splitShare; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. * Not initialized config will result in NCCL error. */ #define NCCL_CONFIG_INITIALIZER { \ sizeof(ncclConfig_t), /* size */ \ 0xcafebeef, /* magic */ \ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ NCCL_CONFIG_UNDEF_INT, /* blocking */ \ NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library */ ncclResult_t ncclGetVersion(int *version); ncclResult_t pncclGetVersion(int *version); /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be * called once and the Id should be distributed to all ranks in the * communicator before calling ncclCommInitRank. */ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); /* Create a new communicator (multi thread/process version) with a configuration * set by users. */ ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); /* Creates a new communicator (multi thread/process version). * rank must be between 0 and nranks-1 and unique within a communicator clique. * Each rank is associated to a CUDA device, which has to be set before calling * ncclCommInitRank. * ncclCommInitRank implicitly syncronizes with other ranks, so it must be * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); /* Creates a clique of communicators (single process version). * This is a convenience function to create a single-process communicator clique. * Returns an array of ndev newly initialized communicators in comm. * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). * If devlist is NULL, the first ndev CUDA devices are used. * Order of devlist defines user-order of processors within the communicator. 
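 *
 * A minimal single-process sketch (assuming four visible CUDA devices; error
 * checking omitted):
 *
 *   ncclComm_t comms[4];
 *   int devs[4] = {0, 1, 2, 3};
 *   ncclCommInitAll(comms, 4, devs);  // or pass NULL to use devices 0..3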
*/ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); /* Finalize a communicator. ncclCommFinalize flushes all issued communications, * and marks communicator state as ncclInProgress. The state will change to ncclSuccess * when the communicator is globally quiescent and related resources are freed; then, * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator * itself) without blocking. */ ncclResult_t ncclCommFinalize(ncclComm_t comm); ncclResult_t pncclCommFinalize(ncclComm_t comm); /* Frees local resources associated with communicator object. */ ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); /* Frees resources associated with communicator object and aborts any operations * that might still be running on the device. */ ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); /* Creates one or more communicators from an existing one. * Ranks with the same color will end up in the same communicator. * Within the new communicator, key will be used to order ranks. * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group * and will therefore return a NULL communicator. * If config is NULL, the new communicator will inherit the original communicator's * configuration*/ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); /* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); /* Returns a human-readable message of the last error that occurred. * comm is currently unused and can be set to NULL */ const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); /* Gets the number of ranks in the communicator clique. */ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); /* Returns the cuda device number associated with the communicator. */ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); /* Returns the user-ordered "rank" associated with the communicator. */ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); /* Reduction operation selector */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; typedef enum { ncclSum = 0, ncclProd = 1, ncclMax = 2, ncclMin = 3, ncclAvg = 4, /* ncclNumOps: The number of built-in ncclRedOp_t values. Also * serves as the least possible value for dynamic ncclRedOp_t's * as constructed by ncclRedOpCreate*** functions. */ ncclNumOps = 5, /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. * It is defined to be the largest signed value (since compilers * are permitted to use signed enums) that won't grow * sizeof(ncclRedOp_t) when compared to previous NCCL versions to * maintain ABI compatibility. 
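 * (With a typical 4-byte enum, 8*sizeof(ncclRedOp_dummy_t) is 32, so this
 * expression evaluates to 0x7fffffff >> 0 == 0x7fffffff.)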
*/ ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) } ncclRedOp_t; /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclUint8 = 1, ncclInt32 = 2, ncclInt = 2, ncclUint32 = 3, ncclInt64 = 4, ncclUint64 = 5, ncclFloat16 = 6, ncclHalf = 6, ncclFloat32 = 7, ncclFloat = 7, ncclFloat64 = 8, ncclDouble = 8, #if defined(__CUDA_BF16_TYPES_EXIST__) ncclBfloat16 = 9, ncclNumTypes = 10 #else ncclNumTypes = 9 #endif } ncclDataType_t; /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ typedef enum { /* ncclScalarDevice: The scalar is in device-visible memory and will be * dereferenced while the collective is running. */ ncclScalarDevice = 0, /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be * dereferenced before the ncclRedOpCreate***() function returns. */ ncclScalarHostImmediate = 1 } ncclScalarResidence_t; /* * ncclRedOpCreatePreMulSum * * Creates a new reduction operator which pre-multiplies input values by a given * scalar locally before reducing them with peer values via summation. For use * only with collectives launched against *comm* and *datatype*. The * *residence* argument indicates how/when the memory pointed to by *scalar* * will be dereferenced. Upon return, the newly created operator's handle * is stored in *op*. */ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); /* * ncclRedOpDestroy * * Destroys the reduction operator *op*. The operator must have been created by * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be * destroyed as soon as the last NCCL function which is given that operator returns. */ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); /* * Collective communication operations * * Collective communication operations must be called separately for each * communicator in a communicator clique. * * They return when operations have been enqueued on the CUDA stream. * * Since they may perform inter-CPU synchronization, each call has to be done * from a different thread or process, or need to use Group Semantics (see * below). */ /* * Reduce * * Reduces data arrays of length count in sendbuff into recvbuff using op * operation. * recvbuff may be NULL on all calls except for root device. * root is the rank (not the CUDA device) where data will reside after the * operation is complete. * * In-place operation will happen if sendbuff == recvbuff. */ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); /* * (deprecated) Broadcast (in-place) * * Copies count values from root to all other devices. * root is the rank (not the CUDA device) where data resides before the * operation is started. * * This operation is implicitely in place. 
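 * (This behaves like calling ncclBroadcast below with sendbuff == recvbuff == buff.)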
*/ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); /* * Broadcast * * Copies count values from root to all other devices. * root is the rank (not the CUDA device) where data resides before the * operation is started. * * In-place operation will happen if sendbuff == recvbuff. */ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); /* * All-Reduce * * Reduces data arrays of length count in sendbuff using op operation, and * leaves identical copies of result on each recvbuff. * * In-place operation will happen if sendbuff == recvbuff. */ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); /* * Reduce-Scatter * * Reduces data in sendbuff using op operation and leaves reduced result * scattered over the devices so that recvbuff on rank i will contain the i-th * block of the result. * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff * should have a size of at least nranks*recvcount elements. * * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. */ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); /* * All-Gather * * Each device gathers sendcount values from other GPUs into recvbuff, * receiving data from rank i at offset i*sendcount. * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff * should have a size of at least nranks*sendcount elements. * * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. */ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); /* * Send * * Send data from sendbuff to rank peer. * * Rank peer needs to call ncclRecv with the same datatype and the same count from this * rank. * * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. */ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); /* * Receive * * Receive data from rank peer into recvbuff. * * Rank peer needs to call ncclSend with the same datatype and the same count to this * rank. * * This operation is blocking for the GPU. 
If multiple ncclSend and ncclRecv operations * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. */ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); /* * Group semantics * * When managing multiple GPUs from a single thread, and since NCCL collective * calls may perform inter-CPU synchronization, we need to "group" calls for * different ranks/devices into a single call. * * Grouping NCCL calls as being part of the same collective operation is done * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all * collective calls until the ncclGroupEnd call, which will wait for all calls * to be complete. Note that for collective communication, ncclGroupEnd only * guarantees that the operations are enqueued on the streams, not that * the operation is effectively done. * * Both collective communication and ncclCommInitRank can be used in conjunction * of ncclGroupStart/ncclGroupEnd, but not together. * * Group semantics also allow to fuse multiple operations on the same device * to improve performance (for aggregated collective calls), or to permit * concurrent progress of multiple send/receive operations. */ /* * Group Start * * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into * a single NCCL operation. Nothing will be started on the CUDA stream until * ncclGroupEnd. */ ncclResult_t ncclGroupStart(); ncclResult_t pncclGroupStart(); /* * Group End * * End a group call. Start a fused NCCL operation consisting of all calls since * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations * need to be called after ncclGroupEnd. 
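 *
 * A minimal sketch of a fused multi-device sequence (assuming comms[i],
 * streams[i] and per-device buffers were created beforehand, one per GPU):
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < ndev; i++)
 *     ncclAllReduce(sendbuff[i], recvbuff[i], count, ncclFloat, ncclSum,
 *                   comms[i], streams[i]);
 *   ncclGroupEnd();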
*/ ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); #ifdef __cplusplus } // end extern "C" #endif #endif // end include guard nccl-2.18.3-1/src/nccl.pc.in000077500000000000000000000004341444201535000153230ustar00rootroot00000000000000prefix=${nccl:Prefix} exec_prefix=${prefix} libdir=${exec_prefix}/lib includedir=${prefix}/include Name: nccl Description: Optimized primitives for collective multi-GPU communication Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} Libs: -L${libdir} -lnccl Cflags: -I${includedir} nccl-2.18.3-1/src/net.cc000066400000000000000000000350341444201535000145510ustar00rootroot00000000000000#include "net.h" #include "bootstrap.h" #include "checks.h" #include #include #include //#include //#include //#include static ncclNet_v6_t ncclNet_v4_as_v6; static ncclNet_v6_t ncclNet_v5_as_v6; static ncclNet_v4_t *ncclNet_v4; static ncclNet_v5_t *ncclNet_v5; static ncclCollNet_v6_t ncclCollNet_v4_as_v6; static ncclCollNet_v6_t ncclCollNet_v5_as_v6; static ncclCollNet_v4_t *ncclCollNet_v4; static ncclCollNet_v5_t *ncclCollNet_v5; static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { ncclNetProperties_v4_t p4; ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4); if (ans != ncclSuccess) return ans; props->name = p4.name; props->pciPath = p4.pciPath; props->guid = p4.guid; props->ptrSupport = p4.ptrSupport; props->speed = p4.speed; props->port = p4.port; props->maxComms = p4.maxComms; props->maxRecvs = 1; props->latency = 0; return ncclSuccess; } static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclNet_v4->isend(sendComm, data, size, mhandle, request); } static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { if (n == 0) return ncclSuccess; if (n != 1) return ncclInvalidArgument; return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request); } static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { if (n == 0) return ncclSuccess; if (n != 1) return ncclInvalidArgument; return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request); } // We use a wrapper around the v4 init to copy over the struct contents // post-init since they may not be initialized before hand. static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v4->init(logfn)); ncclNet_v4_as_v6.name = ncclNet_v4->name; ncclNet_v4_as_v6.devices = ncclNet_v4->devices; ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties; ncclNet_v4_as_v6.listen = ncclNet_v4->listen; ncclNet_v4_as_v6.connect = ncclNet_v4->connect; ncclNet_v4_as_v6.accept = ncclNet_v4->accept; ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr; ncclNet_v4_as_v6.regMrDmaBuf = NULL; ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr; ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend; ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv; ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush; ncclNet_v4_as_v6.test = ncclNet_v4->test; ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend; ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv; ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen; return ncclSuccess; } // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. 
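// Note: regMrDmaBuf only exists starting with the v6 API, so these compatibility
// shims leave it NULL; DMA-BUF registration is simply not exposed for v4/v5 plugins.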
static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v5->init(logfn)); ncclNet_v5_as_v6.name = ncclNet_v5->name; ncclNet_v5_as_v6.devices = ncclNet_v5->devices; ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties; ncclNet_v5_as_v6.listen = ncclNet_v5->listen; ncclNet_v5_as_v6.connect = ncclNet_v5->connect; ncclNet_v5_as_v6.accept = ncclNet_v5->accept; ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr; ncclNet_v5_as_v6.regMrDmaBuf = NULL; ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr; ncclNet_v5_as_v6.isend = ncclNet_v5->isend; ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv; ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush; ncclNet_v5_as_v6.test = ncclNet_v5->test; ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend; ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv; ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen; return ncclSuccess; } static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) { ncclNetProperties_v4_t p4; ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4); if (ans != ncclSuccess) return ans; props->name = p4.name; props->pciPath = p4.pciPath; props->guid = p4.guid; props->ptrSupport = p4.ptrSupport; props->speed = p4.speed; props->port = p4.port; props->maxComms = p4.maxComms; props->maxRecvs = 1; props->latency = 0; return ncclSuccess; } // We use a wrapper around the v4 init to copy over the struct contents // post-init since they may not be initialized before hand. static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v4->init(logfn)); ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices; ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties; ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen; ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect; ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport; ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr; ncclCollNet_v4_as_v6.regMrDmaBuf = NULL; ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr; ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce; ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush; ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test; ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl; ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen; return ncclSuccess; } // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. 
static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v5->init(logfn)); ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices; ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties; ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen; ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect; ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport; ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr; ncclCollNet_v5_as_v6.regMrDmaBuf = NULL; ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr; ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce; ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush; ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test; ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl; ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen; return ncclSuccess; } static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr }; enum ncclNetState { ncclNetStateInit = 0, ncclNetStateEnabled = 1, ncclNetStateDisabled = 2 }; enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; ncclResult_t ncclNetPluginInit() { char ncclNetPluginName[128]; const char* envPluginName = getenv("NCCL_NET_PLUGIN"); if (envPluginName && strlen(envPluginName)) { snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName); INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName); } else { sprintf(ncclNetPluginName, "libnccl-net.so"); } void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); if (netPluginLib == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror()); INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation"); return ncclSuccess; } ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); if (ncclNets[0] == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol."); // Try v5 plugin ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); if (ncclNet_v5 == nullptr) { ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); if (ncclNet_v4 == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5)."); if (netPluginLib != nullptr) dlclose(netPluginLib); return ncclSuccess; } ncclNets[0] = &ncclNet_v4_as_v6; ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init; // Set the name right away to allow for NCCL_NET=... to work ncclNet_v4_as_v6.name = ncclNet_v4->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name); } else { ncclNets[0] = &ncclNet_v5_as_v6; ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init; // Set the name right away to allow for NCCL_NET=... 
to work ncclNet_v5_as_v6.name = ncclNet_v5->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); } } // Check for CollNet ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); if (ncclCollNets[0] == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol."); ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); if (ncclCollNet_v5 == nullptr) { ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); if (ncclCollNet_v4 == nullptr) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5)."); } else { ncclCollNets[0] = &ncclCollNet_v4_as_v6; ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init; ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name); } } else { ncclCollNets[0] = &ncclCollNet_v5_as_v6; ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init; ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name); } } return ncclSuccess; } static ncclResult_t netGetState(int i, enum ncclNetState* state) { pthread_mutex_lock(&netLock); if (ncclNetStates[i] == ncclNetStateInit) { int ndev; if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; else ncclNetStates[i] = ncclNetStateEnabled; } *state = ncclNetStates[i]; pthread_mutex_unlock(&netLock); return ncclSuccess; } static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { if (ncclCollNetStates[i] == ncclNetStateInit) { int ndev; if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; else ncclCollNetStates[i] = ncclNetStateEnabled; } *state = ncclCollNetStates[i]; return ncclSuccess; } ncclResult_t ncclNetInit(struct ncclComm* comm) { // Initialize main communication network const char* netName; bool ok = false; netName = comm->config.netName; for (int i=0; i<3; i++) { if (ncclNets[i] == nullptr) continue; enum ncclNetState state; NCCLCHECK(netGetState(i, &state)); if (state != ncclNetStateEnabled) continue; if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; comm->ncclNet = ncclNets[i]; ok = true; if (ncclCollNets[i]) { NCCLCHECK(collNetGetState(i, &state)); if (state == ncclNetStateEnabled) { comm->ncclCollNet = ncclCollNets[i]; } } break; } if (!ok) { WARN("Error: network %s not found.", netName ? 
netName : ""); return ncclInvalidUsage; } return ncclSuccess; } ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute int driverVersion; CUDACHECK(cudaDriverGetVersion(&driverVersion)); if (driverVersion >= 11030) { int cudaDev, attr = 0; CUDACHECK(cudaGetDevice(&cudaDev)); CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); *gdrSupport = attr; return ncclSuccess; } #endif static int gdrSupportMatrix[32] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; if (gdrSupportMatrix[comm->cudaDev] == -1) { int netDevs; NCCLCHECK(comm->ncclNet->devices(&netDevs)); gdrSupportMatrix[comm->cudaDev] = 0; for (int dev=0; devncclNet->getProperties(dev, &props)); if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; // Allocate memory on the GPU and try to register it on the NIC. void *lComm = NULL, *sComm = NULL, *rComm = NULL; ncclNetHandle_t handle; char* gpuPtr = NULL; void* mHandle = NULL; ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); bool connected; connected = false; while (!connected) { // If we're aborting now, skip to cleanup if (*comm->abortFlag) { goto cleanup2; } if (sComm == NULL) NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2); if (rComm == NULL) NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2); connected = (rComm != NULL) && (sComm != NULL); } NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); gdrSupportMatrix[comm->cudaDev] = 1; } ncclDebugNoWarn = 0; NCCLCHECK(ncclCudaFree(gpuPtr)); cleanup2: if (rComm != NULL) NCCLCHECK(comm->ncclNet->closeRecv(rComm)); if (sComm != NULL) NCCLCHECK(comm->ncclNet->closeSend(sComm)); NCCLCHECK(comm->ncclNet->closeListen(lComm)); cleanup1: break; } } *gdrSupport = gdrSupportMatrix[comm->cudaDev]; return ncclSuccess; } int ncclNetVersion(struct ncclComm* comm) { return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6); } nccl-2.18.3-1/src/proxy.cc000066400000000000000000001723031444201535000151450ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "info.h" #include "collectives.h" #include "socket.h" #include "shm.h" #include "profiler.h" #define ENABLE_TIMER 0 #include "timer.h" #include #include enum { proxyRecv=0, proxySend=1 }; static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; /* In chains, one rank does not need a proxy. 
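For a broadcast pipeline, for example, the root needs no recv proxy (it has nothing to receive from the network); for a reduce pipeline the root needs no send proxy.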
Let's figure out which one it is */ /* Which index in the reorganized rings should we compare root against */ const int myrank = 0, nextrank = 1, prevrank = nranks-1; int index = pattern == ncclPatternPipelineFrom ? /* no recv / no send if root = */ /* bcast */ (type == proxyRecv ? myrank : nextrank ): /* reduce */ (type == proxyRecv ? prevrank : myrank ); int rank = ring->userRanks[index]; return (root != rank); } #define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS struct ncclProxyPool { struct ncclProxyPool *next; struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; }; static void expectedProxyResponseFree(struct ncclProxyState* state) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; struct ncclExpectedProxyResponse* prev = NULL; while (elem) { prev = elem; elem = elem->next; free(prev->respBuff); free(prev); } } static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; while (elem) { if (elem->opId == opId) { if (respSize != elem->respSize) { WARN("Mismatched response size for opId=%p", opId); return ncclInternalError; } if (elem->done) { WARN("Storing response for already completed opId=%p", opId); return ncclInternalError; } memcpy(elem->respBuff, respBuff, respSize); free(respBuff); elem->done = true; return ncclSuccess; } elem = elem->next; } WARN("Proxy response for opId=%p doesn't match any expected response", opId); return ncclInternalError; } static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, void* opId, int respSize) { struct ncclExpectedProxyResponse* ex; NCCLCHECK(ncclCalloc(&ex, 1)); ex->opId = opId; // Pre-alloc response buffer ex->respBuff = malloc(respSize); ex->respSize = respSize; ex->done = false; // Enqueue struct ncclExpectedProxyResponse* list = state->expectedResponses; if (list == NULL) { state->expectedResponses = ex; return ncclSuccess; } while (list->next) list = list->next; list->next = ex; return ncclSuccess; } static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, void* opId, void* respBuff, int* found) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; struct ncclExpectedProxyResponse* prev = NULL; *found = 0; while (elem) { if ((elem->opId == opId) && elem->done) { if (prev == NULL) { state->expectedResponses = elem->next; } else { prev->next = elem->next; } memcpy(respBuff, elem->respBuff, elem->respSize); free(elem->respBuff); free(elem); *found = 1; return ncclSuccess; } prev = elem; elem = elem->next; } return ncclSuccess; } static ncclResult_t expectedProxyResponseRemove(struct ncclProxyState* state, void* opId) { struct ncclExpectedProxyResponse* elem = state->expectedResponses; struct ncclExpectedProxyResponse* prev = NULL; while (elem) { if (elem->opId == opId) { if (prev == NULL) { state->expectedResponses = elem->next; } else { prev->next = elem->next; } free(elem->respBuff); free(elem); return ncclSuccess; } prev = elem; elem = elem->next; } WARN("Couldn't find opId=%p", opId); return ncclInternalError; } static ncclResult_t asyncProxyOpEnqueue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) { ncclProxyAsyncOp* list = peer->asyncOps; if (list == NULL) { peer->asyncOps = op; return ncclSuccess; } while (list->next) list = list->next; list->next = op; return ncclSuccess; } static ncclResult_t asyncProxyOpDequeue(struct ncclProxyLocalPeer* peer, ncclProxyAsyncOp* op) { struct ncclProxyAsyncOp* elem = peer->asyncOps; struct 
ncclProxyAsyncOp* prev = NULL; while (elem) { if (elem->opId == op->opId) { if (prev == NULL) { peer->asyncOps = elem->next; } else { prev->next = elem->next; } if (elem->reqBuff) { free(elem->reqBuff); } if (elem->respBuff) { free(elem->respBuff); } free(elem); return ncclSuccess; } prev = elem; elem = elem->next; } if (op) { WARN("Attempting to dequeue nonexistent async opId=%p", op->opId); } else { WARN("Attempting to dequeue null operation"); } return ncclInternalError; } static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) { struct ncclProxyArgs* elem; if (state->pool == NULL) { // Allocate a new pool of elements. Make sure we allocate the memory close // to the network thread struct ncclProxyPool* newPool; NCCLCHECK(ncclCalloc(&newPool, 1)); struct ncclProxyArgs* newElems = newPool->elems; // Chain newly allocated elements for (int i=0; ipool = newElems; // Save the pool memory block for later resource release newPool->next = state->pools; state->pools = newPool; } elem = state->pool; state->pool = state->pool->next; elem->next = elem->nextPeer = NULL; *argsptr = elem; return ncclSuccess; } //#define DEBUG_PROXY 1 #ifdef DEBUG_PROXY #define DEBUG_PROXY_PRINT printf #else #define DEBUG_PROXY_PRINT(...) #endif #define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1) #define OP_SEEN 0x100000 ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex) { struct ncclProxyPool* pool = state->pools; int p = 0; while (pool) { uint64_t o = op-pool->elems; if (o < PROXYARGS_ALLOCATE_SIZE) { *opIndex = o; *poolIndex = p; return ncclSuccess; } pool = pool->next; p++; } WARN("Could not find pool of op %p", op); return ncclInternalError; } ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) { printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? 
"Recv" : "Coll"); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = op->subs+s; if (op->state == ncclProxyOpProgress) { char status = ' '; if (op->pattern == ncclPatternRecv) { if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init else if (sub->received < sub->posted) status = 'R'; // Receiving else if (sub->received < sub->transmitted) status = 'R'; // Receiving else if (sub->transmitted < sub->received) status = 'F'; // Flushing else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU else status = 'D'; // Done } else if (op->pattern == ncclPatternSend) { if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU else if (sub->done < sub->transmitted) status = 'S'; // Sending else status = 'D'; // Done } printf(" %d%c/%d", sub->peer, status, sub->channelId); } else { printf(" %d/%d", sub->peer, sub->channelId); } } printf("]"); return ncclSuccess; } ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) { struct ncclProxyArgs* op = state->active; int poolIndex, opIndex; printf("ACTIVE OPS\n"); while (op) { NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); if (op->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); op->state |= OP_SEEN; printf("\n"); struct ncclProxyArgs* nextOp = op->nextPeer; while (nextOp) { NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex)); if (nextOp->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } printf("| `-> "); NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex)); nextOp->state |= OP_SEEN; printf("\n"); if (nextOp->next) { WARN("Inactive op has next set!"); } nextOp = nextOp->nextPeer; } if (op->nextPeer == NULL) printf("|\n"); op = op->next; printf("v\n"); } printf("[X]\n"); # if 0 printf("FREE OPS\n"); op = state->pool; while (op) { NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); if (op->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); op->state |= OP_SEEN; printf("->"); op = op->next; } printf("[X]\n"); #else op = state->pool; while (op) { NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); if (op->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); } op->state |= OP_SEEN; op = op->next; } #endif struct ncclProxyPool* pool = state->pools; poolIndex = 0; while (pool) { struct ncclProxyArgs* elem = pool->elems; for (int e=0; estate & OP_SEEN) == 0) { printf("Elem %d-%d is not in any list:\n", poolIndex, e); NCCLCHECK(printProxyOp(elem, poolIndex, e)); printf("\n"); } else { elem->state -= OP_SEEN; } } pool = pool->next; poolIndex++; } return ncclSuccess; } static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyArgs* args, int subIndex) { struct ncclProxySubArgs* sub = args->subs+subIndex; if (subIndex >= NCCL_PROXY_MAX_SUBS) { WARN("Proxy append out of bounds"); return ncclInternalError; } //memset(sub, 0, sizeof(struct ncclProxySubArgs)); sub->connection = op->connection; sub->channelId = op->channelId; sub->nsteps = op->nsteps; sub->nbytes = op->nbytes; sub->peer = op->root; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || (args->chunkSteps != op->chunkSteps) || (args->protocol != op->protocol) || (args->dtype != op->dtype) || (args->redOp != op->redOp)) { WARN("Proxy append mismatch"); return 
ncclInternalError; } if (args->state != ncclProxyOpReady) { WARN("Proxy append on running operation"); return ncclInternalError; } return ncclSuccess; } //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress)); args->done = 0; args->opCount = op->opCount; args->sliceSteps = op->sliceSteps; args->chunkSteps = op->chunkSteps; args->chunkSize = op->chunkSize; args->dtype = op->dtype; args->redOp = op->redOp; args->pattern = op->pattern; args->protocol = op->protocol; args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; args->proxyAppendPtr = op->connection->proxyAppendPtr; return ncclSuccess; } static ncclResult_t ProxyAppend(struct ncclProxyProgressState* state, struct ncclProxyOp* op) { struct ncclProxyConnection* connection = op->connection; int shared = connection->shared; struct ncclProxyArgs* args = *connection->proxyAppendPtr; if (args) { if (shared && args->opCount == op->opCount) { NCCLCHECK(ncclProxyOpToArgs(op, args, args->nsubs)); DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args)); } else { struct ncclProxyArgs* prevArgs = args; NCCLCHECK(allocateArgs(state, &args)); NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); prevArgs->nextPeer = args; DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs)); *(args->proxyAppendPtr) = args; } } else { // Nothing running for that peer. Add to the list NCCLCHECK(allocateArgs(state, &args)); NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); if (state->active == NULL) { // Create the list DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount); state->active = args; } else { // Append element at the end of the list struct ncclProxyArgs* last = state->active; while (last->next) last = last->next; last->next = args; DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount); } *(args->proxyAppendPtr) = args; } return ncclSuccess; } ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextOpsEnd) { pthread_mutex_lock(&pool->mutex); if (pool->nextOps == -1) { pool->nextOps = nextOps; pthread_cond_signal(&pool->cond); } else { pool->ops[pool->nextOpsEnd].next = nextOps; } pool->nextOpsEnd = nextOpsEnd; pthread_mutex_unlock(&pool->mutex); return ncclSuccess; } static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) { int tpLocalRank = comm->topParentLocalRanks[comm->localRank]; struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps; if (proxyOps == NULL) return ncclInternalError; proxyOps += proxyConn->tpLocalRank; struct ncclProxyOpsPool* pool = proxyOps->pool; TIME_START(0); int opIndex = proxyOps->freeOp; struct ncclProxyOp* op; if (opIndex != -1) { op = pool->ops+opIndex; proxyOps->freeOp = op->next; } else { int freeOp; while ((freeOp = pool->freeOps[tpLocalRank]) == -1) sched_yield(); int freeOpNew; while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+tpLocalRank, freeOp, -1)) != freeOp) freeOp = freeOpNew; opIndex = freeOp; op = pool->ops+opIndex; proxyOps->freeOp = op->next; } if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op memcpy(op, proxyOp, sizeof(struct ncclProxyOp)); op->next = -1; op->connection = proxyConn->connection; if (proxyOps->nextOps == -1) { proxyOps->nextOps = proxyOps->nextOpsEnd = 
opIndex; } else { pool->ops[proxyOps->nextOpsEnd].next = opIndex; proxyOps->nextOpsEnd = opIndex; } if (++proxyOps->count == MAX_OPS_PER_PEER) { // Post what we have so far to free some ops in the pool // Do not post last operations as we could have more coming with the same opCount, and posting // them in different batches would break proxyArgs aggregation with subs. uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount; int lastOp = -1; int toSend = 0; int ops = 0; for (int op= proxyOps->nextOps; op != proxyOps->nextOpsEnd; op=pool->ops[op].next) { ops++; if (pool->ops[op].opCount != lastOpCount) { lastOp = op; toSend = ops; } } if (lastOp == -1) { WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount); return ncclInternalError; } // Cut chain at lastOp int nextOps = proxyOps->nextOps; proxyOps->nextOps = pool->ops[lastOp].next; pool->ops[lastOp].next = -1; NCCLCHECK(ncclProxyPost(proxyOps->pool, nextOps, lastOp)); proxyOps->count -= toSend; } TIME_STOP(0); return ncclSuccess; } static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; struct ncclChannelPeer* peerComm = channel->peers[peer]; struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) { WARN("Rank %d has no transport for %s peer %d on channel %d/%d", comm->rank, type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex); return ncclInternalError; } if (connector->transportComm->proxyProgress == NULL) return ncclSuccess; if (justInquire) *justInquire = true; else { NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op)); } return ncclSuccess; } // justInquire != nullptr means don't actually do anything, just ascertain the need of // ncclProxySaveOp for this op.
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { struct ncclChannel* channel = &comm->channels[op->channelId]; if (justInquire) *justInquire = false; switch (op->pattern) { case ncclPatternRing: case ncclPatternRingTwice: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: { struct ncclRing* ring = &channel->ring; if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) { NCCLCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, 0, justInquire)); } if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) { NCCLCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, 0, justInquire)); } } break; case ncclPatternTreeUp: case ncclPatternTreeDown: case ncclPatternTreeUpDown: { if (op->pattern != ncclPatternTreeDown) { // Tree up struct ncclTree* tree = &channel->tree; for (int i=0; idown[i], op, 0, justInquire)); } NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->up, op, 0, justInquire)); } if (op->pattern != ncclPatternTreeUp) { // Tree down struct ncclTree* tree = &channel->tree; for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) { NCCLCHECK(SaveProxy(comm, channel, proxySend, tree->down[i], op, 0, justInquire)); } NCCLCHECK(SaveProxy(comm, channel, proxyRecv, tree->up, op, 0, justInquire)); } } break; case ncclPatternCollnetChain: { NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetChain.up, op, 1, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire)); } break; case ncclPatternCollnetDirect: { NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->collnetDirect.out, op, 1, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire)); } break; case ncclPatternNvls: { NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.out, op, 1, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.out, op, 0, justInquire)); } break; case ncclPatternNvlsTree: { NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[1], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[2], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeUp, op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[1], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire)); NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire)); } break; case ncclPatternSend: case ncclPatternRecv: { if (op->root == comm->rank) return ncclSuccess; NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; } return ncclSuccess; } NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) { memset(op, 0, sizeof(struct ncclProxyOp)); int channelId = info->channelId; struct ncclChannel* channel = info->comm->channels+channelId; op->channelId = channelId; op->sliceSteps = 1; op->chunkSteps = 1; op->dtype = info->datatype; op->protocol = info->protocol; int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS; if (op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pChunkSize; info->chunkSize = stepSize; op->root = info->root; struct ncclChannelPeer* peer = channel->peers[op->root]; if (info->coll == ncclFuncSend) { op->pattern = ncclPatternSend; if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) { // Tune chunk size for the network if (info->count < stepSize) info->chunkSize /= 4; else if (info->count < 8*stepSize) info->chunkSize /= 2; } } else if (info->coll == ncclFuncRecv) { op->pattern = ncclPatternRecv; if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) { // Tune chunk size for the network if (info->count < stepSize) info->chunkSize /= 4; else if (info->count < 8*stepSize) info->chunkSize /= 2; } } else { WARN("P2p operation is neither send or recv"); return ncclInternalError; } if (ncclParamChunkSize() != 0) { info->chunkSize = ncclParamChunkSize(); } op->chunkSize = info->chunkSize; // Compute nSteps for proxies int chunkEffectiveSize = op->chunkSize; if (op->protocol == NCCL_PROTO_LL) { chunkEffectiveSize /= 2; } op->nbytes = stepSize; op->nsteps = DIVUP(info->count, chunkEffectiveSize); if (op->nsteps == 0) op->nsteps = 1; return ncclSuccess; } static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; struct ncclProxyArgs* next = freeOp->next; DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next)); *opPtr = next; if (freeOp->nextPeer) { // replace op by nextPeer struct ncclProxyArgs* nextPeer = freeOp->nextPeer; if (*prevOpPtr) { (*prevOpPtr)->next = nextPeer; } else { state->active = nextPeer; } nextPeer->next = next; *(prevOpPtr) = nextPeer; } else { *(freeOp->proxyAppendPtr) = NULL; if (*prevOpPtr) { (*prevOpPtr)->next = next; } else { state->active = next; } } freeOp->next = state->pool; state->pool = freeOp; DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr)); #ifdef DEBUG_PROXY NCCLCHECK(dumpProxyState(state)); #endif return ncclSuccess; } static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; struct ncclProxyArgs* op = opStart; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); NCCLCHECK(op->progress(proxyState, op)); if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone) { TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); } else { prevOp = op; op = op->next; } } return ncclSuccess; } NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16); static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int* added) { struct ncclProxyProgressState* state = 
&proxyState->progressState; if (state->opsPool == NULL) return ncclInternalError; struct ncclProxyOpsPool* pool = state->opsPool; struct ncclProxyArgs profArgs; // Only used for profiling purposes if (state->nextOps != -1) goto process_nextops; // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock // to be available. Exit, continue progress, and come back later. if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess; if (state->active == NULL) { pthread_mutex_lock(&pool->mutex); while (pool->nextOps == -1 && !state->stop) { struct ncclProxyArgs profArgs; // Only used for profiling purposes ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep); pthread_cond_wait(&pool->cond, &pool->mutex); ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup); } if (state->stop) { // We might have been woken up to stop. pthread_mutex_unlock(&pool->mutex); return ncclSuccess; } } state->nextOps = pool->nextOps; pool->nextOps = pool->nextOpsEnd = -1; pthread_mutex_unlock(&pool->mutex); if (state->nextOps == -1) return ncclInternalError; process_nextops: ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend); TIME_START(2); int freeOp[NCCL_MAX_LOCAL_RANKS]; int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1; uint64_t lastOpCount = 0; int lastPeer = -1; int count = 0; for (int opIndex = state->nextOps; opIndex != -1;) { struct ncclProxyOp* peerOp = pool->ops+opIndex; int peer = opIndex / MAX_OPS_PER_PEER; if ((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer)) count++; if (count == ncclParamProxyAppendBatchSize()+1) break; lastOpCount = peerOp->opCount; lastPeer = peer; if (peerOp->connection == NULL) return ncclInternalError; if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next); NCCLCHECK(ProxyAppend(state, peerOp)); (*added)++; int lastOpIndex = opIndex; opIndex = peerOp->next; // Return op to peer pool if (freeOp[peer] == -1) { freeOpEnd[peer] = lastOpIndex; } else { peerOp->next = freeOp[peer]; } freeOp[peer] = lastOpIndex; state->nextOps = opIndex; } for (int i = 0; i < proxyState->tpLocalnRanks; i++) { if (freeOp[i] == -1) continue; int newFree = freeOp[i]; int oldFree = pool->freeOps[i]; pool->ops[freeOpEnd[i]].next = oldFree; if (oldFree == -1) { // Nothing for the main thread to consume, we can set it. pool->freeOps[i] = newFree; } else { // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked. int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree); if (swap != oldFree) { if (swap != -1) return ncclInternalError; // Ops were recycled while we were trying to swap, just set the value directly now. 
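// [Editor's illustrative sketch, not part of NCCL] The free-list hand-off above in
// miniature: the producer links its batch in front of the head value it last read, then
// publishes with a single compare-and-swap; if the consumer drained the list in the
// meantime (swap returned -1), the producer stores directly. demoFreeHead, demoNext and
// demoPush are hypothetical names standing in for the per-peer freeOps machinery.
#if 0
#include <stdio.h>
static int demoFreeHead = -1;                  // -1 means "empty", as in the ops pool
static int demoNext[8];                        // index-based next links
static void demoPush(int batchHead, int batchTail) {
  int oldHead = demoFreeHead;
  demoNext[batchTail] = oldHead;               // link batch in front of what we saw
  int swap = __sync_val_compare_and_swap(&demoFreeHead, oldHead, batchHead);
  if (swap != oldHead) {                       // consumer changed the head under us
    // In the real code this can only mean the list was recycled down to -1.
    demoNext[batchTail] = -1;
    demoFreeHead = batchHead;
  }
}
int main() { demoPush(2, 3); printf("head=%d\n", demoFreeHead); return 0; }
#endif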
pool->ops[freeOpEnd[i]].next = -1; pool->freeOps[i] = newFree; } } } profArgs.opCount = *added; ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd); TIME_STOP(2); return ncclSuccess; } #include static ncclProxyProgressState* ncclLastProxyState; void ncclDumpProxyState(int signal) { dumpProxyState(ncclLastProxyState); } NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0); static int setProxyThreadContext(struct ncclProxyState* proxyState) { #if CUDART_VERSION >= 11030 static int createThreadContext = -1; if (createThreadContext == -1) { createThreadContext = ncclParamCreateThreadContext(); if (createThreadContext) { if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) { WARN("Unable to create thread context due to old driver, disabling."); createThreadContext = 0; } } } if (createThreadContext) { if (proxyState->cudaCtx == NULL) { if (CUPFN(cuCtxCreate(&proxyState->cudaCtx, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) { WARN("Failed to create CUDA context on device %d", proxyState->cudaDev); createThreadContext = 0; } } else { if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) { WARN("Failed to set CUDA context on device %d", proxyState->cudaDev); return 0; } return 1; } } #endif return 0; } // Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1); NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8); void* ncclProxyProgress(void *proxyState_) { struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_; if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev); } // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); struct ncclProxyProgressState* state = &proxyState->progressState; state->nextOps = -1; const int sig = ncclParamProxyDumpSignal(); if (sig != -1) signal(sig, ncclDumpProxyState); ncclLastProxyState = state; char threadName[NCCL_THREAD_NAMELEN]; snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", proxyState->cudaDev); nvtxNameOsThreadA(syscall(SYS_gettid), threadName); int lastIdle = 0; /* Too frequent call of ncclProxyGetPostedOps() will result in perf regression for small message * communication. proxyOpAppendCounter is a counter that helps us decide if we need to append proxy ops. * After each progress, proxyOpAppendCounter will increase by 1 and compare with environment variable * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. 
*/ int proxyOpAppendCounter = 0; struct ncclProxyArgs profArgs; // Only used for profiling purposes while ((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); return NULL; } if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle); if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive); if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { int added = 0; proxyOpAppendCounter = 0; TIME_START(3); if (state->stop == false) ret = ncclProxyGetPostedOps(proxyState, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); } if (added == 0) { sched_yield(); // No request progressed. Let others run. } } lastIdle = idle; } return NULL; } ncclResult_t ncclProxyStart(struct ncclComm* comm) { struct ncclProxyOps* proxyOps = comm->proxyState->proxyOps; if (proxyOps == NULL) return ncclSuccess; TIME_START(1); for (int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) { struct ncclProxyOps* ops = proxyOps + r; if (ops->pool == NULL || ops->nextOps == -1) continue; NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd)); ops->nextOps = ops->nextOpsEnd = -1; ops->count = 0; } comm->opCount++; TIME_STOP(1); return ncclSuccess; } static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (!state->thread) { pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState); ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks); } return ncclSuccess; } ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; // Request the proxy to stop and then wake it if (state->opsPool) { pthread_mutex_lock(&state->opsPool->mutex); state->stop = true; pthread_cond_signal(&state->opsPool->cond); pthread_mutex_unlock(&state->opsPool->mutex); pthread_join(state->thread, NULL); } // Free off any memory allocated for the proxy arg pools while (state->pools != NULL) { struct ncclProxyPool *next = state->pools->next; free(state->pools); state->pools = next; } ncclProfilingDump(); TIME_PRINT("Proxy"); return ncclSuccess; } #define NCCL_PROXY_CONN_POOL_SIZE_POW2 7 #define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2)) #define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1) struct ncclProxyConnectionPool { struct ncclProxyConnection** pools; int banks; int offset; }; static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) { NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1)); NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE)); pool->banks++; pool->offset = 0; } *id = ((pool->banks-1) << NCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset; pool->offset++; return ncclSuccess; } static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, int id, struct ncclProxyConnection** conn) { int bank = id>>NCCL_PROXY_CONN_POOL_SIZE_POW2; int offset = id&NCCL_PROXY_CONN_POOL_MASK; if ((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL)) 
return ncclInternalError; *conn = pool->pools[bank]+offset; return ncclSuccess; } static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { if (connection->send) { if (ncclTransports[connection->transport]->send.proxyFree) { NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, proxyState)); } } else { if (ncclTransports[connection->transport]->recv.proxyFree) { NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, proxyState)); } } return ncclSuccess; } static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclProxyState* proxyState) { for (int b=0; b<pool->banks; b++) { int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE; for (int i=0; i<max; i++) { struct ncclProxyConnection *connection = pool->pools[b]+i; if (connection->state != connUninitialized) { NCCLCHECK(proxyFree(connection, proxyState)); } } free(pool->pools[b]); } free(pool->pools); return ncclSuccess; } #include "transport.h" struct ncclProxyInitReq { int transport; int send; int tpLocalRank; int tpRank; int sameProcess; }; struct ncclProxyInitResp { ncclProxyConnection* connection; char devShmPath[6]; // "XXXXXX" - May or may not be set }; ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) { struct ncclSocket* sock; int ready, proxyRank = -1; struct ncclProxyState* sharedProxyState = comm->proxyState; // Keep one connection per local rank for (int i = 0; i < comm->localRanks; ++i) { /* find the proxy rank in comm. */ if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) { proxyRank = comm->localRankToRank[i]; break; } } proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; // Keep one connection per local rank proxyConn->connection = NULL; proxyConn->tpRank = tpProxyRank; if (sharedProxyState->peerSocks == NULL) { NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks)); NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks)); NCCLCHECK(ncclCalloc(&sharedProxyState->sharedDevMems, comm->sharedRes->tpNLocalRanks)); for (int i = 0; i < comm->sharedRes->tpNLocalRanks; ++i) { NCCLCHECK(ncclSocketSetFd(-1, &sharedProxyState->peerSocks[i])); } } proxyConn->tpLocalRank = comm->sharedRes->tpRankToLocalRank[proxyConn->tpRank]; sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; NCCLCHECK(ncclSocketReady(sock, &ready)); if (!ready) { NCCLCHECK(ncclSocketInit(sock, sharedProxyState->peerAddresses+proxyConn->tpRank, comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(sock)); } struct ncclProxyInitReq req = {0}; req.transport = transport; req.send = send; req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; req.tpRank = comm->topParentRanks[comm->rank]; req.sameProcess = proxyConn->sameProcess; struct ncclProxyInitResp resp = {0}; // This usually sends proxyConn->connection to identify which connection this is. // However, this is part of the response and therefore is ignored NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgInit, &req, sizeof(req), &resp, sizeof(resp))); proxyConn->connection = resp.connection; // If we need proxy progress, map progress ops struct ncclTransportComm* tcomm = send ?
&ncclTransports[transport]->send : &ncclTransports[transport]->recv; if (tcomm->proxyProgress) { char poolPath[] = "/dev/shm/nccl-XXXXXX"; strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank; if (proxyOps->pool == NULL) { NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle)); proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } INFO(NCCL_NET|NCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection); return ncclSuccess; } // cuMem API support // The response is sent out-of-band using ncclIpcSocket for this specific command ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd) { ncclResult_t ret = ncclSuccess; ncclResult_t res = ncclInProgress; struct ncclIpcSocket ipcSock = { 0 }; void* opId = malloc(1); // Create a UDS socket to receive the converted fd NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag)); // Request the conversion of the fd over sockets NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error); // Receive converted fd over UDS NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, convertedFd)); TRACE(NCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd); NCCLCHECK(ncclIpcSocketClose(&ipcSock)); while (res == ncclInProgress) { res = ncclPollProxyResponse(comm, proxyConn, NULL, opId); } free(opId); return res; error: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); WARN("ncclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank); return ret; } const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; struct ncclProxyState* sharedProxyState = comm->proxyState; if (sharedProxyState->peerSocks == NULL) return ncclInternalError; sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; if (sock == NULL) return ncclInternalError; NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error); NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error); if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error); // Send opId to proxy NCCLCHECKGOTO(ncclSocketSend(sock, &opId, sizeof(opId)), ret, error); // Add proxyOp to expected response queue NCCLCHECK(expectedProxyResponseEnqueue(sharedProxyState, opId, respSize)); return ncclSuccess; error: return ret; } ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) { struct ncclProxyState* sharedProxyState = comm->proxyState; // Receive the connection pointer from the Proxy if (*comm->abortFlag) { WARN("Comm %p is in abort state", comm); return ncclInternalError; } if (sharedProxyState->peerSocks == NULL) return ncclInternalError; // Check response queue int found = 0; 
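// [Editor's illustrative sketch, not part of NCCL] The matching done below in miniature:
// responses coming back from the proxy thread are keyed by opId; if a response arrives
// for a different opId than the one being polled, it is stashed and handed back on a
// later poll. DemoResp, demoStash and demoTake are hypothetical names standing in for
// the expectedProxyResponse* helpers used here.
#if 0
#include <stdlib.h>
struct DemoResp { void* opId; void* buff; int size; struct DemoResp* next; };
static struct DemoResp* demoPending = NULL;
static void demoStash(void* opId, void* buff, int size) {
  struct DemoResp* r = (struct DemoResp*)malloc(sizeof(*r));
  r->opId = opId; r->buff = buff; r->size = size; r->next = demoPending; demoPending = r;
}
static int demoTake(void* opId, void** buff, int* size) {   // returns 1 if found
  for (struct DemoResp** p = &demoPending; *p; p = &(*p)->next) {
    if ((*p)->opId == opId) {
      struct DemoResp* r = *p; *p = r->next;
      *buff = r->buff; *size = r->size; free(r); return 1;
    }
  }
  return 0;
}
#endif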
NCCLCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found)); if (found == 0) { // Attempt to read in a new response header from the proxy thread struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank; void* recvOpId; int offset = 0; if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) { WARN("Socket recv failed while polling for opId=%p", opId); return ncclInternalError; } if (offset == 0) { return ncclInProgress; // If we've returned a partial response, block to receive the rest of it } else if (offset < sizeof(recvOpId)) { while (offset < sizeof(recvOpId)) NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)); } INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", recvOpId); // Now do a blocking recv of the response size int respSize = 0; NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize))); // If there's a respSize to recv if (respSize > 0) { if (recvOpId != opId) { // Unexpected response, need to buffer the socket data respBuff = malloc(respSize); } assert(respBuff != NULL); NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize)); } if (recvOpId == opId) { INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId); NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId)); return ncclSuccess; } else { INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize); // Store the result and mark response as completed NCCLCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize)); return ncclInProgress; } } else { INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId); } return ncclSuccess; } ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { // Alloc some memory to act as a handle ncclResult_t res = ncclSuccess; void* opId = malloc(1); NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail); do { res = ncclPollProxyResponse(comm, proxyConn, respBuff, opId); } while (res == ncclInProgress); exit: free(opId); return res; fail: goto exit; } static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (state->opsPool == NULL) { int size = sizeof(struct ncclProxyOpsPool); struct ncclProxyOpsPool* pool = NULL; char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; shmPath[0] = '\0'; NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle)); // Init pool pool->nextOps = -1; for (int r = 0; r < proxyState->tpLocalnRanks; r++) { pool->freeOps[r] = r*MAX_OPS_PER_PEER; for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1; pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1; } // Setup mutex/cond to work inter-process pthread_mutexattr_t mutexAttr; pthread_mutexattr_init(&mutexAttr); pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED); pthread_mutex_init(&pool->mutex, &mutexAttr); pthread_condattr_t condAttr; pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&pool->cond, &condAttr); state->opsPool = pool; memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); // All ops structures are created, we can start the progress thread NCCLCHECK(ncclProxyProgressCreate(proxyState)); } return ncclSuccess; }
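// [Editor's illustrative sketch, not part of NCCL] proxyProgressInit() above places the
// ops pool in shared memory and makes its mutex/cond usable across processes via
// PTHREAD_PROCESS_SHARED attributes. This shows that attribute setup in isolation
// (DemoSharedSync and demoSharedSyncInit are hypothetical names); in real use the
// structure would live in a mapped shm segment rather than ordinary heap memory.
#if 0
#include <pthread.h>
struct DemoSharedSync { pthread_mutex_t mutex; pthread_cond_t cond; };
static int demoSharedSyncInit(struct DemoSharedSync* s) {
  pthread_mutexattr_t mattr;
  pthread_condattr_t cattr;
  if (pthread_mutexattr_init(&mattr)) return -1;
  if (pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) return -1;
  if (pthread_mutex_init(&s->mutex, &mattr)) return -1;
  if (pthread_condattr_init(&cattr)) return -1;
  if (pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED)) return -1;
  if (pthread_cond_init(&s->cond, &cattr)) return -1;
  return 0;
}
#endif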
static void proxyOpsFree(struct ncclProxyState* proxyState) { struct ncclProxyProgressState* state = &proxyState->progressState; if (ncclShmClose(state->handle) != ncclSuccess) { WARN("[Service thread] shm close failed"); } } ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { struct ncclProxyProgressState* state = &comm->proxyState->progressState; if (state->opsPool == NULL) return ncclSuccess; if (ncclShmUnlink(state->handle) != ncclSuccess) { WARN("[Service thread] proxy ops shm unlink failed"); } return ncclSuccess; } static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, ncclProxyInitReq* req, ncclProxyInitResp* resp, struct ncclProxyConnection** connection) { int id; NCCLCHECK(ncclProxyNewConnection(connectionPool, &id)); NCCLCHECK(ncclProxyGetConnection(connectionPool, id, connection)); (*connection)->sock = &peer->sock; (*connection)->transport = req->transport; (*connection)->send = req->send; (*connection)->tpLocalRank = req->tpLocalRank; (*connection)->sameProcess = req->sameProcess; peer->tpLocalRank = req->tpLocalRank; peer->tpRank = req->tpRank; resp->connection = *connection; (*connection)->tcomm = (*connection)->send ? &ncclTransports[(*connection)->transport]->send : &ncclTransports[(*connection)->transport]->recv; // If we need proxy progress, let's allocate ops and start the thread if ((*connection)->tcomm->proxyProgress) { NCCLCHECK(proxyProgressInit(proxyState)); struct ncclProxyProgressState* state = &proxyState->progressState; strncpy(resp->devShmPath, state->opsPoolShmSuffix, sizeof(resp->devShmPath)); } INFO(NCCL_NET|NCCL_PROXY, "New proxy %s connection %d from local rank %d, transport %d", (*connection)->send ? "send":"recv", id, (*connection)->tpLocalRank, (*connection)->transport); __atomic_store_n(&(*connection)->state, connInitialized, __ATOMIC_RELEASE); return ncclSuccess; } // cuMem API support static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, int fd) { struct ncclIpcSocket ipcSock = { 0 }; uint64_t hash = (uint64_t) opId; INFO(NCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash); // Send back the converted fd using UDS NCCLCHECK(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag)); NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash)); NCCLCHECK(ncclIpcSocketClose(&ipcSock)); return ncclSuccess; } static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) { int done = 1; if (op->type == ncclProxyMsgSetup) { TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId); NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); } else if (op->type == ncclProxyMsgConnect) { TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff); NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); } else if (op->type == ncclProxyMsgSharedInit) { int nChannels = (int) *op->reqBuff; TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels); if (op->connection->tcomm->proxySharedInit) 
NCCLCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels)); __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE); } else if (op->type == ncclProxyMsgConvertFd) { int fd = *(int *)op->reqBuff; TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd); NCCLCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support } else if (op->type == ncclProxyMsgInit) { TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff); NCCLCHECK(proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection)); } else return ncclInternalError; if (done) { INFO(NCCL_PROXY, "proxyProgressAsync opId=%p op.type=%d op.reqBuff=%p op.respSize=%d done", op->opId, op->type, op->reqBuff, op->respSize); if (op->type == ncclProxyMsgSetup) __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE); else if (op->type == ncclProxyMsgConnect) __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE); /* if setup or connect is done, we should not return any error at this point since * ncclSocketSend might already send the respBuff to the requester. If we still choose * to abort and close the connection, it can cause segfault if the requester is using * the respBuff. */ // Send the opId for referencing async operation NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId))); // Send the response size NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize))); if (op->respSize) { // Send the response NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize)); } asyncProxyOpDequeue(peer, op); (*asyncOpCount)--; return ncclSuccess; } else if (*proxyState->abortFlag != 0) { return ncclInternalError; } return ncclInProgress; } static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) { struct ncclSocket* sock = &peer->sock; struct ncclProxyAsyncOp* asyncOp; NCCLCHECK(ncclCalloc(&asyncOp, 1)); asyncOp->type = type; NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*))); NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int))); NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int))); if (asyncOp->reqSize) { NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize)); NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize)); } // Store opId for completion response NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId))); if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize)); asyncProxyOpEnqueue(peer, asyncOp); (*asyncOpCount)++; NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool)); return ncclSuccess; } #include static bool proxyMatchOpType(int type) { switch (type) { case ncclProxyMsgInit: case ncclProxyMsgSharedInit: case ncclProxyMsgSetup: case ncclProxyMsgConnect: case ncclProxyMsgConvertFd: return true; default: return false; } } void* ncclProxyService(void* _args) { struct ncclProxyState* proxyState = (struct ncclProxyState*) _args; // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Service] Created CUDA context on device %d", 
proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev); } // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); // Prepare poll descriptor struct ncclProxyConnectionPool connectionPool; connectionPool.pools = NULL; connectionPool.banks = 0; connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) { pollfds[s].fd = -1; pollfds[s].events = POLLHUP|POLLIN; } if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { WARN("[Proxy Service] Get listenSock fd fails"); return NULL; }; pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; int maxnpeers = 0; int npeers = 0; int stop = 0; int asyncOpCount = 0; while (stop == 0 || (stop == 1 && npeers > 0)) { /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer * connections. Need to wait until all other related comms call abort and safely exit * together, or we could face segmentation fault. */ if (*proxyState->abortFlag != 0) stop = 1; /* never let proxy service thread block in poll, or it cannot receive abortFlag. */ int ret; do { ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500); } while (ret < 0 && errno == EINTR); if (ret < 0) { WARN("[Proxy Service] Poll failed: %s", strerror(errno)); return NULL; } if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { int s = 0; while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++; if (s == NCCL_MAX_LOCAL_RANKS) { WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); return NULL; } if (maxnpeers < s+1) maxnpeers = s+1; if (ncclSocketInit(&peers[s].sock) != ncclSuccess) { WARN("[Service thread] Initialize peers[%d].sock fails", s); return NULL; } if (ncclSocketAccept(&peers[s].sock, proxyState->listenSock) != ncclSuccess) { WARN("[Service thread] Accept failed %s", strerror(errno)); } else { if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) { WARN("[Service thread] Get peers[%d].sock fd fails", s); return NULL; } npeers++; peers[s].tpLocalRank = -1; } } for (int s=0; s<maxnpeers; s++) { struct ncclProxyLocalPeer* peer = peers+s; struct ncclSocket* sock = &peer->sock; int closeConn = 0; int type = 0; ncclResult_t res = ncclSuccess; if (pollfds[s].fd == -1) continue; // Progress all ops for this ncclProxyLocalPeer ncclProxyAsyncOp* op = peer->asyncOps; while (op != nullptr) { ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */ type = op->type; res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool); if (res == ncclSuccess || res == ncclInProgress) { op = opnext; } else { // Res is a bad result closeConn = 1; WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", ncclProxyMsgTypeStr[type], res); break; } } // Check for additional ops coming in if (pollfds[s].revents & POLLIN) { int closed; res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/); if (res != ncclSuccess && res != ncclInProgress) { WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed); closeConn = 1; } else if (closed) { INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank); closeConn = 1; } else if (res == ncclSuccess) { // We received something from the sock if (type == ncclProxyMsgStop) { stop = 1; closeConn = 1; } else if (type == ncclProxyMsgClose)
{ closeConn = 1; } else if (proxyMatchOpType(type)) { res = proxyServiceInitOp(type, peers+s, &connectionPool, proxyState, &asyncOpCount); } else { WARN("[Service thread] Unknown command %d from localRank %d", type, peer->tpLocalRank); closeConn = 1; } INFO(NCCL_PROXY, "Received and initiated operation=%s res=%d", ncclProxyMsgTypeStr[type], res); } } else if (pollfds[s].revents & POLLHUP) { closeConn = 1; } if (res != ncclSuccess && res != ncclInProgress) { WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res); closeConn = 1; } if (closeConn) { ncclSocketClose(sock); if (op != nullptr) { asyncProxyOpDequeue(peer, op); asyncOpCount--; } pollfds[s].fd = -1; npeers--; } } } // Wait for all operations to complete and stop progress thread before freeing any resource if (ncclProxyProgressDestroy(proxyState) != ncclSuccess) { WARN("[Proxy Service] proxyDestroy failed"); } for (int s=0; slistenSock); free(proxyState->listenSock); proxyOpsFree(proxyState); return NULL; } ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) { assert(comm->sharedRes->proxyState == NULL); NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1)); comm->proxyState = comm->sharedRes->proxyState; comm->proxyState->refCount = 1; comm->proxyState->listenSock = sock; comm->proxyState->peerAddresses = peerAddresses; return ncclSuccess; } ncclResult_t ncclProxyCreate(struct ncclComm* comm) { /* proxyState is shared among parent comm and split comms. comm->proxyState->thread is * pthread_join()'d by commFree() in init.cc when the refCount reduces down to 0. */ struct ncclProxyState* proxyState = comm->proxyState; if (proxyState->refCount == 1) { /* we have to make sure all following fields in comm have been initialized. 
*/ proxyState->tpRank = comm->rank; proxyState->tpnRanks = comm->nRanks; proxyState->tpLocalnRanks = comm->localRanks; proxyState->cudaDev = comm->cudaDev; proxyState->abortFlag = comm->abortFlag; proxyState->p2pnChannels = comm->p2pnChannels; proxyState->p2pChunkSize = comm->p2pChunkSize; proxyState->nChannels = comm->nChannels; proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers; proxyState->dmaBufSupport = comm->dmaBufSupport; proxyState->ncclNet = comm->ncclNet; proxyState->ncclCollNet = comm->ncclCollNet; memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes)); pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState); ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev); } return ncclSuccess; } ncclResult_t ncclProxyStop(struct ncclComm* comm) { if (comm->sharedRes && comm->sharedRes->proxyState) { struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (sharedProxyState->peerAddresses) { if (*comm->abortFlag == 0) { struct ncclSocket sock; int type = ncclProxyMsgStop; NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); NCCLCHECK(ncclSocketConnect(&sock)); NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); NCCLCHECK(ncclSocketClose(&sock)); } } if (sharedProxyState->peerSocks) { int tplocalRanks = comm->sharedRes->tpNLocalRanks; for (int i = 0; i < tplocalRanks; i++) { int fd; NCCLCHECK(ncclSocketGetFd(sharedProxyState->peerSocks + i, &fd)); if (fd >= 0) { if (sharedProxyState->proxyOps[i].pool) { NCCLCHECK(ncclShmClose(sharedProxyState->proxyOps[i].handle)); } if (sharedProxyState->sharedDevMems[i]) { if (!ncclCuMemEnable()) { CUDACHECK(cudaIpcCloseMemHandle(sharedProxyState->sharedDevMems[i])); } } int type = ncclProxyMsgClose; if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int))); NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i)); } } } } } return ncclSuccess; } ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState; assert(sharedProxyState->refCount == 0); free(sharedProxyState->peerAddresses); free(sharedProxyState->peerSocks); free(sharedProxyState->proxyOps); free(sharedProxyState->sharedDevMems); expectedProxyResponseFree(sharedProxyState); free(sharedProxyState); return ncclSuccess; } nccl-2.18.3-1/src/transport.cc000066400000000000000000000370741444201535000160250ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "info.h" #include "bootstrap.h" #define ENABLE_TIMER 0 #include "timer.h" struct ncclTransport* ncclTransports[NTRANSPORTS] = { &p2pTransport, &shmTransport, &netTransport, &collNetTransport }; template static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex, int* transportType) { struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; struct ncclConnector* connector = (type == 1) ? 
comm->channels[channelId].peers[peer]->send + connIndex : comm->channels[channelId].peers[peer]->recv + connIndex; for (int t=0; t<NTRANSPORTS; t++) { struct ncclTransport *transport = ncclTransports[t]; struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; int ret = 0; NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); if (ret) { connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); if (transportType) *transportType = t; return ncclSuccess; } } WARN("No transport found for rank %d[%lx] -> rank %d[%lx]", myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); return ncclSystemError; } ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); struct ncclChannel* channel = &comm->channels[channelId]; uint64_t mask = 1UL << channel->id; for (int i=0; i<nrecv; i++) { int peer = peerRecv[i]; if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->recv[connIndex].connected) continue; comm->connectRecv[peer] |= mask; } for (int i=0; i<nsend; i++) { int peer = peerSend[i]; if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer]->send[connIndex].connected) continue; comm->connectSend[peer] |= mask; } return ncclSuccess; } void dumpData(struct ncclConnect* data, int ndata) { for (int n=0; n<ndata; n++) { printf("[%d] ", n); uint8_t* d = (uint8_t*)data; for (int i=0; i<sizeof(struct ncclConnect); i++) printf("%02x", d[i]); printf("\n"); } } ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { // Stream used during transport setup; need for P2P pre-connect + CUDA Graph ncclResult_t ret = ncclSuccess; int highestType = TRANSPORT_P2P; // track highest transport type struct ncclConnect** data = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Store intermediate send/recvData structs for connect struct ncclConnect** recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given recv connection within a channel struct ncclConnect** sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * comm->nRanks); // Points to entries inside data for given send connection within a channel NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); // First time initialization for (int i=1; i<comm->nRanks; i++) { int bootstrapTag = (i<<8) + (graph ?
graph->id+1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; uint64_t recvMask = comm->connectRecv[recvPeer]; uint64_t sendMask = comm->connectSend[sendPeer]; // Data[i] contains all ncclConnect information for all send and receive connections with a given send and recv peer // This data is packed in the array based on the number of sendChannels and recvChannels connected with these peers // The first N entries contain recvData, connection information for recv connections // The next M entries contain sendData, connection information for send connections // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections data[i] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS); recvData[i] = data[i]; int sendChannels = 0, recvChannels = 0; int type; TIME_START(0); for (int c=0; c(comm, graph, recvData[i]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); if (type > highestType) highestType = type; } } TIME_STOP(0); TIME_START(1); sendData[i] = recvData[i]+recvChannels; for (int c=0; c(comm, graph, sendData[i]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); if (type > highestType) highestType = type; } } TIME_STOP(1); TIME_START(2); if (sendPeer == recvPeer) { if (recvChannels+sendChannels) { NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data[i], sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail); sendData[i] = data[i]; recvData[i] = data[i]+sendChannels; } } else { if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail); if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData[i], sizeof(struct ncclConnect)*sendChannels), ret, fail); if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData[i], sizeof(struct ncclConnect)*recvChannels), ret, fail); } TIME_STOP(2); } // Loop until all channels with all ranks have been connected bool allChannelsConnected; allChannelsConnected = false; while (!allChannelsConnected) { allChannelsConnected = true; for (int i=1; inRanks; i++) { int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; uint64_t recvMask = comm->connectRecv[recvPeer]; uint64_t sendMask = comm->connectSend[sendPeer]; int sendDataOffset = 0; int recvDataOffset = 0; for (int c=0; cchannels[c].peers[sendPeer]->send + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { struct ncclDevChannelPeer* addr; conn->connected = 1; /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. 
*/ CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } } } TIME_STOP(3); // Start with recv channels TIME_START(4); if (recvMask & (1UL<channels[c].peers[recvPeer]->recv + connIndex; // This connector hasn't completed connection yet if (conn->connected == 0) { NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail); if (ret == ncclSuccess) { struct ncclDevChannelPeer* addr; conn->connected = 1; /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */ CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail); CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } } } TIME_STOP(4); } } } // Clear all connect masks and free each connectInfo array for (int i=1; inRanks; i++) { int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL; free(data[i]); } free(data); free(sendData); free(recvData); if (highestTransportType != NULL) *highestTransportType = highestType; TIME_PRINT("P2P Setup/Connect"); exit: NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); return ret; fail: goto exit; } extern struct ncclTransport collNetTransport; // All ranks must participate in collNetSetup call // We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type) { int fail = 1; int rank = comm->rank; int nranks = comm->nRanks; int nMasters = comm->nNodes; int rankInCollNet = -1; int isMaster = (rank == masterRank) ? 1 : 0; struct { int collNetRank; ncclConnect connect; } sendrecvExchange; // check if we can connect to collnet, whose root is the nranks-th rank struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks; peerInfo->rank = nranks; // send master receives connect info from peer recv master if (isMaster && type == collNetSend) { NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange))); rankInCollNet = sendrecvExchange.collNetRank; TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer); } // select struct ncclChannelPeer* root = channel->peers[nranks]; // connector index: 0 for recv, 1 for send struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; struct ncclTransportComm* transportComm = (type == collNetRecv) ? 
&(collNetTransport.recv) : &(collNetTransport.send); conn->transportComm = transportComm; // setup struct ncclConnect myConnect; if (isMaster) { NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type)); } // prepare connect handles ncclResult_t res; struct { int isMaster; ncclConnect connect; } *allConnects = NULL; ncclConnect *masterConnects = NULL; NCCLCHECK(ncclCalloc(&masterConnects, nMasters)); if (type == collNetRecv) { // recv side: AllGather // all ranks must participate NCCLCHECK(ncclCalloc(&allConnects, nranks)); allConnects[rank].isMaster = isMaster; memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect)); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup); // consolidate int c = 0; for (int r = 0; r < nranks; r++) { if (allConnects[r].isMaster) { memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect)); if (r == rank) rankInCollNet = c; c++; } } } else { // send side : copy in connect info received from peer recv master if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect)); } // connect if (isMaster) { NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); struct ncclDevChannelPeer* devRoot; CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup); struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type; CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup); } // recv side sends connect info to send side if (isMaster && type == collNetRecv) { sendrecvExchange.collNetRank = rankInCollNet; memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect)); NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); } fail = 0; cleanup: if (allConnects != NULL) free(allConnects); if (masterConnects != NULL) free(masterConnects); return fail; } ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { // AllGather collNet setup results int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0}; allGatherFailures[comm->localRank] = collNetSetupFail; NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int))); for (int i=0; ilocalRanks; i++) { if (allGatherFailures[i] != 0) { collNetSetupFail = 1; break; } } if (collNetSetupFail) { if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); return ncclSystemError; } return ncclSuccess; } ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { // Free collNet resources for (int r=0; rnChannels; r++) { struct ncclChannel* channel = comm->channels+r; struct ncclChannelPeer* peer = channel->peers[comm->nRanks]; if (peer) { if (ncclAtomicRefCountDecrement(&peer->refCount) == 0) { for (int b=0; bsend + b; if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); send->transportResources = NULL; // avoid double free } for (int b=0; brecv + b; if (recv->transportResources 
&& recv->transportComm) NCCLCHECK(recv->transportComm->free(recv)); recv->transportResources = NULL; // avoid double free } } } } return ncclSuccess; } nccl-2.18.3-1/src/transport/000077500000000000000000000000001444201535000155035ustar00rootroot00000000000000nccl-2.18.3-1/src/transport/coll_net.cc000066400000000000000000001145271444201535000176230ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "coll_net.h" #include "graph.h" #include "proxy.h" #include "gdrwrap.h" int64_t ncclParamGdrCopySyncEnable(); int64_t ncclParamGdrCopyFlushEnable(); struct collNetRecvConnectInfo { int rank; int nranks; collNetHandle_t collNetHandle; }; struct collNetSendConnectInfo { void* mhandles[NCCL_NUM_PROTOCOLS]; void* reqFifo; }; #define COLLNET_GROUP_NSUBS 8 #define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) #define NCCL_NET_MAP_HOSTMEM 0 #define NCCL_NET_MAP_DEVMEM 1 #define NCCL_NET_MAP_SHARED_HOSTMEM 2 #define NCCL_NET_MAP_SHARED_DEVMEM 3 #define NCCL_NET_MAP_GDCMEM 4 #define NCCL_NET_MAP_MEMS 5 #define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 #define NCCL_NET_MAP_MASK_SHARED 0x80000000 #define NCCL_NET_MAP_MASK_USED 0x20000000 #define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff #define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ ((mapStruct)->offsets.offsetName >> 30) #define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName >> 29) == 0) #define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) #define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) #define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ if ((shared) == 0) { \ if (dev) { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ } else { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ } \ } else { \ (mapStruct)->offsets.offsetName = bank; \ } \ } while (0); struct connectMapMem{ char* gpuPtr; char* cpuPtr; int size; }; struct connectMap { int shared; // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. struct connectMapMem mems[NCCL_NET_MAP_MEMS]; // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. 
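// [Editor's illustrative sketch, not part of NCCL] How the 32-bit offset words stored
// just below are interpreted by the NCCL_NET_MAP_* macros above: bit 29 marks the slot
// as used, bits 30-31 select the bank (host, device, shared host, shared device), and
// the low 29 bits are the byte offset inside the chosen bank. demoDecodeOffset is a
// hypothetical helper, not part of the real map code.
#if 0
#include <stdint.h>
#include <stdio.h>
static void demoDecodeOffset(uint32_t off) {
  int used   = (off >> 29) & 1;     // NCCL_NET_MAP_MASK_USED
  int bank   =  off >> 30;          // 0..3 -> HOSTMEM, DEVMEM, SHARED_HOSTMEM, SHARED_DEVMEM
  uint32_t o =  off & 0x1fffffff;   // NCCL_NET_MAP_MASK_OFFSET
  printf("used=%d bank=%d offset=0x%x\n", used, bank, o);
}
#endif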
struct { uint32_t sendMem; uint32_t recvMem; uint32_t buffs[NCCL_NUM_PROTOCOLS]; } offsets; }; struct reqSlot { volatile void* recvBuff; volatile int size; }; struct sendResources { struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int rank; int nranks; int netDev; int useGdr; int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; void* sendMhandles[NCCL_NUM_PROTOCOLS]; void* recvMhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; struct reqSlot (*reqFifo)[NCCL_STEPS]; int collNetRank; }; struct recvResources { struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int rank; int nranks; int netDev; int useGdr; int useDmaBuf; int needFlush; uint64_t* gdcSync; uint64_t* gdcFlush; void* gdrDesc; void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS]; int collNetRank; }; static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // This transport cannot be used for p2p *ret = 0; return ncclSuccess; } struct setupReq { int netDev; int useGdr; int needFlush; struct ncclCollNetSharedRes* collNet; }; /* Setup send connector, and return connect information for others in the coll * communicator to connect to me */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct setupReq req = { 0 }; int proxyRank, tpProxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank)); tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn)); ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct setupReq req = { 0 }; int proxyRank, tpProxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; // Determine whether we need to flush the GDR buffer on recv or not if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank)); tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn)); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount); req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, req.useGdr ? "/GDRDMA" : ""); return ncclSuccess; } static ncclResult_t collNetDumpMap(struct connectMap* map) { printf("Dump map\n"); struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_DEVMEM; printf("Mem 1: Vid mem CPU (%x B) %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; printf("Mem 3: Shared Vid (%x B) mem CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); for (int p=0; p Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); } printf("End of dump\n"); return ncclSuccess; } struct collNetConnectArgs { int rank; int nranks; struct ncclConnect* connectInfos; }; static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { // We're on the same process as the proxy. We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct connectMap* map; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); // If collnet connect failed, propagate error to fallback on regular p2p if (map == NULL) return ncclSystemError; //NCCLCHECK(collNetDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; send->conn.head = gdcMem ? 
(uint64_t*)gdcMem : &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); send->conn.tail = &recvMem->tail; send->conn.sizesFifo = recvMem->sizesFifo; for (int i=0; iconn.sizesFifo[i] = -1; send->conn.offsFifo = recvMem->offsFifo; for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); return ncclSuccess; } static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { // We're on the same process as the proxy. We can pass a pointer to a struct. struct collNetConnectArgs args = { rank, nranks, connectInfos }; struct connectMap* map; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); // If collnet connect failed, propagate error to fallback on regular p2p if (map == NULL) return ncclSystemError; //NCCLCHECK(collNetDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); recv->conn.head = &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; recv->conn.offsFifo = recvMem->offsFifo; for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); } return ncclSuccess; } static ncclResult_t sendFree(struct ncclConnector* send) { return ncclSuccess; } static ncclResult_t recvFree(struct ncclConnector* recv) { return ncclSuccess; } static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*)reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; struct sendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; connection->shared = 1; resources->netDev = req->netDev; resources->useGdr = req->useGdr; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); return ncclSuccess; } struct sharedResources { void* collNetListenComms[MAXCHANNELS]; void* collNetComms[MAXCHANNELS]; int commRefCount[NCCL_MAX_NETDEVS]; }; static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, struct ncclCollNetSharedRes* collNet, void* collNetHandle) { struct sharedResources* resources = (struct sharedResources*)collNet->resources; if (resources == NULL) { NCCLCHECK(ncclCalloc(&resources, 1)); collNet->resources = resources; } if (resources->collNetComms[netDev] == NULL) NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev)); return ncclSuccess; } static ncclResult_t sharedConnect(struct ncclProxyState* proxyState, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclCollNetSharedRes* collNet, void** collNetComm) { struct sharedResources* resources = (struct sharedResources*)collNet->resources; if (resources->collNetComms[netDev] == NULL) { // Connect to coll comm collNetHandle_t** handlePtrs = NULL; NCCLCHECK(ncclCalloc(&handlePtrs, nranks)); for (int i = 0; i < nranks; i++) { struct 
collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i); handlePtrs[i] = &(info->collNetHandle); } ncclResult_t ret = proxyState->ncclCollNet->connect((void**)handlePtrs, nranks, rank, resources->collNetListenComms[netDev], resources->collNetComms+netDev); free(handlePtrs); if (ret == ncclSuccess) { // Close listen comm NCCLCHECK(proxyState->ncclCollNet->closeListen(resources->collNetListenComms[netDev])); } else { resources->collNetListenComms[netDev] = NULL; } } *collNetComm = resources->collNetComms[netDev]; if (*collNetComm) resources->commRefCount[netDev]++; return ncclSuccess; } static ncclResult_t sharedFree(struct ncclProxyState* proxyState, struct ncclCollNetSharedRes* collNet, int netDev) { struct sharedResources* resources = (struct sharedResources*)collNet->resources; resources->commRefCount[netDev]--; if (resources->commRefCount[netDev] == 0) { NCCLCHECK(proxyState->ncclCollNet->closeColl(resources->collNetComms[netDev])); } for (int n=0; ncommRefCount[n]) return ncclSuccess; collNet->resources = NULL; free(resources); return ncclSuccess; } static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int cuda, char** gpuPtr, char** cpuPtr, int* size) { if (collNet->size == 0) { collNet->size = 2 * collNet->nChannels * collNet->buffSize; } *size = collNet->size; if (cuda && collNet->cudaBuff == NULL) { NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size)); } if (!cuda && collNet->hostBuff == NULL) { NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size)); } *gpuPtr = *cpuPtr = cuda ? collNet->cudaBuff : collNet->hostBuff; return ncclSuccess; } static ncclResult_t sharedBuffersGet(struct ncclCollNetSharedRes* collNet, int type, int slot, int channel, int* offset) { // Use different pools for different channels and also separate send/recv. int slotSize = collNet->buffSize / NCCL_STEPS; int globalSlot = (type * NCCL_STEPS + slot) * collNet->nChannels + channel; *offset = slotSize * globalSlot; return ncclSuccess; } static ncclResult_t sharedBuffersDestroy(struct ncclCollNetSharedRes* collNet) { if (collNet->size == 0) return ncclSuccess; NCCLCHECK(ncclCudaFree(collNet->cudaBuff)); NCCLCHECK(ncclCudaHostFree(collNet->hostBuff)); // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once. 
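  // Added annotation: collNet->size doubles as the "allocated" flag for this shared pool; the early
  // return above skips the free when nothing was allocated, and resetting it to 0 below makes any
  // further calls no-ops, so the pool is released exactly once.
  // For reference, sharedBuffersGet() carves this pool (size = 2 * nChannels * buffSize) as:
  //   slotSize   = buffSize / NCCL_STEPS
  //   globalSlot = (type*NCCL_STEPS + slot)*nChannels + channel   // type 0 = send side, 1 = recv side
  //   offset     = slotSize * globalSlot
  // Illustrative numbers (assuming NCCL_STEPS == 8): nChannels == 2, type == 1, slot == 3, channel == 1
  // gives globalSlot = (8+3)*2+1 = 23 and offset = 23 * buffSize/8, well within 2*2*buffSize.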
collNet->size = 0; return ncclSuccess; } static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*)reqBuff; if (reqSize != sizeof (struct setupReq)) return ncclInternalError; struct recvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; connection->shared = 1; resources->netDev = req->netDev; resources->useGdr = req->useGdr; resources->needFlush = req->needFlush; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclCollNet->getProperties(req->netDev, &props)); connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; NCCLCHECK(sharedListen(proxyState, req->netDev, req->collNet, netHandle)); return ncclSuccess; } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); struct sendResources* resources = (struct sendResources*)(connection->transportResources); // Get info from recv side resources->collNetRank = args->rank; resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo); for (int p=0; precvMhandles[p] = info->mhandles[p]; NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm)); // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } if (resources->collNetComm == NULL) { *((struct connectMap**)respBuff) = NULL; return ncclSuccess; } connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev; struct connectMap* map = &resources->map; NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); // sendMem->head } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Don't give credits yet in shared mode. resources->sendMem->head = -NCCL_STEPS; // Allocate & Register shared buffers for the Simple protocol int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); } *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; struct recvResources* resources = (struct recvResources*)(connection->transportResources); struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); resources->collNetRank = args->rank; NCCLCHECK(sharedConnect(proxyState, resources->netDev, args->connectInfos, args->nranks, args->rank, connection->collNet, &resources->collNetComm)); // Collnet connect is allowed to fail. Gracefully handle that case by returning NULL to the caller. if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } if (resources->collNetComm == NULL) { *((struct connectMap**)respBuff) = NULL; return ncclSuccess; } connection->proxyAppendPtr = connection->collNet->proxyAppend + 2 * resources->netDev + 1; struct connectMap* map = &resources->map; NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); } if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Allocate & Register shared buffers for the Simple protocol int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &resources->mhandles[NCCL_PROTO_SIMPLE])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclCollNet->regMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE])); } // Pass info to send side info->reqFifo = resources->reqFifo; for (int p=0; pmhandles[p] = resources->mhandles[p]; if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); if (resources) { for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) { if (resources->sendMhandles[p]) { NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->sendMhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); NCCLCHECK(sharedBuffersDestroy(connection->collNet)); NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev)); if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet); free(connection->transportResources); } return ncclSuccess; } static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct recvResources* resources = (struct recvResources*)(connection->transportResources); if (resources) { for (int p=0; pmhandles[p]) { NCCLCHECK(proxyState->ncclCollNet->deregMr(resources->collNetComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); NCCLCHECK(sharedBuffersDestroy(connection->collNet)); NCCLCHECK(sharedFree(proxyState, connection->collNet, resources->netDev)); if (ncclAtomicRefCountDecrement(&connection->collNet->refCount) == 0) free(connection->collNet); free(connection->transportResources); } return ncclSuccess; } #define LAST_OF_GROUP(s) \ (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1) static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct sendResources* resources = (struct 
sendResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); void* sendMhandle = resources->sendMhandles[p]; void* recvMhandle = resources->recvMhandles[p]; auto reqFifo = resources->reqFifo; if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; int sharedBuffSlot = sub->posted%NCCL_STEPS; int offset; NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset)); resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize; __sync_synchronize(); volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } // Enforce sync between operations of the same group. bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received)); if (groupSync && sub->received < sub->posted && sub->received < sub->done + perGroupSteps) { int buffSlot = (sub->base+sub->received)%NCCL_STEPS; int sharedBuffSlot = sub->received%NCCL_STEPS; volatile int* sizesFifo = resources->recvMem->sizesFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]); if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) { // We have something to receive, let's check whether data is ready. int ready = 1; if (s == 0) { int offset; NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset)); args->sharedBuff[sharedBuffSlot] = localBuff + offset; args->sharedSize[sharedBuffSlot] = args->chunkSize; } if (ready) { sizesFifo[buffSlot] = -1; sub->received += args->sliceSteps; args->idle = 0; //continue; } } } if (LAST_OF_GROUP(s) && (sub->transmitted < sub->received)) { int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; int sharedBuffSlot = sub->transmitted%NCCL_STEPS; if (reqFifo[group][buffSlot].recvBuff != NULL) { int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot]; int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype); reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot]; char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot]; NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] == NULL) continue; TRACE(NCCL_NET, "sendProxy [%d/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]); // Make sure size is reset to zero before we update the head. 
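      // Added annotation (summary of the reqFifo handshake between the two proxy progress loops):
      //   recv proxy : reqFifo[group][slot].recvBuff = <shared recv buffer>   (slot ready to receive)
      //   send proxy : sees recvBuff != NULL, records .size, posts iallreduce() into recvBuff (above)
      //   send proxy : once test() reports completion, sets recvBuff = NULL   (result ready)
      //   recv proxy : sees recvBuff == NULL, reads .size, flushes if needed and hands the data on
      // The __sync_synchronize() barriers on these paths keep the stores ordered, e.g. .size must be
      // visible before recvBuff flips back to NULL (see the completion path below).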
__sync_synchronize(); sub->transmitted += args->sliceSteps; args->idle = 0; continue; } } // Check whether the network has completed some send operations. if (LAST_OF_GROUP(s) && sub->done < sub->transmitted) { int done, size; int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size)); if (done) { TRACE(NCCL_NET, "sendProxy [%d/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size); // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) // (reordered store after store is possible on POWER, though not on x86) __sync_synchronize(); reqFifo[group][buffSlot].recvBuff = NULL; // Notify recvProxy for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].done += args->sliceSteps; args->idle = 0; int allDone = 1; for (int i=0; insubs; i++) { if (args->subs[i].done < args->subs[i].nsteps) { allDone = 0; break; } } if (allDone) { args->state = ncclProxyOpNone; TRACE(NCCL_NET, "sendProxy [%d/%d] stopped", sub->done, s); } } } } } return ncclSuccess; } static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); void* mhandle = resources->mhandles[p]; auto reqFifo = resources->reqFifo; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Enforce sync between operations of the same group. 
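      // Added annotation: subs are handled in groups of COLLNET_GROUP_NSUBS (8); only the last sub
      // of each group, selected by LAST_OF_GROUP(s), drives the network operations below on behalf
      // of the whole group. Illustrative example: nsubs == 12 gives nGroups = DIVUP(12,8) = 2 groups,
      // {0..7} and {8..11}; LAST_OF_GROUP(s) is true for s == 7 and s == 11, and each group may keep
      // perGroupSteps = NCCL_STEPS/2 steps in flight in the shared buffer space.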
if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) { int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; int sharedBuffSlot = sub->posted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; int offset; NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset)); reqFifo[group][buffSlot].recvBuff = localBuff + offset; TRACE(NCCL_NET, "recvProxy [%d/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff); sub->posted += args->sliceSteps; args->idle = 0; continue; } if (LAST_OF_GROUP(s) && (sub->posted > sub->received)) { int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->received)%NCCL_STEPS; int sharedBuffSlot = sub->received%NCCL_STEPS; if (reqFifo[group][buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete args->sharedSize[sharedBuffSlot] = reqFifo[group][buffSlot].size; int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1); TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize); sub->received += args->sliceSteps; sub->requests[buffSlot] = NULL; if (reqFifo[group][buffSlot].size > 0 && resources->useGdr && resources->needFlush) { // GDRCOPY support if (resources->gdcFlush) { #if defined (__x86_64__) // Force a PCI-E read from GPU memory asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); #else WARN("NET: GDR Flush only supported on x86_64"); return ncclInternalError; #endif sub->requests[buffSlot] = NULL; } else { int startChannel = group*COLLNET_GROUP_NSUBS; int offset; NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset)); NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); } } else { for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; } args->idle = 0; continue; } } if (LAST_OF_GROUP(s) && (sub->received > sub->flushed)) { // Progress flush operations int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS; int done = 1; if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL)); if (done) { TRACE(NCCL_NET, "recvProxy [%d/%d/%d] flushed", sub->flushed, group, buffSlot); for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; args->idle = 0; //continue; } } if (sub->flushed > sub->transmitted) { int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS; int sharedBuffSlot = sub->transmitted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; int offset; NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset)); volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo; offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize; __sync_synchronize(); volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; *recvTail = sub->base + sub->flushed; if (resources->gdcSync) wc_store_fence(); // Flush out WC write sub->transmitted += args->sliceSteps; args->idle = 0; continue; } // Enforce sync here to make sure the last sub doesn't increase "done" before all others in the group have // reached the same point, otherwise we would start posting buffers to the send proxy before we're done // processing all the shared buffer. bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->done == sub->done)) || (s && (sub-1)->done > sub->done)); volatile uint64_t* sendHead = &resources->sendMem->head; if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) { sub->done += args->sliceSteps; args->idle = 0; if (sub->done == sub->nsteps && s == args->nsubs-1) { args->state = ncclProxyOpNone; TRACE(NCCL_NET, "recvProxy [%d/%d] stopped", sub->done, s); } } } } return ncclSuccess; } struct ncclTransport collNetTransport = { "COL", canConnect, { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress } }; nccl-2.18.3-1/src/transport/net.cc000066400000000000000000001570031444201535000166060ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "net.h" #include "graph.h" #include "proxy.h" #include "collectives.h" #include "gdrwrap.h" #include "shm.h" #include "p2p.h" #include "profiler.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); #define NCCL_NET_MAP_HOSTMEM 0 #define NCCL_NET_MAP_DEVMEM 1 #define NCCL_NET_MAP_SHARED_HOSTMEM 2 #define NCCL_NET_MAP_SHARED_DEVMEM 3 #define NCCL_NET_MAP_GDCMEM 4 #define NCCL_NET_MAP_MEMS 5 #define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 #define NCCL_NET_MAP_MASK_SHARED 0x80000000 #define NCCL_NET_MAP_MASK_USED 0x20000000 #define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff #define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ ((mapStruct)->offsets.offsetName >> 30) #define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName >> 29) == 0) #define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? 
NULL : \ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) #define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) #define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ if ((shared) == 0) { \ if (dev) { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ } else { \ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ } \ } else { \ (mapStruct)->offsets.offsetName = bank; \ } \ } while (0); struct connectMapMem{ char* gpuPtr; char* cpuPtr; int size; ncclIpcDesc ipcDesc; char shmPath[PATH_MAX]; ncclShmHandle_t attachHandle; ncclShmHandle_t createHandle; }; struct connectMap { int sameProcess; int shared; int cudaDev; // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. struct connectMapMem mems[NCCL_NET_MAP_MEMS]; // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. struct { uint32_t sendMem; uint32_t recvMem; uint32_t buffs[NCCL_NUM_PROTOCOLS]; } offsets; }; struct sendResources { struct connectMap map; void* netSendComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int tpRank; int tpLocalRank; int tpRemoteRank; int netDev; int useGdr; int useDmaBuf; int maxRecvs; uint64_t* gdcSync; void* gdrDesc; int shared; int channelId; int connIndex; char* buffers[NCCL_NUM_PROTOCOLS]; int buffSizes[NCCL_NUM_PROTOCOLS]; void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; }; struct recvResources { struct connectMap map; void* netListenComm; void* netRecvComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; int tpRank; int tpLocalRank; int tpRemoteRank; int tpRemoteProxyRank; int netDev; int useGdr; int useDmaBuf; int needFlush; int maxRecvs; uint64_t* gdcSync; uint64_t* gdcFlush; void* gdrDesc; int shared; int channelId; int connIndex; char* buffers[NCCL_NUM_PROTOCOLS]; int buffSizes[NCCL_NUM_PROTOCOLS]; void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; }; /* Determine if two peers can communicate with NET */ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; if (info1->hostHash == info2->hostHash) { // If on the same host, check intra-node net is not disabled. NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret)); } return ncclSuccess; } NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1); struct setupReq { int tpRank; int tpLocalRank; int tpRemoteRank; int shared; int netDev; int useGdr; int needFlush; int channelId; int connIndex; }; /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct setupReq req = { 0 }; int localRank, tpProxyRank; send->conn.shared = req.shared = graph ? 
0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; req.channelId = channelId; req.connIndex = connIndex; int proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; tpProxyRank = comm->topParentRanks[proxyRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank)); req.tpLocalRank = comm->topParentLocalRanks[localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } else { INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } *((int*)connectInfo) = tpProxyRank; return ncclSuccess; } // GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory NCCL_PARAM(GdrCopySyncEnable, "GDRCOPY_SYNC_ENABLE", 1); // GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); /* Setup recv connector */ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct setupReq req = { 0 }; int localRank; recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; req.channelId = channelId; req.connIndex = connIndex; // Use myInfo->rank as the receiver uses its own NIC int proxyRank, tpProxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); // Determine whether we need to flush the GDR buffer on recv or not if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); // We don't support PXN on receive yet tpProxyRank = comm->topParentRanks[myInfo->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn)); NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank)); req.tpLocalRank = comm->topParentLocalRanks[localRank]; req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); return ncclSuccess; } static ncclResult_t netMapShm(struct connectMapMem* mem) { NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle)); return ncclSuccess; } static ncclResult_t netCreateShm(struct connectMapMem* mem) { mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->createHandle)); return ncclSuccess; } static ncclResult_t netDumpMap(struct connectMap* map) { printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared); struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_DEVMEM; printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); for (int p=0; p Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 
1 : 0, NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); } printf("End of dump\n"); return ncclSuccess; } static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct connectMap* map = (connectMap*) send->transportResources; void* opId; // map isn't allocated thus this op hasn't been submitted yet if (!map) { // Setup device pointers NCCLCHECK(ncclCalloc(&map, 1)); send->transportResources = map; opId = send; INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId)); } else { opId = send; } ncclResult_t ret; NCCLCHECK(ret = ncclPollProxyResponse(comm, &send->proxyConn, map, opId)); if (ret == ncclInProgress) { return ret; } INFO(NCCL_PROXY, "sendConnect ncclPollProxyResponse opId=%p", opId); if (map->sameProcess && !ncclCuMemEnable()) { if (map->cudaDev != comm->cudaDev) { if (!ncclCuMemEnable()) { // Enable P2P access for Legacy IPC cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err)); return ncclInternalError; } } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL; } if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) { void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank; if (*sharedDevMemPtr == NULL) { NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size, &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc, sharedDevMemPtr)); } map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr); map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL; } } //NCCLCHECK(netDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); send->conn.tail = &recvMem->tail; send->conn.sizesFifo = recvMem->sizesFifo; // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree send->conn.offsFifo = map->shared ? 
recvMem->offsFifo : NULL; for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); return ncclSuccess; } /* Connect to this peer */ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct connectMap* map = (connectMap*) recv->transportResources; void* opId; if (!map) { NCCLCHECK(ncclCalloc(&map, 1)); recv->transportResources = map; // Use recv connector as unique identifier opId = recv; INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p", opId, &recv->proxyConn, connectInfo); NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId)); } else { opId = recv; } ncclResult_t ret; NCCLCHECK(ret = ncclPollProxyResponse(comm, &recv->proxyConn, map, opId)); if (ret == ncclInProgress) { return ret; } INFO(NCCL_PROXY, "recvConnect ncclPollProxyResponse opId=%p", opId); //NCCLCHECK(netDumpMap(map)); struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); recv->conn.head = &sendMem->head; struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; recv->conn.sizesFifo = recvMem->sizesFifo; // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree recv->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL; for (int p=0; pconn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); return ncclSuccess; } static ncclResult_t sendFree(struct ncclConnector* send) { struct connectMap* map = (struct connectMap*)(send->transportResources); if (map) { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); if (map->sameProcess && map->cudaDev == cudaDev) { // Our own GPU, so it wasn't mapped in free(map); return ncclSuccess; } if (!map->sameProcess || ncclCuMemEnable()) { if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (ncclCuMemEnable()) { // cuMem API support NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { // Legacy CUDA IPC support CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } } } free(map); } return ncclSuccess; } static ncclResult_t recvFree(struct ncclConnector* recv) { if (recv->transportResources) free(recv->transportResources); return ncclSuccess; } #define NCCL_SHARED_STEPS 16 static ncclResult_t sharedBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess, int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) { if (cuda == 0 && sameProcess == 0) { WARN("PXN should not use host buffers for data"); return ncclInternalError; } struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; if (localPeers[tpLocalRank] == NULL) { NCCLCHECK(ncclCalloc(localPeers + tpLocalRank, 1)); } struct ncclProxyPeer* peer = localPeers[tpLocalRank]; struct ncclProxySharedP2p* state = type == 0 ? 
&peer->send : &peer->recv; state->refcount++; if (state->size == 0) { state->size = nChannels * NCCL_SHARED_STEPS * proxyState->p2pChunkSize; } if (size) *size = state->size; if (cuda && state->cudaBuff == NULL) { if (sameProcess == 0 || ncclCuMemEnable()) { NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff)); } else { NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size)); } } if (!cuda && state->hostBuff == NULL) { NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size)); } if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL; if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc)); return ncclSuccess; } static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) { // Use different pools for different channels and also separate send/recv. int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; *offset = proxyState->p2pChunkSize * globalSlot; return ncclSuccess; } static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) { if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank]; if (peer == NULL) NCCLCHECK(ncclInternalError;) struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv; if (state->size == 0) NCCLCHECK(ncclInternalError); if (ncclAtomicRefCountDecrement(&state->refcount) == 0) { if (state->cudaBuff) { if (!connection->sameProcess || ncclCuMemEnable()) { NCCLCHECK(ncclP2pFreeShareableBuffer(&state->ipcDesc)); } NCCLCHECK(ncclCudaFree(state->cudaBuff)); } if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff)); } if (peer->send.refcount || peer->recv.refcount) return ncclSuccess; free(peer); proxyState->progressState.localPeers[tpLocalRank] = NULL; for (int r = 0; r < proxyState->tpLocalnRanks; r++) { if (proxyState->progressState.localPeers[r]) return ncclSuccess; } // All peers are freed, free array free(proxyState->progressState.localPeers); proxyState->progressState.localPeers = NULL; return ncclSuccess; } static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) { NCCLCHECK(sharedBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL)); return ncclSuccess; } static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; struct sendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; resources->tpRank = req->tpRank; resources->tpLocalRank = req->tpLocalRank; resources->tpRemoteRank = req->tpRemoteRank; resources->netDev = req->netDev; resources->shared = connection->shared = req->shared; resources->useGdr = req->useGdr; resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; // We don't return any data if 
(respSize != 0) return ncclInternalError; *done = 1; return ncclSuccess; } static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct setupReq* req = (struct setupReq*) reqBuff; if (reqSize != sizeof(struct setupReq)) return ncclInternalError; struct recvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); connection->transportResources = resources; resources->tpRank = req->tpRank; resources->tpLocalRank = req->tpLocalRank; resources->tpRemoteRank = req->tpRemoteRank; resources->netDev = req->netDev; resources->shared = connection->shared = req->shared; resources->useGdr = req->useGdr; resources->needFlush = req->needFlush; resources->channelId = req->channelId; resources->connIndex = req->connIndex; ncclNetProperties_t props; NCCLCHECK(proxyState->ncclNet->getProperties(req->netDev, &props)); /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); resources->maxRecvs = props.maxRecvs; if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; ncclResult_t ret = ncclSuccess; if (resources->shared) { // Shared buffers struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; if (localPeers[resources->tpLocalRank] == NULL) { NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1)); } connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->send.proxyAppend + resources->channelId; if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // Connect or reuse connection for a netdev/remote rank. 
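    // Added annotation: with NCCL_NET_SHARED_COMMS enabled and a network that advertises
    // maxRecvs > 1, send comms are cached per (netDev, remote rank, channel) in
    // progressState->netComms[netDev][tpRemoteRank].sendComm[channelId] and reference-counted
    // (sendRefCount), so connectors targeting the same NIC/peer/channel share one network
    // connection instead of each opening its own.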
if (progressState->netComms[resources->netDev] == NULL) { NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, comms->sendComm + resources->channelId); resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm); } } else { // Connect to remote peer ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm); connection->proxyAppendPtr = &connection->proxyAppend; } NCCLCHECK(ret); if (resources->netSendComm == NULL) { *done = 0; return ncclInProgress; } *done = 1; // Create structures struct connectMap* map = &resources->map; map->sameProcess = connection->sameProcess; map->shared = resources->shared; CUDACHECK(cudaGetDevice(&map->cudaDev)); if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { // Get shared buffers int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit( proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; if (proxyState->allocP2pNetLLBuffers) { NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*p == NCCL_PROTO_LL*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { if (!map->sameProcess || ncclCuMemEnable()) { ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN); NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); } map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } } if (map->sameProcess) { NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; } else { NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM)); } if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); // sendMem->head } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = 
(struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Don't give credits yet in shared mode. resources->sendMem->head = map->shared ? -NCCL_STEPS : 0; for (int i=0; irecvMem->sizesFifo[i] = -1; for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } } } //NCCLCHECK(netDumpMap(map)); if (respSize != sizeof(struct connectMap)) return ncclInternalError; memcpy(respBuff, map, sizeof(struct connectMap)); return ncclSuccess; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (reqSize != sizeof(int)) return ncclInternalError; struct recvResources* resources = (struct recvResources*)(connection->transportResources); resources->tpRemoteProxyRank = *(int*)reqBuff; ncclResult_t ret = ncclSuccess; // Finish connection establishment from remote peer if (resources->shared) { // Shared buffers struct ncclProxyProgressState* progressState = &proxyState->progressState; if (progressState->localPeers == NULL) { NCCLCHECK(ncclCalloc(&progressState->localPeers, proxyState->tpLocalnRanks)); } struct ncclProxyPeer** localPeers = progressState->localPeers; if (localPeers[resources->tpLocalRank] == NULL) { NCCLCHECK(ncclCalloc(localPeers + resources->tpLocalRank, 1)); } connection->proxyAppendPtr = localPeers[resources->tpLocalRank]->recv.proxyAppend + resources->channelId; if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { // Connect or reuse connection for a netdev/remote rank. 
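    // Added annotation: the receive side mirrors sendProxyConnect but keys the shared comm cache
    // on tpRemoteProxyRank (the sender's proxy rank passed in via reqBuff) and obtains the comm
    // with accept() on the listen comm created in recvProxySetup. accept() returning a NULL comm
    // is treated as "not connected yet": the NULL check below returns ncclInProgress so the proxy
    // retries instead of blocking.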
if (progressState->netComms[resources->netDev] == NULL) { NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank; if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId); resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm); } } else { // Connect to remote peer ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm); connection->proxyAppendPtr = &connection->proxyAppend; } NCCLCHECK(ret); if (resources->netRecvComm == NULL) { *done = 0; return ncclInProgress; } *done = 1; NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm)); // Create structures struct connectMap* map = &resources->map; map->sameProcess = connection->sameProcess; if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv map->shared = resources->shared; if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p for (int p=0; puseGdr, proxyState->buffSizes[p], buffs[p]); resources->buffSizes[p] = proxyState->buffSizes[p]; } } else { // Get shared buffers int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit( proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); if (proxyState->allocP2pNetLLBuffers) { NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]); resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL]; } if (map->mems[NCCL_NET_MAP_DEVMEM].size) { if (resources->shared == 0) { if (ncclCuMemEnable()) { NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc, (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } else { NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); } map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; } } NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; if (ncclGdrCopy && map->sameProcess) { uint64_t *cpuPtr, *gpuPtr; NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); if (ncclParamGdrCopySyncEnable()) { resources->gdcSync = cpuPtr; struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; gdcMem->cpuPtr = (char*)cpuPtr; gdcMem->gpuPtr = (char*)gpuPtr; gdcMem->size = sizeof(uint64_t); } if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; } resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) 
NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { #if CUDA_VERSION >= 11070 /* DMA-BUF support */ int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST; if (type == NCCL_PTR_CUDA && resources->useDmaBuf) { int dmabuf_fd; CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p])); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif { NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); } } } //NCCLCHECK(netDumpMap(map)); if (respSize != sizeof(struct connectMap)) return ncclInternalError; memcpy(respBuff, map, sizeof(struct connectMap)); return ncclSuccess; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct sendResources* resources = (struct sendResources*)(connection->transportResources); if (connection->state == connSharedInitialized) { // NVB Preconnect NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection)); return ncclSuccess; } if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; if (resources->map.sameProcess) { NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); } else { NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle)); } NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (!resources->map.sameProcess || ncclCuMemEnable()) { // cuMem API support if (mems[NCCL_NET_MAP_DEVMEM].size) { NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); } } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank; comms->sendRefCount[resources->channelId]--; if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeSend(comms->sendComm[resources->channelId])); } else { NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); } } else { NCCLCHECK(proxyState->ncclNet->closeSend(resources->netSendComm)); } } if (resources) free(resources); return ncclSuccess; } static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { struct recvResources* resources = (struct recvResources*)(connection->transportResources); if (connection->state == connSharedInitialized) { // NVB Preconnect NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection)); return ncclSuccess; } if (connection->state == connConnected) { for (int p=0; pbuffers[p]) { NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, resources->mhandles[p])); } } struct connectMapMem* mems = resources->map.mems; 
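    /* Teardown mirrors setup: the registered MRs were deregistered above; the
     * code below frees the mapped CUDA host and device memory, the optional
     * GDRCopy allocation and the shared buffers, then drops the reference on
     * the (possibly shared) network recv comm, closing it once its per-channel
     * refcount reaches zero. */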
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); if (!resources->map.sameProcess || ncclCuMemEnable()) { // cuMem API support if (mems[NCCL_NET_MAP_DEVMEM].size) { NCCLCHECK(ncclP2pFreeShareableBuffer(&mems[NCCL_NET_MAP_DEVMEM].ipcDesc)); } } if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); if (resources->shared) { NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection)); if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank; comms->recvRefCount[resources->channelId]--; if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(proxyState->ncclNet->closeRecv(comms->recvComm[resources->channelId])); } else { NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); } } else { NCCLCHECK(proxyState->ncclNet->closeRecv(resources->netRecvComm)); } } if (resources) free(resources); return ncclSuccess; } static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; for (uint64_t step=0; stepnsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; if (sub->done == sub->nsteps) continue; struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); void* mhandle = resources->mhandles[p]; int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSize = stepSize*args->sliceSteps; if (sub->nbytes < buffSize) buffSize = sub->nbytes; // Post buffers to the GPU if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { int sharedBuffSlot = sub->posted%maxDepth; int offset; NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset)); resources->recvMem->offsFifo[buffSlot] = offset; __sync_synchronize(); volatile uint64_t* sendHead = resources->gdcSync ? 
resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } else sub->posted += args->sliceSteps; for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) { ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait); } args->idle = 0; continue; } // Check whether we received data from the GPU and send it to the network if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; volatile int* sizesFifo = resources->recvMem->sizesFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. int size = sizesFifo[buffSlot]; bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; int ready = 1; if (p == NCCL_PROTO_LL128) { ready = resources->useGdr; if (!ready) { // When data is in sysmem, we need to wait until all flags are correct since the GPU only // called threadfence() uint64_t flag = sub->base+sub->transmitted+1; int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS); volatile uint64_t* lines = (volatile uint64_t*)buff; ready = 1; for (int i=0; ibase+sub->transmitted+1); int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff; for (int i=0; incclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); sizesFifo[buffSlot] = -1; // Make sure size is reset to zero before we update the head. __sync_synchronize(); sub->transmitted += args->sliceSteps; for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait); args->idle = 0; continue; } } } } // Check whether the network has completed some send operations. if (sub->done < sub->transmitted) { int done; int buffSlot = (sub->base+sub->done)%NCCL_STEPS; NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL)); if (done) { TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); sub->done += args->sliceSteps; for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd); if (resources->shared == 0) { volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; *sendHead = sub->base + sub->done; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } args->idle = 0; if (sub->done == sub->nsteps) { resources->step = sub->base + sub->nsteps; args->done++; } } } } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; } } return ncclSuccess; } static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { // Initialize subs and group them by same recvComm. 
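    /* The loop below reorders args->subs so that subs sharing the same
     * netRecvComm become contiguous, and records each group's size so the
     * progress loops further down can stride by it
     * (for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize)).
     * This lets one irecv() carry up to maxRecvs buffers for a whole group
     * instead of one network request per sub. Roughly:
     *
     *   before: [A, B, A, C]      (letters = underlying netRecvComm)
     *   after:  [A, A, B, C]      first A has group size 2
     *
     * Group size is capped by the maxRecvs value reported by the plugin. */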
void* recvComm; int groupSize = 0; int maxRecvs = 1; for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; if (groupSize == maxRecvs) { groupSize = 0; } else if (s>0) { // Find next sub with the same recvComm int next; for (next=s; nextnsubs; next++) { struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources); if (nextRes->netRecvComm == recvComm) break; } if (next == args->nsubs) { // Not found groupSize = 0; } else if (s != next) { // We found a sub later with the same recvComm ; swap subs struct ncclProxySubArgs temp; memcpy(&temp, sub, sizeof(struct ncclProxySubArgs)); memcpy(sub, args->subs+next, sizeof(struct ncclProxySubArgs)); memcpy(args->subs+next, &temp, sizeof(struct ncclProxySubArgs)); } } groupSize++; struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); maxRecvs = resources->maxRecvs; recvComm = resources->netRecvComm; // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; for (int i=0; insteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; int subCount = 0; void* ptrs[NCCL_PROXY_MAX_SUBS]; int sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (sub->posted < sub->nsteps) { if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (p == NCCL_PROTO_SIMPLE && resources->shared) { int sharedBuffSlot = sub->posted%maxDepth; int offset; NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset)); volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo; offsFifo[buffSlot] = offset; ptrs[subCount] = localBuff+offset; } else { ptrs[subCount] = localBuff+buffSlot*stepSize; } sizes[subCount] = stepSize*args->sliceSteps; if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = resources->mhandles[p]; subCount++; } } if (subCount) { uint64_t step = subGroup->posted; struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; sub->posted += args->sliceSteps; for (uint64_t step=sub->posted-args->sliceSteps; stepposted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait); } args->idle = 0; } } } if (args->idle == 0) return ncclSuccess; for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; if (subGroup->posted > subGroup->received) { uint64_t step = subGroup->received; int done; 
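        /* Poll the grouped irecv request. Once it completes, every sub in the
         * group advances to "received"; if any buffer landed in GPU memory via
         * GPUDirect RDMA, the data is flushed before the tail is advanced,
         * either with a GDRCopy-based PCIe read (gdcFlush) or with the
         * plugin's iflush() on the received buffers. */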
void* ptrs[NCCL_PROXY_MAX_SUBS]; int sizes[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; incclNet->test(subGroup->requests[step%NCCL_STEPS], &done, sizes)); if (done) { int needFlush = 0; int totalSize = 0; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; sub->received += args->sliceSteps; for (uint64_t step=sub->received-args->sliceSteps; stepreceived; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); if (step < sub->nsteps) { struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); if (resources->useGdr) needFlush |= resources->needFlush; } } subGroup->requests[step%NCCL_STEPS] = NULL; if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && needFlush) { // GDRCOPY support struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); if (resources->gdcFlush) { #if defined (__x86_64__) // Force a PCI-E read from GPU memory asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); #else WARN("NET: GDR Flush only supported on x86_64"); return ncclInternalError; #endif } else { int subCount = 0; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (step < sub->nsteps) { struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; mhandles[subCount] = resources->mhandles[p]; subCount++; } } struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } } args->idle = 0; } } } if (args->idle == 0) return ncclSuccess; for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; if (subGroup->received > subGroup->transmitted) { uint64_t step = subGroup->transmitted; int done = 1; void* request = subGroup->requests[step%NCCL_STEPS]; if (request) NCCLCHECK(proxyState->ncclNet->test(request, &done, NULL)); if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; sub->transmitted += args->sliceSteps; for (uint64_t step=sub->transmitted-args->sliceSteps; steptransmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; *recvTail = sub->base + sub->transmitted; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } } args->idle = 0; } } } if (args->idle == 0) return ncclSuccess; for (int s=0; snsubs; s+=args->subs[s].groupSize) { struct ncclProxySubArgs* subGroup = args->subs+s; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; if (sub->done == sub->nsteps) continue; if (sub->transmitted > sub->done) { struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); volatile uint64_t* sendHead = &resources->sendMem->head; uint64_t done = *sendHead; while (done > sub->base + sub->done && // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. sub->transmitted > sub->done) { sub->done += args->sliceSteps; for (uint64_t step=sub->done-args->sliceSteps; stepdone; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd); args->idle = 0; if (sub->done == sub->nsteps) { struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); resources->step = sub->base + sub->nsteps; args->done++; break; } } } } } if (args->done == args->nsubs) { args->state = ncclProxyOpNone; } } return ncclSuccess; } struct ncclTransport netTransport = { "NET", canConnect, { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress }, { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress } }; nccl-2.18.3-1/src/transport/net_ib.cc000066400000000000000000001423711444201535000172620ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "core.h" #include "socket.h" #include "net.h" #include "graph.h" #include "utils.h" #include "param.h" #include #include #include #include #include #include #include #include #define ENABLE_TIMER 0 #include "timer.h" #include "ibvwrap.h" #define MAXNAMESIZE 64 static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; struct ncclIbMr { uintptr_t addr; int pages; int refs; ibv_mr *mr; }; struct ncclIbMrCache { struct ncclIbMr *slots; int capacity, population; }; static int ncclNIbDevs = -1; struct alignas(64) ncclIbDev { pthread_mutex_t lock; int device; uint64_t guid; uint8_t port; uint8_t link; int speed; ibv_context* context; int pdRefs; ibv_pd* pd; char devName[MAXNAMESIZE]; char* pciPath; int realPort; int maxQp; struct ncclIbMrCache mrCache; int ar; // ADAPTIVE_ROUTING }; #define MAX_IB_PORT 15 struct userIbDev { char devName[MAXNAMESIZE]; uint16_t port_en; }; #define MAX_IB_DEVS 16 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; struct userIbDev userIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0); NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); NCCL_PARAM(IbSl, "IB_SL", 0); NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { struct ibv_context* context = (struct ibv_context*)args; while (1) { struct ibv_async_event event; if (ncclSuccess != wrap_ibv_get_async_event(context, &event)) { break; } char *str; if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; } if (event.event_type != IBV_EVENT_COMM_EST) WARN("NET/IB : Got async event : %s", str); if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; } } return NULL; } NCCL_PARAM(IbDisable, "IB_DISABLE", 0); NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1); static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) { char devicePath[PATH_MAX]; snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName); char* p = realpath(devicePath, NULL); if (p == NULL) { WARN("Could not find real path of %s (%s)", devName, devicePath); } else { // Merge multi-port NICs into the same PCI device p[strlen(p)-1] = '0'; // Also merge virtual functions (VF) into the same device if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; // And keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dname = ncclIbDevs[dev].devName; props->pciPath = ncclIbDevs[dev].pciPath; props->guid = ncclIbDevs[dev].guid; props->ptrSupport = NCCL_PTR_HOST; if (ncclIbGdrSupport(dev) == ncclSuccess) { props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem } if (ncclIbDmaBufSupport(dev) == ncclSuccess) { props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF } props->speed = ncclIbDevs[dev].speed; props->latency = 0; // Not set props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort; props->maxComms = ncclIbDevs[dev].maxQp; props->maxRecvs = NCCL_NET_IB_MAX_RECVS; return ncclSuccess; } // We need to support 
NCCL_NET_MAX_REQUESTS for each concurrent receive #define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS) static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion"); #define NCCL_IB_MAX_QPS 128 struct ncclIbQpInfo { uint32_t lid; uint8_t ib_port; uint8_t link_layer; uint32_t qpn[NCCL_IB_MAX_QPS]; // For RoCE uint64_t spn; uint64_t iid; enum ibv_mtu mtu; // FIFO RDMA info uint32_t fifoRkey; uint64_t fifoAddr; }; enum ncclIbCommState { ncclIbCommStateStart = 0, ncclIbCommStateConnect = 1, ncclIbCommStateAccept = 3, ncclIbCommStateSend = 4, ncclIbCommStateRecv = 5, ncclIbCommStateConnecting = 6, ncclIbCommStateConnected = 7, ncclIbCommStatePendingReady = 8, }; struct ncclIbCommStage { enum ncclIbCommState state; int offset; void* buffer; void* comm; }; struct ncclIbHandle { union ncclSocketAddress connectAddr; // Filled by the target uint64_t magic; // random number to help debugging struct ncclIbCommStage stage; // Used by the other side when connecting }; // Retain local and remote RoCE addresses for error logging struct ncclIbGidInfo { uint8_t link_layer; union ibv_gid localGid; union ibv_gid remoteGid; }; #define NCCL_NET_IB_REQ_UNUSED 0 #define NCCL_NET_IB_REQ_SEND 1 #define NCCL_NET_IB_REQ_RECV 2 #define NCCL_NET_IB_REQ_FLUSH 3 const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" }; struct ncclIbRequest { struct ncclIbVerbs* verbs; int type; int events; struct ncclSocket* sock; struct ncclIbGidInfo* gidInfo; int nreqs; union { struct { int size; void* data; uint32_t lkey; int offset; } send; struct { int sizes[NCCL_NET_IB_MAX_RECVS]; } recv; }; }; struct ncclIbVerbs { int dev; struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd struct ibv_cq* cq; uint64_t pad[1]; struct ncclIbRequest reqs[MAX_REQUESTS]; }; struct ncclIbListenComm { int dev; struct ncclSocket sock; struct ncclIbCommStage stage; }; struct ncclIbSendFifo { uint64_t addr; int size; uint32_t rkey; uint32_t nreqs; uint32_t tag; uint64_t idx; }; struct ncclIbSendComm { struct ncclIbVerbs verbs; struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; uint64_t fifoHead; struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1]; struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; struct ncclSocket sock; int ready; struct ibv_qp* qps[NCCL_IB_MAX_QPS]; int nqps; int qpIndex; struct ibv_mr* fifoMr; int ar; struct ncclIbGidInfo gidInfo; }; // The SendFifo needs to be 32-byte aligned and each element needs // to be a 32-byte multiple, so that an entry does not get split and // written out of order when IB Relaxed Ordering is enabled static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned"); static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples"); struct ncclIbGpuFlush { int enabled; int hostMem; struct ibv_mr* hostMr; struct ibv_sge sge; struct ibv_qp* qp; }; struct ncclIbRemFifo { struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; uint64_t fifoTail; uint64_t addr; uint32_t rkey; uint32_t flags; struct ibv_mr* mr; struct ibv_sge sge; }; struct ncclIbRecvComm { struct ncclIbVerbs verbs; struct ncclIbRemFifo remFifo; struct ncclSocket sock; int ready; struct ibv_qp* qps[NCCL_IB_MAX_QPS]; int nqps; int qpIndex; struct ncclIbGpuFlush gpuFlush; struct ncclIbGidInfo gidInfo; }; static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendComm fifo must be 
32-byte aligned"); NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1); ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) { verbs->dev = dev; pthread_mutex_lock(&ncclIbDevs[dev].lock); if (0 == ncclIbDevs[dev].pdRefs++) { ncclResult_t res; NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure); if (0) { failure: pthread_mutex_unlock(&ncclIbDevs[dev].lock); return res; } } verbs->pd = ncclIbDevs[dev].pd; pthread_mutex_unlock(&ncclIbDevs[dev].lock); // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); return ncclSuccess; } ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) { ncclResult_t res; NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq)); pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); if (0 == --ncclIbDevs[verbs->dev].pdRefs) { NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning); } res = ncclSuccess; returning: pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); return res; } ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) { struct ibv_qp_init_attr qpInitAttr; memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr)); qpInitAttr.send_cq = verbs->cq; qpInitAttr.recv_cq = verbs->cq; qpInitAttr.qp_type = IBV_QPT_RC; // We might send 2 messages per send (RDMA and RDMA_WITH_IMM) qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS; qpInitAttr.cap.max_recv_wr = MAX_REQUESTS; qpInitAttr.cap.max_send_sge = 1; qpInitAttr.cap.max_recv_sge = 1; qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0; NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr)); struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_INIT; qpAttr.pkey_index = ncclParamIbPkey(); qpAttr.port_num = ib_port; qpAttr.qp_access_flags = access_flags; NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)); return ncclSuccess; } ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; qpAttr.path_mtu = info->mtu; qpAttr.dest_qp_num = qpn; qpAttr.rq_psn = 0; qpAttr.max_dest_rd_atomic = 1; qpAttr.min_rnr_timer = 12; if (info->link_layer == IBV_LINK_LAYER_ETHERNET) { qpAttr.ah_attr.is_global = 1; qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn; qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid; qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = ncclParamIbGidIndex(); qpAttr.ah_attr.grh.hop_limit = 255; qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); } else { qpAttr.ah_attr.is_global = 0; qpAttr.ah_attr.dlid = info->lid; } qpAttr.ah_attr.sl = ncclParamIbSl(); qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTS; qpAttr.timeout = ncclParamIbTimeout(); qpAttr.retry_cnt = ncclParamIbRetryCnt(); qpAttr.rnr_retry = 7; qpAttr.sq_psn = 0; qpAttr.max_rd_atomic = 1; NCCLCHECK(wrap_ibv_modify_qp(qp, 
&qpAttr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)); return ncclSuccess; } ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { struct ncclIbListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large"); memset(handle, 0, sizeof(struct ncclIbHandle)); comm->dev = dev; handle->magic = NCCL_SOCKET_MAGIC; NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); NCCLCHECK(ncclSocketListen(&comm->sock)); NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr)); *listenComm = comm; return ncclSuccess; } ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; int ready; *sendComm = NULL; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStateConnecting) goto ib_connect; if (stage->state == ncclIbCommStateConnected) goto ib_send_ready; if (stage->state != ncclIbCommStateStart) { WARN("Error: trying to connect already connected sendComm"); return ncclInternalError; } NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1)); stage->comm = comm; stage->state = ncclIbCommStateConnect; NCCLCHECK(ncclSocketConnect(&comm->sock)); ib_connect_check: /* since ncclSocketConnect is async, we must check if connection is complete */ NCCLCHECK(ncclSocketReady(&comm->sock, &ready)); if (!ready) return ncclSuccess; // IB Setup struct ibv_context* ctx; ctx = ncclIbDevs[dev].context; NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs)); uint8_t ib_port; ib_port = ncclIbDevs[dev].port; comm->nqps = ncclParamIbQpsPerConn(); for (int q=0; qnqps; q++) { NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q)); } comm->ar = ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING // Send my QP Info to receiver through the socket. Hope this won't block. 
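  /* Out-of-band handshake: a ncclIbQpInfo blob (QP numbers, LID or GID, MTU,
   * and the rkey/address of the sender-side FIFO) is exchanged over the
   * bootstrap TCP socket before any RDMA traffic. Roughly:
   *
   *   connect side                          accept side
   *   send local qpInfo            ------>  recv it, create QPs, RTR/RTS
   *   recv remote qpInfo           <------  send its own qpInfo back
   *   RTR/RTS, send "ready" flag   ------>  recv "ready" flag
   *
   * Each step is driven by the ncclIbCommStage state machine, so the call can
   * return early with *sendComm == NULL and be resumed later without blocking. */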
struct ibv_port_attr portAttr; NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); struct ncclIbQpInfo qpInfo; qpInfo.ib_port = ib_port; for (int q=0; qnqps; q++) qpInfo.qpn[q] = comm->qps[q]->qp_num; qpInfo.mtu = portAttr.active_mtu; // Prepare my fifo NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); qpInfo.fifoRkey = comm->fifoMr->rkey; qpInfo.fifoAddr = (uint64_t)comm->fifo; // RoCE support qpInfo.lid = portAttr.lid; qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer; if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q=0; qnqps; q++) INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid); } else { // RoCE NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid)); qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix; qpInfo.iid = comm->gidInfo.localGid.global.interface_id; for (int q=0; qnqps; q++) INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); } stage->state = ncclIbCommStateSend; stage->offset = 0; NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo))); memcpy(stage->buffer, &qpInfo, sizeof(qpInfo)); ib_send: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset)); if (stage->offset != sizeof(qpInfo)) return ncclSuccess; stage->state = ncclIbCommStateConnecting; stage->offset = 0; // Clear the staging buffer for re-use memset(stage->buffer, 0, sizeof(qpInfo)); ib_connect: struct ncclIbQpInfo remQpInfo; NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, stage->buffer, sizeof(ncclIbQpInfo), &stage->offset)); if (stage->offset != sizeof(remQpInfo)) return ncclSuccess; memcpy(&remQpInfo, stage->buffer, sizeof(ncclIbQpInfo)); comm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn; comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid; for (int q=0; qnqps; q++) { struct ibv_qp* qp = comm->qps[q]; NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo)); NCCLCHECK(ncclIbRtsQp(qp)); } comm->ready = 1; stage->state = ncclIbCommStateConnected; stage->offset = 0; ib_send_ready: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, &comm->ready, sizeof(int), &stage->offset)); if (stage->offset != sizeof(int)) return ncclSuccess; free(stage->buffer); stage->state = ncclIbCommStateStart; *sendComm = comm; return ncclSuccess; } NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; int ready; *recvComm = NULL; if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; if (stage->state == ncclIbCommStateRecv) goto ib_recv; if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready; if (stage->state != ncclIbCommStateStart) { WARN("Listencomm in unknown state %d", stage->state); return ncclInternalError; } NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); stage->comm = rComm; stage->state = ncclIbCommStateAccept; NCCLCHECK(ncclSocketInit(&rComm->sock)); 
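  /* Like ncclIbConnect() above, accept is non-blocking and resumable: the
   * ncclIbCommStage stored on the listen comm records how far the handshake
   * has progressed (accept the socket, receive remote QP info, send local QP
   * info, wait for the final "ready" flag), and each early return with
   * *recvComm == NULL means the caller should invoke ncclIbAccept() again. */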
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); ib_accept_check: NCCLCHECK(ncclSocketReady(&rComm->sock, &ready)); if (!ready) return ncclSuccess; struct ncclIbQpInfo remQpInfo; stage->state = ncclIbCommStateRecv; stage->offset = 0; NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo))); ib_recv: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset)); if (stage->offset != sizeof(remQpInfo)) return ncclSuccess; /* copy back the received info */ memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo)); rComm->gidInfo.remoteGid.global.subnet_prefix = remQpInfo.spn; rComm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid; // IB setup struct ibv_context* ctx; uint8_t ib_port; ctx = ncclIbDevs[lComm->dev].context; ib_port = ncclIbDevs[lComm->dev].port; struct ibv_port_attr portAttr; NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &rComm->gidInfo.localGid)); // QP Creation NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs)); rComm->nqps = ncclParamIbQpsPerConn(); for (int q=0; qnqps; q++) { NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q)); } // Adjust the MTU remQpInfo.mtu = (enum ibv_mtu)std::min(remQpInfo.mtu, portAttr.active_mtu); // Setup QP for (int q=0; qnqps; q++) { struct ibv_qp* qp = rComm->qps[q]; NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo)); NCCLCHECK(ncclIbRtsQp(qp)); } // Retain remote fifo info and prepare my RDMA ops rComm->remFifo.rkey = remQpInfo.fifoRkey; rComm->remFifo.addr = remQpInfo.fifoAddr; NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey; if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; // Allocate Flush dummy buffer for GPU Direct RDMA rComm->gpuFlush.enabled = ((ncclIbGdrSupport(lComm->dev) == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) && (ncclParamIbGdrFlushDisable() == 0)) ? 
1 : 0; if (rComm->gpuFlush.enabled) { NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE)); rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem; rComm->gpuFlush.sge.length = 1; rComm->gpuFlush.sge.lkey = rComm->gpuFlush.hostMr->lkey; NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp)); struct ncclIbQpInfo localQpInfo; localQpInfo.lid=portAttr.lid; localQpInfo.link_layer=portAttr.link_layer; localQpInfo.ib_port=ib_port; localQpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix; localQpInfo.iid=rComm->gidInfo.localGid.global.interface_id; localQpInfo.mtu=portAttr.active_mtu; NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, rComm->gpuFlush.qp->qp_num, &localQpInfo)); NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp)); } // Fill Handle struct ncclIbQpInfo qpInfo; qpInfo.lid=portAttr.lid; qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer; qpInfo.ib_port=ib_port; for (int q=0; qnqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num; qpInfo.spn=rComm->gidInfo.localGid.global.subnet_prefix; qpInfo.iid=rComm->gidInfo.localGid.global.interface_id; qpInfo.mtu=remQpInfo.mtu; stage->state = ncclIbCommStateSend; stage->offset = 0; if (stage->buffer) free(stage->buffer); NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo))); memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo)); ib_send: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset)); if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess; stage->offset = 0; stage->state = ncclIbCommStatePendingReady; ib_recv_ready: NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset)); if (stage->offset != sizeof(int)) return ncclSuccess; free(stage->buffer); *recvComm = rComm; /* reset lComm stage */ stage->state = ncclIbCommStateStart; stage->offset = 0; stage->comm = NULL; stage->buffer = NULL; return ncclSuccess; } ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) { for (int i=0; ireqs+i; if (r->type == NCCL_NET_IB_REQ_UNUSED) { r->verbs = verbs; r->events = 1; r->sock = NULL; r->gidInfo = NULL; *req = r; return ncclSuccess; } } WARN("NET/IB : unable to allocate requests"); *req = NULL; return ncclInternalError; } ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) { r->type = NCCL_NET_IB_REQ_UNUSED; return ncclSuccess; } ncclResult_t ncclIbTest(void* request, int* done, int* size); /* DMA-BUF support */ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset"); assert(size > 0); static __thread uintptr_t pageSize = 0; if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE); struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; uintptr_t addr = (uintptr_t)data & -pageSize; size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; ncclResult_t res; pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); for (int slot=0; /*true*/; slot++) { if (slot == cache->population) { // didn't find in cache if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning); } // Deregister / register struct ibv_mr* mr; unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ; if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING; if (fd != -1) { /* DMA-BUF support */ NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning); } else { if (ncclIbRelaxedOrderingEnabled) { // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); } else { NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); } } TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x fd %d", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd); cache->population += 1; cache->slots[slot].addr = addr; cache->slots[slot].pages = pages; cache->slots[slot].refs = 1; cache->slots[slot].mr = mr; *mhandle = (void*)mr; res = ncclSuccess; goto returning; } else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) { cache->slots[slot].refs += 1; *mhandle = (void*)cache->slots[slot].mr; res = ncclSuccess; goto returning; } } returning: pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); return res; } ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle); } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; ncclResult_t res; pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); for (int i=0; i < cache->population; i++) { if (mhandle == cache->slots[i].mr) { if (0 == --cache->slots[i].refs) { memmove(&cache->slots[i], &cache->slots[--cache->population], sizeof(struct ncclIbMr)); if (cache->population == 0) { free(cache->slots); cache->slots = NULL; cache->capacity = 0; } NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, returning); } res = ncclSuccess; goto returning; } } WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population); res = ncclInternalError; returning: pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); return res; } NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1); ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; uint64_t wr_id = 0ULL; for (int r=0; rwrs+r; memset(wr, 0, sizeof(struct ibv_send_wr)); struct ibv_sge* sge = comm->sges+r; sge->addr=(uintptr_t)reqs[r]->send.data; sge->lkey=reqs[r]->send.lkey; wr->opcode = IBV_WR_RDMA_WRITE; wr->send_flags = 0; wr->wr.rdma.remote_addr = slots[r].addr; wr->wr.rdma.rkey = slots[r].rkey; wr->next = wr+1; wr_id += (reqs[r] - comm->verbs.reqs) << (r*8); } // Write size as immediate data. In the case of multi-send, only write // 0 or 1 as size to indicate whether there was data sent or received. uint32_t immData = 0; if (nreqs == 1) { immData = reqs[0]->send.size; } else { if (nreqs > 32) { WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs); return ncclInternalError; } for (int r=0; rsend.size ? 
1 : 0) << r; } } struct ibv_send_wr* lastWr = comm->wrs+nreqs-1; if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) { // When using ADAPTIVE_ROUTING, send the bulk of the data first as an // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote // completion. lastWr++; memset(lastWr, 0, sizeof(struct ibv_send_wr)); } lastWr->wr_id = wr_id; lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; lastWr->imm_data = immData; lastWr->next = NULL; lastWr->send_flags = IBV_SEND_SIGNALED; // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work const int align = 128; const int nqps = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; for (int q=0; qsend.size, nqps), align) * align; int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize); if (length <= 0) { comm->wrs[r].sg_list = NULL; comm->wrs[r].num_sge = 0; } else { comm->sges[r].length = length; comm->wrs[r].sg_list = comm->sges+r; comm->wrs[r].num_sge = 1; } } struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->qps[comm->qpIndex], comm->wrs, &bad_wr)); comm->qpIndex = (comm->qpIndex+1)%comm->nqps; for (int r=0; rsend.size, nqps), align) * align; reqs[r]->send.offset += chunkSize; comm->sges[r].addr += chunkSize; comm->wrs[r].wr.rdma.remote_addr += chunkSize; } } return ncclSuccess; } ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->ready == 0"); return ncclInternalError; } if (comm->ready == 0) { *request = NULL; return ncclSuccess; } struct ibv_mr* mr = (struct ibv_mr*)mhandle; // Wait for the receiver to have posted the corresponding receive int nreqs = 0; volatile struct ncclIbSendFifo* slots; int slot = (comm->fifoHead)%MAX_REQUESTS; struct ncclIbRequest** reqs = comm->fifoReqs[slot]; slots = comm->fifo[slot]; int idx = comm->fifoHead+1; if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; } nreqs = slots[0].nreqs; // Wait until all data has arrived for (int r=1; r slots[r].size) { char line[SOCKET_NAME_MAXLEN + 1]; union ncclSocketAddress addr; ncclSocketGetAddr(&comm->sock, &addr); WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d", r, nreqs, tag, ncclSocketToString(&addr, line), size, slots[r].size); return ncclInvalidUsage; } // plus any potential programming errors else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) { char line[SOCKET_NAME_MAXLEN + 1]; union ncclSocketAddress addr; ncclSocketGetAddr(&comm->sock, &addr); WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x", r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkey); return ncclInternalError; } struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); req->type = NCCL_NET_IB_REQ_SEND; req->sock = &comm->sock; req->verbs = &comm->verbs; req->nreqs = nreqs; req->send.size = size; req->send.data = data; req->send.lkey = mr->lkey; req->send.offset = 0; req->events = ncclParamIbSplitDataOnQps() ? comm->nqps : 1; if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo; *request = reqs[r] = req; // If this is a multi-recv, send only when all requests have matched. 
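  /* The receiver describes up to NCCL_NET_IB_MAX_RECVS buffers in one FIFO
   * slot (slots[0].nreqs of them). Each ncclIbIsend() call claims the entry
   * whose tag matches; only once every entry of the slot has a matching send
   * does ncclIbMultiSend() post the RDMA writes, finishing with an
   * RDMA_WRITE_WITH_IMM whose immediate data carries the size (or, for
   * multi-receives, one bit per request indicating whether it carried data). */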
for (int r=0; rnreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*)); comm->fifoHead++; TIME_STOP(0); return ncclSuccess; } *request = NULL; return ncclSuccess; } ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); int slot = comm->remFifo.fifoTail%MAX_REQUESTS; struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot]; for (int i=0; irkey; localElem[i].nreqs = n; localElem[i].size = sizes[i]; // Sanity/Debugging localElem[i].tag = tags[i]; localElem[i].idx = comm->remFifo.fifoTail+1; } wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo); wr.wr.rdma.rkey = comm->remFifo.rkey; comm->remFifo.sge.addr = (uint64_t)localElem; comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo); wr.sg_list = &comm->remFifo.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_WRITE; wr.send_flags = comm->remFifo.flags; // IBV_SEND_INLINE // We need to occasionally post a request with the IBV_SEND_SIGNALED flag, otherwise // the send queue will never empty. // // From https://www.rdmamojo.com/2014/06/30/working-unsignaled-completions/ // "How to use Unsignaled Completion?" / "Gotchas and Pitfalls" // All posted Send Requested, Signaled and Unsignaled, are considered outstanding until // a Work Completion that they, or Send Requests that were posted after them, was polled // from the Completion Queue associated with the Send Queue. This means if one works with // a Queue Pair that was configured to work with Unsignaled Completions, he must make // sure that occasionally (before the Send Queue is full with outstanding Send Requests) // a Send Request that generate Work Completion will be posted. // // Not following this rule may lead to a case that the Send Queue is full with Send // Requests that won't generate Work Completion: // // - The Send Queue is full, so no new Send Requests can be posted to it // - The Send Queue can't be emptied, since no Work Completion can be generated anymore // (the reason is that no Work Completion, that can generate Work Completion that // polling it will empty the Send Queue, can be posted) // - The status of all posted Send Request is considered unknown // if (slot == 0) { wr.send_flags |= IBV_SEND_SIGNALED; wr.wr_id = req - comm->verbs.reqs; req->events++; } struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr)); comm->remFifo.fifoTail++; return ncclSuccess; } ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->ready == 0"); return ncclInternalError; } if (comm->ready == 0) { *request = NULL; return ncclSuccess; } if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->sock; req->nreqs = n; if (comm->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) req->gidInfo = &comm->gidInfo; for (int i=0; irecv.sizes[i] = 0; struct ibv_recv_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = req - comm->verbs.reqs; wr.sg_list = NULL; wr.num_sge = 0; TIME_START(1); const int nqps = ncclParamIbSplitDataOnQps() ? 
comm->nqps : 1; for (int q=0; qqps[comm->qpIndex]; struct ibv_recv_wr* bad_wr; NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr)); comm->qpIndex = (comm->qpIndex+1)%comm->nqps; } TIME_STOP(1); req->events = nqps; *request = req; // Post to FIFO to notify sender TIME_START(2); NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req)); TIME_STOP(2); return ncclSuccess; } ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; int last = -1; for (int i=0; igpuFlush.enabled == 0 || last == -1) return ncclSuccess; // Only flush once using the last non-zero receive struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); req->type = NCCL_NET_IB_REQ_FLUSH; req->sock = &comm->sock; struct ibv_mr* mr = (struct ibv_mr*)mhandles[last]; struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = req - comm->verbs.reqs; wr.wr.rdma.remote_addr = (uint64_t)data[last]; wr.wr.rdma.rkey = mr->rkey; wr.sg_list = &comm->gpuFlush.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; wr.send_flags = IBV_SEND_SIGNALED; TIME_START(4); struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr)); TIME_STOP(4); *request = req; return ncclSuccess; } ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; while (1) { if (r->events == 0) { *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { for (int i=0; inreqs; i++) sizes[i] = r->recv.sizes[i]; } NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } int wrDone = 0; struct ibv_wc wcs[4]; TIME_START(3); NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone)); if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); } if (wrDone == 0) return ncclSuccess; for (int w=0; wstatus != IBV_WC_SUCCESS) { char line[SOCKET_NAME_MAXLEN+1]; union ncclSocketAddress addr; ncclSocketGetAddr(r->sock, &addr); char localGidString[INET6_ADDRSTRLEN] = ""; char remoteGidString[INET6_ADDRSTRLEN] = ""; const char* localGidStr = NULL, *remoteGidStr = NULL; if (r->gidInfo) { localGidStr = inet_ntop(AF_INET6, &r->gidInfo->localGid, localGidString, sizeof(localGidString)); remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString)); } WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d (%s)%s%s%s%s", ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], localGidStr ? " localGid ":"", localGidString, remoteGidStr ? " remoteGid ":"", remoteGidString); return ncclRemoteError; } struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff); if (req->type == NCCL_NET_IB_REQ_SEND) { for (int i=0; inreqs; i++) { struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff); if ((sendReq->events <= 0)) return ncclInternalError; sendReq->events--; } } else { if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError; if (req->nreqs > 1) { // In the case of a multi recv, we only set sizes to 0 or 1. 
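          /* This mirrors the immediate-data encoding in ncclIbMultiSend(): for
           * a single receive the 32-bit immediate data is the byte count, while
           * for a grouped receive bit i only indicates whether request i
           * carried any data. */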
for (int i=0; inreqs; i++) { req->recv.sizes[i] = (wc->imm_data >> i) & 0x1; } } else { req->recv.sizes[0] += wc->imm_data; } } req->events--; } } } } ncclResult_t ncclIbCloseSend(void* sendComm) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm) { NCCLCHECK(ncclSocketClose(&comm->sock)); for (int q=0; qnqps; q++) if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr)); NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } TIME_PRINT("IB"); return ncclSuccess; } ncclResult_t ncclIbCloseRecv(void* recvComm) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm) { NCCLCHECK(ncclSocketClose(&comm->sock)); for (int q=0; qnqps; q++) if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); if (comm->gpuFlush.enabled) { if (comm->gpuFlush.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->gpuFlush.qp)); if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr)); } if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr)); NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } return ncclSuccess; } ncclResult_t ncclIbCloseListen(void* listenComm) { struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm; if (comm) { NCCLCHECK(ncclSocketClose(&comm->sock)); free(comm); } return ncclSuccess; } ncclNet_t ncclNetIb = { "IB", ncclIbInit, ncclIbDevices, ncclIbGetProperties, ncclIbListen, ncclIbConnect, ncclIbAccept, ncclIbRegMr, ncclIbRegMrDmaBuf, ncclIbDeregMr, ncclIbIsend, ncclIbIrecv, ncclIbIflush, ncclIbTest, ncclIbCloseSend, ncclIbCloseRecv, ncclIbCloseListen }; nccl-2.18.3-1/src/transport/net_socket.cc000066400000000000000000000504721444201535000201600ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "core.h" #include "socket.h" #include "net.h" #include "param.h" #include #include #include #include #include /* Init functions */ static int ncclNetIfs = -1; struct ncclNetSocketDev { union ncclSocketAddress addr; char devName[MAX_IF_NAME_SIZE]; char* pciPath; }; static struct ncclNetSocketDev ncclNetSocketDevs[MAX_IFS]; pthread_mutex_t ncclNetSocketLock = PTHREAD_MUTEX_INITIALIZER; static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { char devicePath[PATH_MAX]; snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName); // May return NULL if the file doesn't exist. *pciPath = realpath(devicePath, NULL); return ncclSuccess; } ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { if (ncclNetIfs == -1) { pthread_mutex_lock(&ncclNetSocketLock); if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; union ncclSocketAddress addrs[MAX_IFS]; ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); return ncclInternalError; } else { #define MAX_LINE_LEN (2047) char line[MAX_LINE_LEN+1]; char addrline[SOCKET_NAME_MAXLEN+1]; line[0] = '\0'; addrline[SOCKET_NAME_MAXLEN] = '\0'; for (int i=0; i 0) { *speed = strtol(speedStr, NULL, 0); } close(fd); } if (*speed <= 0) { INFO(NCCL_NET, "Could not get speed from %s. 
Defaulting to 10 Gbps.", speedPath); *speed = 10000; } return ncclSuccess; } ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->name = ncclNetSocketDevs[dev].devName; props->pciPath = ncclNetSocketDevs[dev].pciPath; props->guid = dev; props->ptrSupport = NCCL_PTR_HOST; NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed)); props->latency = 0; // Not set props->port = 0; props->maxComms = 65536; props->maxRecvs = 1; return ncclSuccess; } /* Communication functions */ #define MAX_SOCKETS 64 #define MAX_THREADS 16 #define MAX_REQUESTS NCCL_NET_MAX_REQUESTS #define MIN_CHUNKSIZE (64*1024) NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); enum ncclNetSocketCommState { ncclNetSocketCommStateStart = 0, ncclNetSocketCommStateConnect = 1, ncclNetSocketCommStateAccept = 3, ncclNetSocketCommStateSend = 4, ncclNetSocketCommStateRecv = 5, }; struct ncclNetSocketCommStage { enum ncclNetSocketCommState state; uint8_t iteration; struct ncclSocket* sock; struct ncclNetSocketComm* comm; }; struct ncclNetSocketHandle { union ncclSocketAddress connectAddr; uint64_t magic; // random number to help debugging int nSocks; int nThreads; struct ncclNetSocketCommStage stage; }; struct ncclNetSocketTask { int op; void* data; int size; struct ncclSocket* sock; int offset; int used; ncclResult_t result; }; struct ncclNetSocketRequest { int op; void* data; int size; struct ncclSocket* ctrlSock; int offset; int used; struct ncclNetSocketComm* comm; struct ncclNetSocketTask* tasks[MAX_SOCKETS]; int nSubs; }; struct ncclNetSocketTaskQueue { int next; int len; struct ncclNetSocketTask* tasks; }; struct ncclNetSocketThreadResources { struct ncclNetSocketTaskQueue threadTaskQueue; int stop; struct ncclNetSocketComm* comm; pthread_mutex_t threadLock; pthread_cond_t threadCond; }; struct ncclNetSocketListenComm { struct ncclSocket sock; struct ncclNetSocketCommStage stage; int nSocks; int nThreads; int dev; }; struct ncclNetSocketComm { struct ncclSocket ctrlSock; struct ncclSocket socks[MAX_SOCKETS]; int dev; int cudaDev; int nSocks; int nThreads; int nextSock; struct ncclNetSocketRequest requests[MAX_REQUESTS]; pthread_t helperThread[MAX_THREADS]; struct ncclNetSocketThreadResources threadResources[MAX_THREADS]; }; void* persistentSocketThread(void *args_) { struct ncclNetSocketThreadResources* resource = (struct ncclNetSocketThreadResources*)args_; struct ncclNetSocketComm* comm = resource->comm; struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue; int nSocksPerThread = comm->nSocks / comm->nThreads; while (1) { int idle = 1; int mark = myQueue->next; // mark newest task seen for (int i=0; ilen; i+=nSocksPerThread) { int repeat; do { repeat = 0; for (int j=0; jtasks+i+j; if (r != NULL && r->used == 1 && r->offset < r->size) { r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { WARN("NET/Socket : socket progress error"); return NULL; } idle = 0; if (r->offset < r->size) repeat = 1; } } } while (repeat); } if (idle) { pthread_mutex_lock(&resource->threadLock); while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait pthread_cond_wait(&resource->threadCond, &resource->threadLock); } pthread_mutex_unlock(&resource->threadLock); } if (resource->stop) return NULL; } } ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { int nSocksPerThread = ncclParamSocketNsocksPerThread(); int nThreads = ncclParamSocketNthreads(); if 
(nThreads > MAX_THREADS) { WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS); nThreads = MAX_THREADS; } if (nThreads == -2 || nSocksPerThread == -2) { // Auto-detection int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads char vendorPath[PATH_MAX]; snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName); char* rPath = realpath(vendorPath, NULL); int fd = open(rPath, O_RDONLY); free(rPath); if (fd == -1) { // Could not find device vendor. This is handled silently so // we don't want to print an INFO error. TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno)); goto end; } char vendor[7]; strncpy(vendor, "0x0000", 7); int len; SYSCHECKVAL(read(fd, vendor, 6), "read", len); SYSCHECK(close(fd), "close"); if (strcmp(vendor, "0x1d0f") == 0) { // AWS autoNt = 2; autoNs = 8; } else if (strcmp(vendor, "0x1ae0") == 0) { // GCP autoNt = 4; autoNs = 1; } end: if (nThreads == -2) nThreads = autoNt; if (nSocksPerThread == -2) nSocksPerThread = autoNs; } int nSocks = nSocksPerThread * nThreads; if (nSocks > MAX_SOCKETS) { nSocksPerThread = MAX_SOCKETS/nThreads; WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread); nSocks = nSocksPerThread * nThreads; } *ns = nSocks; *nt = nThreads; if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); return ncclSuccess; } ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle; memset(handle, 0, sizeof(struct ncclNetSocketHandle)); static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large"); struct ncclNetSocketListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); handle->magic = NCCL_SOCKET_MAGIC; NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1)); NCCLCHECK(ncclSocketListen(&comm->sock)); NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr)); NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); handle->nSocks = comm->nSocks; handle->nThreads = comm->nThreads; comm->dev = dev; *listenComm = comm; return ncclSuccess; } ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } int ready; struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle; struct ncclNetSocketCommStage* stage = &handle->stage; struct ncclNetSocketComm* comm = stage->comm; uint8_t i = stage->iteration; struct ncclSocket* sock = stage->sock; *sendComm = NULL; if (stage->state == ncclNetSocketCommStateConnect) goto socket_connect_check; if (stage->state == ncclNetSocketCommStateSend) goto socket_send; NCCLCHECK(ncclCalloc(&comm, 1)); stage->comm = comm; comm->nSocks = handle->nSocks; comm->nThreads = handle->nThreads; comm->dev = dev; CUDACHECK(cudaGetDevice(&comm->cudaDev)); for (; inSocks+1; i++) { sock = (i == comm->nSocks) ? 
&comm->ctrlSock : comm->socks+i; NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetSocket, NULL, 1)); stage->sock = sock; stage->state = ncclNetSocketCommStateConnect; stage->iteration = i; NCCLCHECK(ncclSocketConnect(sock)); socket_connect_check: NCCLCHECK(ncclSocketReady(sock, &ready)); if (! ready) return ncclSuccess; stage->state = ncclNetSocketCommStateSend; socket_send: int done = 0; NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done)); if (done == 0) return ncclSuccess; } *sendComm = comm; return ncclSuccess; } ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm) { struct ncclNetSocketListenComm* lComm = (struct ncclNetSocketListenComm*)listenComm; struct ncclNetSocketCommStage* stage = &lComm->stage; struct ncclNetSocketComm* rComm = stage->comm; uint8_t i = stage->iteration; struct ncclSocket* sock = stage->sock; int ready; *recvComm = NULL; if (stage->state == ncclNetSocketCommStateAccept) goto socket_accept_check; if (stage->state == ncclNetSocketCommStateRecv) goto socket_recv; NCCLCHECK(ncclCalloc(&rComm, 1)); stage->comm = rComm; rComm->nSocks = lComm->nSocks; rComm->nThreads = lComm->nThreads; rComm->dev = lComm->dev; CUDACHECK(cudaGetDevice(&rComm->cudaDev)); for (; inSocks+1; i++) { uint8_t sendSockIdx; NCCLCHECK(ncclCalloc(&sock, 1)); NCCLCHECK(ncclSocketInit(sock)); stage->sock = sock; stage->state = ncclNetSocketCommStateAccept; stage->iteration = i; NCCLCHECK(ncclSocketAccept(sock, &lComm->sock)); socket_accept_check: NCCLCHECK(ncclSocketReady(sock, &ready)); if (!ready) return ncclSuccess; stage->state = ncclNetSocketCommStateRecv; socket_recv: int done = 0; NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done)); if (done == 0) return ncclSuccess; if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket)); else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); free(sock); } *recvComm = rComm; /* reset lComm state */ stage->state = ncclNetSocketCommStateStart; stage->iteration = 0; stage->sock = NULL; stage->comm = NULL; return ncclSuccess; } ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketRequest** req) { for (int i=0; irequests+i; if (r->used == 0) { r->op = op; r->data = data; r->size = size; r->ctrlSock = &comm->ctrlSock; r->used = 1; r->comm = comm; r->nSubs = 0; *req = r; return ncclSuccess; } } WARN("NET/Socket : unable to allocate requests"); return ncclInternalError; } ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) { int tid = comm->nextSock % comm->nThreads; struct ncclNetSocketThreadResources* res = comm->threadResources+tid; struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue; // create helper threads and prepare per-thread task queue if (queue->tasks == NULL) { // each request can be divided up to nSocks tasks, and // these tasks are distributed to nThreads threads, // we need to make sure each thread queue has enough slots for MAX_REQUESTS queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads); NCCLCHECK(ncclCalloc(&queue->tasks, queue->len)); queue->next = 0; res->comm = comm; pthread_mutex_init(&res->threadLock, NULL); pthread_cond_init(&res->threadCond, NULL); pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND 
? 'S' : 'R', comm->dev, tid, comm->cudaDev); } struct ncclNetSocketTask* r = queue->tasks+queue->next; if (r->used == 0) { r->op = op; r->data = data; r->size = size; r->sock = comm->socks + comm->nextSock; r->offset = 0; r->result = ncclSuccess; comm->nextSock = (comm->nextSock + 1) % comm->nSocks; r->used = 1; *req = r; pthread_mutex_lock(&res->threadLock); queue->next = (queue->next+1)%queue->len; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); return ncclSuccess; } WARN("NET/Socket : unable to allocate subtasks"); return ncclInternalError; } ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { *done = 0; struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request; if (r == NULL) { WARN("NET/Socket : test called with NULL request"); return ncclInternalError; } if (r->used == 1) { /* try to send/recv size */ int data = r->size; int offset = 0; NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset)); if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ // Not sure we could ever receive less than 4 bytes, but just in case ... if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset)); // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; union ncclSocketAddress addr; ncclSocketGetAddr(r->ctrlSock, &addr); WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", ncclSocketToString(&addr, line), data, r->size); return ncclInvalidUsage; } r->size = data; r->offset = 0; r->used = 2; // done exchanging size // divide into subtasks int chunkOffset = 0, i = 0; if (r->comm->nSocks > 0) { // each request can be divided up to nSocks tasks int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); while (chunkOffset < r->size) { int chunkSize = std::min(taskSize, r->size-chunkOffset); NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); chunkOffset += chunkSize; } } r->nSubs = i; } if (r->used == 2) { // already exchanged size if (r->nSubs > 0) { int nCompleted = 0; for (int i=0; inSubs; i++) { struct ncclNetSocketTask* sub = r->tasks[i]; if (sub->result != ncclSuccess) return sub->result; if (sub->offset == sub->size) nCompleted++; } if (nCompleted == r->nSubs) { if (size) *size = r->size; *done = 1; r->used = 0; for (int i=0; inSubs; i++) { struct ncclNetSocketTask* sub = r->tasks[i]; sub->used = 0; } } } else { // progress request using main thread if (r->offset < r->size) { NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset)); } if (r->offset == r->size) { if (size) *size = r->size; *done = 1; r->used = 0; } } } return ncclSuccess; } ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { return (type != NCCL_PTR_HOST) ? 
ncclInternalError : ncclSuccess; } ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request)); return ncclSuccess; } ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm; if (n != 1) return ncclInternalError; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request)); return ncclSuccess; } ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { // We don't support CUDA pointers, so we don't need a flush operation return ncclInternalError; } ncclResult_t ncclNetSocketCloseListen(void* opaqueComm) { struct ncclNetSocketListenComm* comm = (struct ncclNetSocketListenComm*)opaqueComm; if (comm) { int ready; NCCLCHECK(ncclSocketReady(&comm->sock, &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->sock)); free(comm); } return ncclSuccess; } ncclResult_t ncclNetSocketClose(void* opaqueComm) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)opaqueComm; if (comm) { for (int i=0; inThreads; i++) { struct ncclNetSocketThreadResources* res = comm->threadResources+i; if (comm->helperThread[i]) { pthread_mutex_lock(&res->threadLock); res->stop = 1; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); pthread_join(comm->helperThread[i], NULL); } free(res->threadTaskQueue.tasks); } int ready; NCCLCHECK(ncclSocketReady(&comm->ctrlSock, &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->ctrlSock)); for (int i=0; inSocks; i++) { NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i])); } free(comm); } return ncclSuccess; } ncclNet_t ncclNetSocket = { "Socket", ncclNetSocketInit, ncclNetSocketDevices, ncclNetSocketGetProperties, ncclNetSocketListen, ncclNetSocketConnect, ncclNetSocketAccept, ncclNetSocketRegMr, NULL, // No DMA-BUF support ncclNetSocketDeregMr, ncclNetSocketIsend, ncclNetSocketIrecv, ncclNetSocketIflush, ncclNetSocketTest, ncclNetSocketClose, ncclNetSocketClose, ncclNetSocketCloseListen }; nccl-2.18.3-1/src/transport/nvls.cc000066400000000000000000000413351444201535000170020ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ // Implementation of the NVLink SHARP (NVLS) transport #include "comm.h" #include "graph.h" #include "utils.h" #include "proxy.h" #if CUDART_VERSION >= 12010 // Currently we only support POSIX_FILE_DESCRIPTOR handle exchange #define USE_POSIX_FD 1 #if USE_POSIX_FD #define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR #else #define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE #endif ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // This transport cannot be used for p2p *ret = 0; return ncclSuccess; } ncclResult_t nvlsSendFree(struct ncclConnector* send) { return ncclSuccess; } ncclResult_t nvlsRecvFree(struct ncclConnector* recv) { return ncclSuccess; } struct ncclTransport nvlsTransport = { "NVLS", nvlsCanConnect, { NULL, NULL, nvlsSendFree, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL } }; ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) { CUmulticastObjectProp* prop = &resources->properties; memset(prop, 0, sizeof(*prop)); prop->size = size; prop->numDevices = nranks; prop->handleTypes = NVLS_CU_MEM_HANDLE_TYPE; prop->flags = 0; // Could be changed to CU_MULTICAST_GRANULARITY_MINIMUM when 3418538 resolved CUCHECK(cuMulticastGetGranularity(&resources->granularity, prop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); ALIGN_SIZE(size, resources->granularity); prop->size = resources->size = size; memset(&resources->accessDesc, 0, sizeof(resources->accessDesc)); resources->accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; resources->accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; resources->accessDesc.location.id = dev; resources->dev = dev; return ncclSuccess; } ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) { size_t size = resources->size; // Create a Multicast group CUmulticastObjectProp* prop = &resources->properties; INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank); CUCHECK(cuMulticastCreate(&resources->mcHandle, prop)); if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) { // Get a handle to pass to other ranks CUCHECK(cuMemExportToShareableHandle(shareableHandle, resources->mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0)); } else { memcpy(shareableHandle, &resources->mcHandle, sizeof(resources->mcHandle)); } INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", resources->mcHandle, nranks, size, rank); return ncclSuccess; } ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { INFO(NCCL_NVLS, "NVLS group %llx adding dev %d", resources->mcHandle, resources->dev); CUCHECK(cuMulticastAddDevice(resources->mcHandle, resources->dev)); return ncclSuccess; } ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, char* shareableHandle) { CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE; INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank); // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // cuMem UDS support int fd = *(int *)shareableHandle; TRACE(NCCL_NVLS, "NVLS 
rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd); struct ncclProxyConnector proxyConn; int tpProxyRank = comm->topParentRanks[rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn)); TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank); NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, (int *)shareableHandle)); fd = *(int *)shareableHandle; TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank); CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type)); } else { if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) { CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)shareableHandle, type)); } else { memcpy(&resources->mcHandle, shareableHandle, sizeof(resources->mcHandle)); } } return ncclSuccess; } ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE; // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // cuMem UDS support int fd = *(int *)resources->shareableHandle; (void) close(fd); } return ncclSuccess; } ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { size_t size = resources->size; size_t granularity; CUdeviceptr ptr = 0; CUmemAllocationProp prop; memset(&prop, 0, sizeof(prop)); prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = resources->dev; prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE; CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); // Map a VA for UC memory CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0)); // Alloc local physical mem for this NVLS group CUCHECK(cuMemCreate(&resources->ucHandle, size, &prop, 0)); CUCHECK(cuMemMap(ptr, size, 0, resources->ucHandle, 0)); CUCHECK(cuMemSetAccess(ptr, size, &resources->accessDesc, 1)); CUDACHECK(cudaMemset((void*)ptr, 0, size)); resources->ucBuff = (char*)ptr; INFO(NCCL_NVLS, "NVLS Mapped UC at %p size %zi", resources->ucBuff, size); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group INFO(NCCL_NVLS, "NVLS Bind mem %p UC handle 0x%llx MC handle 0x%llx size %zi", (void*)ptr, resources->ucHandle, resources->mcHandle, size); CUCHECK(cuMulticastBindMem(resources->mcHandle, 0/*mcOffset*/, resources->ucHandle, 0/*memOffset*/, size, 0/*flags*/)); return ncclSuccess; } ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { int dev = resources->dev; size_t size = resources->size; INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zi dev %d", resources->mcHandle, size, dev); // Unbind physical memory from group for the given device CUCHECK(cuMulticastUnbind(resources->mcHandle, dev, 0/*mcOffset*/, size)); // Release the MC group resources NCCLCHECK(nvlsGroupDisconnect(comm, resources)); return ncclSuccess; } ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { size_t size = resources->size; CUdeviceptr ptr = 0; // Create a VA for the NVLS CUCHECK(cuMemAddressReserve(&ptr, size, resources->granularity, 0U, 0)); // Map the VA locally CUCHECK(cuMemMap(ptr, size, 0, resources->mcHandle, 0)); resources->mcBuff = (char*)ptr; INFO(NCCL_NVLS, "NVLS Mapped MC buffer at 
%p size %zi", resources->mcBuff, size); // Having completed the BindMem we can now call SetAccess // NB: It will block until all ranks have bound to the Group CUCHECK(cuMemSetAccess((CUdeviceptr)resources->mcBuff, size, &resources->accessDesc, 1)); return ncclSuccess; } ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { size_t size; CUdeviceptr ptr; INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", resources->ucHandle, resources->ucBuff, resources->mcHandle, resources->mcBuff); // Release the UC memory and mapping ptr = (CUdeviceptr)resources->ucBuff; size = resources->size; CUCHECK(cuMemUnmap(ptr, size)); CUCHECK(cuMemAddressFree(ptr, size)); CUCHECK(cuMemRelease(resources->ucHandle)); // Release the MC memory and mapping ptr = (CUdeviceptr)resources->mcBuff; size = resources->size; CUCHECK(cuMemUnmap(ptr, size)); CUCHECK(cuMemAddressFree(ptr, size)); CUCHECK(cuMemRelease(resources->mcHandle)); return ncclSuccess; } #include "bootstrap.h" #include "channel.h" #define NVLS_MEM_ALIGN_SIZE (1 << 21) NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { comm->nvlsSupport = 0; comm->nvlsChannels = 0; int gpuCount; NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount)); if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess; CUdevice dev; int driverVersion; if (CUPFN(cuDeviceGet) == NULL) return ncclSuccess; CUCHECK(cuCtxGetDevice(&dev)); CUDACHECK(cudaDriverGetVersion(&driverVersion)); if (ncclParamNvlsEnable() == 2) { // NVLS Multicast support requires CUDA12.1 UMD + KMD if (CUPFN(cuMulticastCreate) != NULL /*&& driverVersion >= 12010 */) { CUCHECK(cuDeviceGetAttribute(&comm->nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); } } else { comm->nvlsSupport = 1; } INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels())); return ncclSuccess; } ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; int nHeads = comm->channels[0].nvls.nHeads; int headRank = comm->channels[0].nvls.headRank; CUdevice dev; CUCHECK(cuCtxGetDevice(&dev)); ncclResult_t res = ncclSuccess; bool nvlsShare = true; if (parent && parent->nvlsSupport && parent->config.splitShare && parent->localRanks == comm->localRanks) nvlsShare = true; else nvlsShare = false; if (nvlsShare) { /* reuse NVLS resources */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); for (int c = 0; c < comm->nvlsChannels; c++) { NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup); } comm->nvlsResources = parent->nvlsResources; ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount); } else { int nChannels; struct ncclNvlsSharedRes* resources; NCCLCHECK(ncclCalloc(&resources, 1)); comm->nvlsResources = resources; resources->refCount = 1; if (parent && parent->config.splitShare) { /* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels * to make sure nvlsChannels match for each rank. 
*/ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); } nChannels = resources->nChannels = comm->nvlsChannels; for (int c = 0; c < nChannels; c++) { NCCLCHECK(initNvlsChannel(comm, c, parent, false)); } size_t buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE]; size_t memSize = NVLS_MEM_ALIGN_SIZE; size_t nvlsPerRankSize = nChannels * 2 * (buffSize + memSize); size_t nvlsTotalSize = nvlsPerRankSize * nHeads; INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zi memSize %zi nvlsPerRankSize %zi nvlsTotalSize %zi", comm, headRank, nHeads, buffSize, memSize, nvlsPerRankSize, nvlsTotalSize); char* shareableHandle = resources->shareableHandle; NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup); if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, comm->localRank, comm->localRanks, shareableHandle), res, cleanup); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup); NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, comm->localRankToRank[0], shareableHandle), res, cleanup); } NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup); NCCLCHECKGOTO(nvlsGroupBindMem(comm, resources), res, cleanup); // Local intra-node barrier to ensure everyone has bound their memory to the group NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, cleanup); NCCLCHECKGOTO(nvlsGroupMapMem(comm, resources), res, cleanup); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { struct ncclChannel* channel = comm->channels + c; char* mem = NULL; struct ncclChannelPeer* peer = channel->peers[nvlsPeer]; // Reduce UC -> MC mem = resources->ucBuff + (h * 2 * nChannels + c) * (buffSize + memSize); peer->send[1].transportComm = &nvlsTransport.send; peer->send[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; peer->send[1].conn.head = (uint64_t*)(mem + buffSize); peer->send[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); mem = resources->mcBuff + (h * 2 * nChannels + c) * (buffSize + memSize); peer->recv[0].transportComm = &nvlsTransport.recv; peer->recv[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; peer->recv[0].conn.head = (uint64_t*)(mem + buffSize); peer->recv[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); peer->recv[0].conn.flags |= NCCL_NVLS_MIN_POLL; // Broadcast MC -> UC mem = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); peer->recv[1].transportComm = &nvlsTransport.recv; peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = mem; peer->recv[1].conn.head = (uint64_t*)(mem + buffSize); peer->recv[1].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); mem = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * (buffSize + memSize); peer->send[0].transportComm = &nvlsTransport.send; peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = mem; peer->send[0].conn.head = (uint64_t*)(mem + buffSize); peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2); peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; struct ncclDevChannelPeer* addr; CUDACHECKGOTO(cudaMemcpyAsync(&addr, comm->channels[c].devPeers + nvlsPeer, sizeof(struct 
ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), res, cleanup); CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup); /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p", nvlsPeer, c, resources->mcBuff + (h*2*nChannels+c)*(buffSize+memSize), resources->mcBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize), resources->ucBuff + (h*2*nChannels+c)*(buffSize+memSize), resources->ucBuff + ((h*2+1)*nChannels+c)*(buffSize+memSize));*/ } } } return res; cleanup: comm->nvlsSupport = 0; return res; } ncclResult_t ncclNvlsFree(struct ncclComm* comm) { struct ncclNvlsSharedRes* resources = (struct ncclNvlsSharedRes*)comm->nvlsResources; if (resources == NULL) return ncclSuccess; if (ncclAtomicRefCountDecrement(&resources->refCount) == 0) { NCCLCHECK(nvlsGroupUnbind(comm, resources)); NCCLCHECK(nvlsGroupUnmapMem(comm, resources)); free(resources); comm->nvlsResources = NULL; } return ncclSuccess; } #else /* * Pre CUDA 12.1 stubs */ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { comm->nvlsChannels = 0; return ncclSuccess; } ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { return ncclSuccess; } ncclResult_t ncclNvlsFree(struct ncclComm* comm) { return ncclSuccess; } #endif /* CUDA_VERSION >= 12010 */ nccl-2.18.3-1/src/transport/p2p.cc000066400000000000000000000723741444201535000165300ustar00rootroot00000000000000/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "graph.h" #include "utils.h" #include "shm.h" #include "p2p.h" enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; struct ncclP2pBuff { void* directPtr; size_t size; ncclIpcDesc ipcDesc; }; struct p2pConnectInfo { int rank; int read; struct ncclP2pBuff p2pBuff; // Used by CE memcpy char shmName[7]; int shmSize; }; static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large"); struct p2pShm { struct ncclSendMem sendMem; struct ncclRecvMem recvMem; }; struct p2pShmProxyInfo { // Shared memory between proxy and receiving GPU struct p2pShm* shm; struct p2pShm* devShm; char shmName[7]; int shmSize; ncclShmHandle_t handle; // Intermediate step for sender struct ncclRecvMem* ceRecvMem; char* ceDevBuff; // Receiver buffer char* recvFifo; // Used by CE memcpy progress only uint64_t step; cudaStream_t stream; cudaEvent_t events[NCCL_STEPS]; }; static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large"); struct p2pResources { enum p2pType type; union { struct ncclSendMem* sendDevMem; struct ncclRecvMem* recvDevMem; }; void* sendMemIpc; void* recvMemIpc; // CE memcpy support struct p2pShmProxyInfo proxyInfo; struct p2pShm* shm; struct p2pShm* devShm; int shmSize; ncclShmHandle_t handle; }; // cuMem API support struct p2pCuMemProxyInfo { struct ncclP2pBuff p2pBuff; }; #include /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ static int busIdToCudaDev(int64_t busId) { int ndev; if (cudaGetDeviceCount(&ndev) != cudaSuccess) return -1; for (int i = 0; i < ndev; i++) { char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) return -1; int64_t devBusId; NCCLCHECK(busIdToInt64(devBusIdStr, &devBusId)); if (busId == devBusId) return i; } // BusId was not found in our locally visible CUDA devices return -1; } // CE memcpy support NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0); static int useMemcpy = 0; static void initCeOperation(); /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { initCeOperation(); // Rule out different nodes / isolated containers if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) { *ret = 0; return ncclSuccess; } // Check topology / p2p level. int intermediateRank; NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank)); if (*ret == 0) return ncclSuccess; if (intermediateRank != -1) { if (useMemcpy) *ret = 0; return ncclSuccess; } // Check if NET would work better int useNet = 0; NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); if (useNet) { *ret = 0; return ncclSuccess; } // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); if (cudaDev1 == -1 || cudaDev2 == -1) { #if CUDART_VERSION >= 10010 // CUDA 10.1 and later can use P2P with invisible devices. return ncclSuccess; #else // Peer's CUDA device is not visible in this process : we can't communicate with it. 
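  // Illustrative sketch, not part of nccl-2.18.3: the visibility test above relies on mapping a
  // PCI bus id back to a local ordinal (see busIdToCudaDev), and the code after the #endif then
  // asks the runtime whether the two visible ordinals can reach each other over P2P. A standalone
  // probe under the same assumptions could look like:
  //
  //   int canAccess = 0;
  //   if (cudaDeviceCanAccessPeer(&canAccess, cudaDev1, cudaDev2) != cudaSuccess) canAccess = 0;
  //   // Optionally also verify legacy CUDA IPC (needed when cuMem support is disabled):
  //   char* probe = NULL; cudaIpcMemHandle_t ipc;
  //   if (canAccess && cudaMalloc(&probe, CUDA_IPC_MIN) == cudaSuccess) {
  //     if (cudaIpcGetMemHandle(&ipc, probe) != cudaSuccess) canAccess = 0;  // e.g. WSL without IPC
  //     cudaFree(probe);
  //   }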
*ret = 0; return ncclSuccess; #endif } // Check that CUDA can do P2P int p2p; if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) { INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)", cudaDev1, info1->busId, cudaDev2, info2->busId); *ret = 0; return ncclSuccess; } // This will always fail when using NCCL_CUMEM_ENABLE=1 if (p2p != 0 && !ncclCuMemEnable()) { // Cached result of the legacyIPC detection static int legacyIPC = -1; if (legacyIPC >= 0) { *ret = legacyIPC; return ncclSuccess; } // Check that legacy IPC support is available (WSL WAR) char *dummy; cudaIpcMemHandle_t ipc; NCCLCHECK(ncclCudaMalloc(&dummy, CUDA_IPC_MIN)); if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) { INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported"); *ret = 0; } NCCLCHECK(ncclCudaFree(dummy)); legacyIPC = *ret; return ncclSuccess; } if (p2p == 0) { INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)", cudaDev1, info1->busId, cudaDev2, info2->busId); *ret = 0; return ncclSuccess; } return ncclSuccess; } #define TRACE_DUMP_IPC(DEVIPC) \ do { \ unsigned long *devIpc = (unsigned long *) (DEVIPC); \ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \ } while (0) // cuMem API support ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) { if (ncclCuMemEnable()) { #if CUDART_VERSION >= 11030 // cuMem API support CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; CUmemGenericAllocationHandle handle; NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0)); #else return ncclInternalError; #endif } else { // Allocate a CUDA buffer and generate an IPC handle for it NCCLCHECK(ncclCudaCalloc((char **)ptr, size)); cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr); if (res != cudaSuccess) { WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); ncclCudaFree(*ptr); CUDACHECK(res); } } INFO(NCCL_P2P|NCCL_ALLOC, "Allocated shareable buffer %p size %zi ipcDesc %p", *ptr, size, ipcDesc); return ncclSuccess; } ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) { if (ncclCuMemEnable()) { // cuMem API support CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { int fd = *(int *) &ipcDesc->cuDesc.data; if (fd <= 0) return ncclInternalError; (void) close(fd); } } return ncclSuccess; } ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) { if (ncclCuMemEnable()) { #if CUDART_VERSION >= 11030 // cuMem API support CUdeviceptr dptr = 0; CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE; CUmemGenericAllocationHandle handle; ncclCuDesc *cuDesc = &ipcDesc->cuDesc; // Import and map the remote memory descriptor to the local GPU if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // UDS fd support struct ncclProxyConnector proxyConn; int fd = *(int *)(&cuDesc->data); int newFd = -1; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn)); NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, &newFd)); INFO(NCCL_P2P, "UDS converted fd %d -> %d on peer %d", fd, newFd, tpPeer); CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)newFd, type)); close(newFd); } else { CUCHECK(cuMemImportFromShareableHandle(&handle, 
cuDesc, type)); } CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0)); CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0)); TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%lx dptr %p", size, (long)handle, (void*)dptr); // Allow access by the local GPU CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = comm->cudaDev; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1)); TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id); *devMemPtr = (void *)dptr; #else return ncclInternalError; #endif } else { // Legacy CUDA IPC CUDACHECK(cudaIpcOpenMemHandle(devMemPtr, ipcDesc->devIpc, cudaIpcMemLazyEnablePeerAccess)); } INFO(NCCL_P2P, "Imported shareable buffer device %d size %zi ptr %p", comm->cudaDev, size, *devMemPtr); return ncclSuccess; } // Setting this to non zero causes P2P to use Reads rather than Writes NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2); NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0); static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) { int p2p; // Queries the topology to see if the GPUs are Ampere and // connected via NVLink, if so we enable P2P Read by default NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank)); int readEnable = ncclParamP2pReadEnable(); if (readEnable != -2) *read = readEnable; return ncclSuccess; } static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) { if (peerInfo->cudaDev != myInfo->cudaDev) { // Same PID different GPUs, enable P2P access // Legacy CUDA IPC cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { WARN("failed to peer with device %d(=%lx): %d %s", peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err)); return ncclInternalError; } } *devMem = p2pBuff->directPtr; *ipcPtr = NULL; } else { if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) { // Same PID and GPU *devMem = p2pBuff->directPtr; *ipcPtr = NULL; } else { // Different PID or different GPU NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem)); *ipcPtr = *devMem; } } return ncclSuccess; } /* Send: Create and return connect structures for this peer to connect to me */ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct p2pResources* resources; int tpProxyRank; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); if (useMemcpy) useRead = 0; static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; info->read = useRead; // For CollNet, use write for scatter-reduce (conn 1), read for 
broadcast-gather (conn 0) if (graph && connIndex == 1) info->read = 0; const char* useReadStr = info->read ? "/read" : ""; int sendSize = sizeof(struct ncclSendMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure if (info->read) sendSize += comm->buffSizes[NCCL_PROTO_SIMPLE]; ALIGN_SIZE(sendSize, CUDA_IPC_MIN); if (intermediateRank == -1) { info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) { resources->type = P2P_DIRECT; send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr); } else { // cuMem API support if (ncclCuMemEnable()) { resources->type = P2P_CUMEM; INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/CUMEM%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : "");; } else { // Legacy CUDA IPC resources->type = P2P_IPC; INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/IPC%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : ""); } send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; } } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/indirect/%d[%d]%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, intermediateRank, comm->peerInfo[intermediateRank].nvmlDev, useReadStr); } tpProxyRank = comm->topParentRanks[info->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn)); if (useMemcpy) { NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo))); info->shmSize = resources->proxyInfo.shmSize; memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName)); } else { NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc)); } return ncclSuccess; } /* Create and return connect structures for this peer to connect to me */ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { struct p2pResources* resources; int tpProxyRank; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; info->read = useRead; // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) if (graph && connIndex == 1) info->read = 0; int recvSize = sizeof(struct ncclRecvMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure for (int p=0; pread && p == NCCL_PROTO_SIMPLE)) recvSize += 
comm->buffSizes[p]; ALIGN_SIZE(recvSize, CUDA_IPC_MIN); if (intermediateRank == -1) { info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) { resources->type = P2P_DIRECT; recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { if (ncclCuMemEnable()) { // cuMem API support resources->type = P2P_CUMEM; TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/CUMEM", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); } else { // Legacy CUDA IPC resources->type = P2P_IPC; } recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; } } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; } tpProxyRank = comm->topParentRanks[info->rank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc)); return ncclSuccess; } /* Connect/Send to this peer */ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct p2pResources* resources = (struct p2pResources*)send->transportResources; struct ncclRecvMem* remDevMem = NULL; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); char* buff = (char*)(remDevMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */ if (resources->sendDevMem == NULL) return ncclInternalError; // We should not use read + memcpy send->conn.buffs[p] = (char*)(resources->sendDevMem+1); } else { send->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } } if (useMemcpy) { send->conn.tail = &resources->proxyInfo.ceRecvMem->tail; send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo; send->conn.head = &resources->proxyInfo.devShm->sendMem.head; // Send SIMPLE buff to proxy, and replace it by local buffer NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0)); send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff; } else { send->conn.tail = &remDevMem->tail; send->conn.head = &resources->sendDevMem->head; send->conn.ptrExchange = &resources->sendDevMem->ptrExchange; send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange; } return ncclSuccess; } /* Connect/Recv from this peer */ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct p2pResources* resources = (struct p2pResources*)recv->transportResources; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; struct ncclSendMem* remDevMem = NULL; if (useMemcpy) { char shmPath[PATH_MAX]; sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); resources->shmSize = info->shmSize; // Attach to peer's SHM segment NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle)); recv->conn.tail = &resources->devShm->recvMem.tail; 
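  // Illustrative sketch, an assumption about the protocol rather than code from nccl-2.18.3:
  // the head/tail pointers wired up here behave like a bounded ring of NCCL_STEPS slots shared
  // between a producer (the side filling the buffer) and a consumer (the side draining it):
  //
  //   uint64_t head = 0, tail = 0;             // head: steps consumed, tail: steps produced
  //   // producer:
  //   //   wait while (tail - head) >= NCCL_STEPS   (ring full; re-read head)
  //   //   fill slot tail % NCCL_STEPS, then tail++ to publish it
  //   // consumer:
  //   //   wait while (head == tail)                (ring empty; re-read tail)
  //   //   drain slot head % NCCL_STEPS, then head++ to return the credit
  //
  // Here the receiving side polls recvMem.tail in the shared segment and advances sendMem.head.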
recv->conn.head = &resources->devShm->sendMem.head; } else { NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); struct ncclRecvMem* devMem = resources->recvDevMem; recv->conn.tail = &devMem->tail; recv->conn.head = &remDevMem->head; recv->conn.ptrExchange = &remDevMem->ptrExchange; recv->conn.redOpArgExchange = remDevMem->redOpArgExchange; } char* buff = (char*)(resources->recvDevMem+1); for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */ recv->conn.buffs[p] = (char*)(remDevMem+1); } else { recv->conn.buffs[p] = buff; buff += comm->buffSizes[p]; } } return ncclSuccess; } ncclResult_t p2pSendFree(struct ncclConnector* send) { struct p2pResources* resources = (struct p2pResources*)send->transportResources; if (resources) { if (ncclCuMemEnable()) { // cuMem API support if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); } else { if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); } free(resources); } return ncclSuccess; } ncclResult_t p2pRecvFree(struct ncclConnector* recv) { struct p2pResources* resources = (struct p2pResources*)recv->transportResources; if (resources) { if (ncclCuMemEnable()) { // cuMem API support if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc)); if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc)); } else { if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); if (useMemcpy) { NCCLCHECK(ncclShmClose(resources->handle)); } } free(resources); } return ncclSuccess; } static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { if (useMemcpy) { // CE memcpy support struct p2pShmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); connection->transportResources = proxyInfo; NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE])); char shmPath[PATH_MAX]; shmPath[0] = '\0'; proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); // Create a SHM segment for the peer to attach to NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle)); TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize); memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError; memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); } else { if (reqSize != sizeof(int)) return ncclInternalError; int size = *((int*)reqBuff); if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr)); p2pBuff->size = size; if (ncclCuMemEnable()) { // cuMem API support struct p2pCuMemProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); 
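  // Illustrative sketch, not part of nccl-2.18.3, of the cuMem round trip that
  // ncclP2pAllocateShareableBuffer (just above) and ncclP2pImportShareableBuffer (on the peer)
  // perform around this point; the driver calls are the ones used in this file, error handling
  // and the prop/accessDesc/size/fd variables are placeholders:
  //
  //   // exporter (this proxy):
  //   CUmemGenericAllocationHandle h;
  //   cuMemCreate(&h, size, &prop, 0);                            // physical allocation
  //   cuMemExportToShareableHandle(&fd, h, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
  //   // ... pass fd to the peer process over a Unix domain socket (ncclProxyClientConvertFdBlocking) ...
  //
  //   // importer (the peer):
  //   CUmemGenericAllocationHandle peerH; CUdeviceptr dptr;
  //   cuMemImportFromShareableHandle(&peerH, (void*)(uintptr_t)fd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR);
  //   cuMemAddressReserve(&dptr, size, 0, 0, 0);                  // VA range on the importing GPU
  //   cuMemMap(dptr, size, 0, peerH, 0);
  //   cuMemSetAccess(dptr, size, &accessDesc, 1);                 // grant RW access to the local device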
      memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff));
      connection->transportResources = proxyInfo;
    } else {
      connection->transportResources = p2pBuff->directPtr;
    }
  }
  *done = 1;
  return ncclSuccess;
}

static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  if (reqSize != sizeof(int)) return ncclInternalError;
  int size = *((int*)reqBuff);
  if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
  struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
  NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
  p2pBuff->size = size;
  if (ncclCuMemEnable()) {
    // cuMem API support
    struct p2pCuMemProxyInfo* proxyInfo;
    NCCLCHECK(ncclCalloc(&proxyInfo, 1));
    memcpy(&proxyInfo->p2pBuff, p2pBuff, sizeof(*p2pBuff));
    connection->transportResources = proxyInfo;
  } else {
    connection->transportResources = p2pBuff->directPtr;
  }
  *done = 1;
  return ncclSuccess;
}

static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;

  if (reqSize != sizeof(void*)) return ncclInternalError;
  proxyInfo->recvFifo = *((char**)reqBuff);

  CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(cudaEventCreate(proxyInfo->events+i));
  }
  connection->proxyAppendPtr = &connection->proxyAppend;
  return ncclSuccess;
}

static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
  // CE memcpy support
  if (useMemcpy) {
    struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
    if (proxyInfo) {
      NCCLCHECK(ncclShmClose(proxyInfo->handle));
      NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
      NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
      CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
      for (int i=0; i<NCCL_STEPS; i++) {
        CUDACHECK(cudaEventDestroy(proxyInfo->events[i]));
      }
      free(proxyInfo);
    }
  } else {
    if (ncclCuMemEnable()) {
      // cuMem API support
      struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources;
      if (proxyInfo) {
        struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff;
        ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc);
        ncclCudaFree(p2pBuff->directPtr);
        free(proxyInfo);
      }
    } else {
      // Do not check return code as CUDA may have already shut down
      ncclCudaFree(connection->transportResources);
    }
  }
  return ncclSuccess;
}

static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
  if (ncclCuMemEnable()) {
    struct p2pCuMemProxyInfo *proxyInfo = (struct p2pCuMemProxyInfo *) connection->transportResources;
    if (proxyInfo) {
      struct ncclP2pBuff *p2pBuff = &proxyInfo->p2pBuff;
      ncclP2pFreeShareableBuffer(&p2pBuff->ipcDesc);
      ncclCudaFree(p2pBuff->directPtr);
      free(proxyInfo);
    }
  } else {
    // Do not check return code as CUDA may have already shut down
    ncclCudaFree(connection->transportResources);
  }
  return ncclSuccess;
}

// CE memcpy support
static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }
  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct p2pShmProxyInfo* resources = (struct p2pShmProxyInfo*) (sub->connection->transportResources);
      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
        resources->step = sub->base + sub->nsteps;
        args->done++;
        continue;
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
        // Check GPU has sent everything
        if ((*recvTail > sub->base+sub->transmitted)) {
          int size = sizesFifo[buffSlot];
          CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          sub->transmitted += args->sliceSteps;
        }
      }
      if (sub->done < sub->transmitted) {
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
        cudaError_t res = cudaEventQuery(resources->events[buffSlot]);
        if (res != cudaErrorNotReady) CUDACHECK(res);
        if (res == cudaSuccess) {
          sub->done += args->sliceSteps;
          // Notify SHM
          resources->shm->recvMem.tail = sub->base + sub->done;
        }
        if (sub->done == sub->nsteps) {
          resources->step = sub->base + sub->nsteps;
          args->done++;
        }
      }
    }
    if (args->done == args->nsubs) {
      args->state = ncclProxyOpNone;
    }
  }
  return ncclSuccess;
}

struct ncclTransport p2pTransport = {
  "P2P",
  p2pCanConnect,
  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL },
  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL }
};

static void initCeOperation() {
  static int init = 0;
  if (!init) {
    useMemcpy = ncclParamP2pUseCudaMemcpy();
    if (useMemcpy) {
      p2pTransport.send.proxyConnect = p2pSendProxyConnect;
      p2pTransport.send.proxyProgress = p2pSendProxyProgress;
    }
    init = 1;
  }
}
nccl-2.18.3-1/src/transport/shm.cc000066400000000000000000000435301444201535000166060ustar00rootroot00000000000000/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "comm.h"
#include "shm.h"

struct shmConnectInfo {
  char shmName[7];
  int shmSize;
};
static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");

struct shmSendResources {
  int remShmSize;
  struct ncclRecvMem* remHostMem;
  struct ncclRecvMem* devRemHostMem;
  ncclShmHandle_t remHandle;
  int shmSize;
  struct ncclSendMem* hostMem;
  struct ncclSendMem* devHostMem;
  ncclShmHandle_t hostHandle;
};

struct shmRecvResources {
  int remShmSize;
  struct ncclSendMem* remHostMem;
  struct ncclSendMem* devRemHostMem;
  ncclShmHandle_t remHandle;
  int shmSize;
  struct ncclRecvMem* hostMem;
  struct ncclRecvMem* devHostMem;
  ncclShmHandle_t hostHandle;
};

#define SHM_SEND_SIDE 1
#define SHM_RECV_SIDE 2
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0);
NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both
static int useMemcpySend = 0;
static int useMemcpyRecv = 0;
NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-side, 2 is receiver-side
static int shmLocality = 0;
static void initCeOperation();

/* Determine whether two peers can communicate with SHM */
static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  *ret = 0;
  initCeOperation();

  if (ncclParamShmDisable() == 1) return ncclSuccess;

  int useNet = 0;
  NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
  if (useNet) return ncclSuccess;

  // Same host?
  TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
  if (info1->hostHash != info2->hostHash) return ncclSuccess;

  // Common /dev/shm (between containers) ?
  TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev);
  if (info1->shmDev != info2->shmDev) return ncclSuccess;

  *ret = 1;
  return ncclSuccess;
}

#define MAX_SHM_NAME_LEN 1024

/* Create and return connect structures for this peer to connect to me */
static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
  struct shmSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;

  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;

  char shmPath[PATH_MAX];
  shmPath[0] = '\0';
  int shmSize = sizeof(struct ncclSendMem);
  if (shmLocality == SHM_SEND_SIDE) {
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
  }
  info->shmSize = resources->shmSize = shmSize;
  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));

  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%d] -> %d[%d] via SHM/%s/%s", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct");
  return ncclSuccess;
}

static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
  struct shmRecvResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;

  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;

  char shmPath[PATH_MAX];
  shmPath[0] = '\0';
  int shmSize = sizeof(struct ncclRecvMem);
  if (shmLocality == SHM_RECV_SIDE) {
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
  }
  info->shmSize = resources->shmSize = shmSize;
  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
  return ncclSuccess;
}

struct shmProxyInfo {
  struct ncclRecvMem* ceRecvMem;
  char* devFifo;
  char* shmFifo;
  struct ncclSendMem* sendMem;
  struct ncclRecvMem* recvMem;

  // used by progress only
  uint64_t step;
  cudaStream_t stream;
  cudaEvent_t events[NCCL_STEPS];
};

/* Connect to this peer */
static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  // Setup device pointers
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;

  char shmPath[PATH_MAX];
  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
  resources->remShmSize = info->shmSize;
  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));

  char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    send->conn.buffs[p] = buff;
    buff += comm->buffSizes[p];
  }
  send->conn.tail = &resources->devRemHostMem->tail;
  send->conn.head = &resources->devHostMem->head;

  if (useMemcpyRecv) {
    send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
  }
  if (useMemcpySend) {
    int tpProxyRank;
    tpProxyRank = comm->topParentRanks[comm->rank];
    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
    struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
    send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
    send->conn.tail = &proxyInfo.ceRecvMem->tail;
    send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
  }
  return ncclSuccess;
}

static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  // Setup device pointers
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;

  char shmPath[PATH_MAX];
  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
  resources->remShmSize = info->shmSize;
  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));

  char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    recv->conn.buffs[p] = buff;
    buff += comm->buffSizes[p];
  }
  recv->conn.head = &resources->devRemHostMem->head;
  recv->conn.tail = &resources->devHostMem->tail;

  if (useMemcpyRecv) {
    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
    struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
    NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
    recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
    recv->conn.tail = &proxyInfo.ceRecvMem->tail;
  }
  return ncclSuccess;
}

static ncclResult_t shmSendFree(struct ncclConnector* send) {
  struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
  if (resources) {
    NCCLCHECK(ncclShmClose(resources->hostHandle));
    NCCLCHECK(ncclShmClose(resources->remHandle));
    free(resources);
  }
  return ncclSuccess;
}

static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  if (resources) {
    NCCLCHECK(ncclShmClose(resources->hostHandle));
    NCCLCHECK(ncclShmClose(resources->remHandle));
    free(resources);
  }
  return ncclSuccess;
}

static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(proxyInfo, reqBuff, reqSize);
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
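  // The CE path stages data through devFifo on a dedicated non-blocking stream;
  // one event per step slot lets the proxy progress function poll for copy completion asynchronously.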
  CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(cudaEventCreate(proxyInfo->events+i));
  }
  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}

static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(proxyInfo, reqBuff, reqSize);
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
  CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(cudaEventCreate(proxyInfo->events+i));
  }
  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}

static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  if (resources) {
    CUDACHECK(cudaStreamDestroy(resources->stream));
    NCCLCHECK(ncclCudaFree(resources->devFifo));
    NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
    for (int i=0; i<NCCL_STEPS; i++) {
      CUDACHECK(cudaEventDestroy(resources->events[i]));
    }
    free(connection->transportResources);
  }
  return ncclSuccess;
}

static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  if (resources) {
    CUDACHECK(cudaStreamDestroy(resources->stream));
    NCCLCHECK(ncclCudaFree(resources->devFifo));
    NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
    for (int i=0; i<NCCL_STEPS; i++) {
      CUDACHECK(cudaEventDestroy(resources->events[i]));
    }
    free(connection->transportResources);
  }
  return ncclSuccess;
}

static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }
  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
        resources->step = sub->base + sub->nsteps;
        args->done++;
        continue;
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
        // Check GPU has sent everything
        if ((*recvTail > sub->base+sub->transmitted)) {
          int size = sizesFifo[buffSlot];
          CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          resources->recvMem->sizesFifo[buffSlot] = size;
          __sync_synchronize(); // make sure sizesFifo is visible
          sub->transmitted += args->sliceSteps;
        }
      }
      if (sub->done < sub->transmitted) {
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
        cudaError_t res = cudaEventQuery(resources->events[buffSlot]);
        if (res != cudaErrorNotReady) CUDACHECK(res);
        if (res == cudaSuccess) {
          sub->done += args->sliceSteps;
          // Notify SHM
          resources->recvMem->tail = sub->base + sub->done;
        }
        if (sub->done == sub->nsteps) {
          resources->step = sub->base + sub->nsteps;
          args->done++;
        }
      }
    }
    if (args->done == args->nsubs) {
      args->state = ncclProxyOpNone;
    }
  }
  return ncclSuccess;
}

static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }
  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = proxyState->buffSizes[p] / NCCL_STEPS;
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
      if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
        resources->step = sub->base + sub->nsteps;
        args->done++;
        continue;
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile int* sizesFifo = resources->recvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->recvMem->tail;
        // Check data is ready in SHM
        if ((*recvTail > sub->base+sub->transmitted)) {
          int size = sizesFifo[buffSlot];
          CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          sub->transmitted += args->sliceSteps;
        }
      }
      if (sub->done < sub->transmitted) {
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
        cudaError_t res = cudaEventQuery(resources->events[buffSlot]);
        if (res != cudaErrorNotReady) CUDACHECK(res);
        if (res == cudaSuccess) {
          sub->done += args->sliceSteps;
          // Notify GPU
          resources->ceRecvMem->tail = sub->base + sub->done;
        }
        if (sub->done == sub->nsteps) {
          resources->step = sub->base + sub->nsteps;
          args->done++;
        }
      }
    }
    if (args->done == args->nsubs) {
      args->state = ncclProxyOpNone;
    }
  }
  return ncclSuccess;
}

struct ncclTransport shmTransport = {
  "SHM",
  shmCanConnect,
  { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
};

static void initCeOperation() {
  static int init = 0;
  if (!init) {
    useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 1);
    useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);
    if (useMemcpySend) {
      shmTransport.send.proxyConnect = shmSendProxyConnect;
      shmTransport.send.proxyFree = shmSendProxyFree;
      shmTransport.send.proxyProgress = shmSendProxyProgress;
    }
    if (useMemcpyRecv) {
      shmTransport.recv.proxyConnect = shmRecvProxyConnect;
      shmTransport.recv.proxyFree = shmRecvProxyFree;
      shmTransport.recv.proxyProgress = shmRecvProxyProgress;
    }
    shmLocality = ncclParamShmLocality();
    if (shmLocality != SHM_SEND_SIDE && shmLocality != SHM_RECV_SIDE) {
      WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)");
      shmLocality = SHM_RECV_SIDE;
    }
    init = 1;
  }
}