diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 0c1babcdd..4dee63996 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,7 +20,7 @@ jobs: - name: Run cpplint run: | - CPPSOURCES=$(find ./src ./include ./python ./test -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') + CPPSOURCES=$(find ./src ./include ./python ./test ./apps -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES} pylint: diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h index b6e91975a..7f507927b 100644 --- a/apps/nccl/include/nccl.h +++ b/apps/nccl/include/nccl.h @@ -17,18 +17,22 @@ typedef struct ncclComm* ncclComm_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 -typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; +typedef struct { + char internal[NCCL_UNIQUE_ID_BYTES]; +} ncclUniqueId; /* Error type */ -typedef enum { ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidArgument = 4, - ncclInvalidUsage = 5, - ncclRemoteError = 6, - ncclInProgress = 7, - ncclNumResults = 8 } ncclResult_t; +typedef enum { + ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6, + ncclInProgress = 7, + ncclNumResults = 8 +} ncclResult_t; #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL @@ -46,40 +50,41 @@ typedef struct ncclConfig_v21700 { int cgaClusterSize; int minCTAs; int maxCTAs; - const char *netName; + const char* netName; int splitShare; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. * Not initialized config will result in NCCL error. */ -#define NCCL_CONFIG_INITIALIZER { \ - sizeof(ncclConfig_t), /* size */ \ - 0xcafebeef, /* magic */ \ - NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ - NCCL_CONFIG_UNDEF_INT, /* blocking */ \ - NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ - NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ - NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ - NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ -} +#define NCCL_CONFIG_INITIALIZER \ + { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + } /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library */ -ncclResult_t ncclGetVersion(int *version); -ncclResult_t pncclGetVersion(int *version); +ncclResult_t ncclGetVersion(int* version); +ncclResult_t pncclGetVersion(int* version); /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be * called once and the Id should be distributed to all ranks in the * communicator before calling ncclCommInitRank. */ -ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); /* Create a new communicator (multi thread/process version) with a configuration * set by users. 
*/ -ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); /* Creates a new communicator (multi thread/process version). @@ -88,7 +93,7 @@ ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId * ncclCommInitRank. * ncclCommInitRank implicitly syncronizes with other ranks, so it must be * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ -ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); /* Creates a clique of communicators (single process version). @@ -97,7 +102,7 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). * If devlist is NULL, the first ndev CUDA devices are used. * Order of devlist defines user-order of processors within the communicator. */ -ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); /* Finalize a communicator. ncclCommFinalize flushes all issued communications, @@ -105,16 +110,16 @@ ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); * when the communicator is globally quiescent and related resources are freed; then, * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator * itself) without blocking. */ -ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t ncclCommFinalize(ncclComm_t comm); ncclResult_t pncclCommFinalize(ncclComm_t comm); /* Frees local resources associated with communicator object. */ -ncclResult_t ncclCommDestroy(ncclComm_t comm); +ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); /* Frees resources associated with communicator object and aborts any operations * that might still be running on the device. */ -ncclResult_t ncclCommAbort(ncclComm_t comm); +ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); /* Creates one or more communicators from an existing one. @@ -124,74 +129,81 @@ ncclResult_t pncclCommAbort(ncclComm_t comm); * and will therefore return a NULL communicator. * If config is NULL, the new communicator will inherit the original communicator's * configuration*/ -ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); -ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config); +ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config); /* Returns a string for each error code. 
*/ -const char* ncclGetErrorString(ncclResult_t result); +const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); /* Returns a human-readable message of the last error that occurred. * comm is currently unused and can be set to NULL */ -const char* ncclGetLastError(ncclComm_t comm); +const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); /* Checks whether the comm has encountered any asynchronous errors */ -ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); -ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); +ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); /* Gets the number of ranks in the communicator clique. */ -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); /* Returns the cuda device number associated with the communicator. */ -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); /* Returns the user-ordered "rank" associated with the communicator. */ -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); /* Reduction operation selector */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; -typedef enum { ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - ncclAvg = 4, - /* ncclNumOps: The number of built-in ncclRedOp_t values. Also - * serves as the least possible value for dynamic ncclRedOp_t's - * as constructed by ncclRedOpCreate*** functions. */ - ncclNumOps = 5, - /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(ncclRedOp_t) when compared to previous NCCL versions to - * maintain ABI compatibility. */ - ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) - } ncclRedOp_t; +typedef enum { + ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclAvg = 4, + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. 
*/ + ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) +} ncclRedOp_t; /* Data types */ -typedef enum { ncclInt8 = 0, ncclChar = 0, - ncclUint8 = 1, - ncclInt32 = 2, ncclInt = 2, - ncclUint32 = 3, - ncclInt64 = 4, - ncclUint64 = 5, - ncclFloat16 = 6, ncclHalf = 6, - ncclFloat32 = 7, ncclFloat = 7, - ncclFloat64 = 8, ncclDouble = 8, +typedef enum { + ncclInt8 = 0, + ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, + ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, + ncclHalf = 6, + ncclFloat32 = 7, + ncclFloat = 7, + ncclFloat64 = 8, + ncclDouble = 8, #if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__) - ncclBfloat16 = 9, - ncclFp8E4M3 = 10, - ncclFp8E5M2 = 11, - ncclNumTypes = 12 + ncclBfloat16 = 9, + ncclFp8E4M3 = 10, + ncclFp8E5M2 = 11, + ncclNumTypes = 12 #elif defined(__CUDA_BF16_TYPES_EXIST__) - ncclBfloat16 = 9, - ncclNumTypes = 10 + ncclBfloat16 = 9, + ncclNumTypes = 10 #else - ncclNumTypes = 9 + ncclNumTypes = 9 #endif } ncclDataType_t; @@ -216,8 +228,10 @@ typedef enum { * will be dereferenced. Upon return, the newly created operator's handle * is stored in *op*. */ -ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); -ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, + ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, + ncclScalarResidence_t residence, ncclComm_t comm); /* * ncclRedOpDestroy @@ -253,10 +267,10 @@ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); * * In-place operation will happen if sendbuff == recvbuff. */ -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + int root, ncclComm_t comm, cudaStream_t stream); /* * (deprecated) Broadcast (in-place) @@ -267,10 +281,10 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc * * This operation is implicitly in place. */ -ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream); /* * Broadcast @@ -281,10 +295,10 @@ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int r * * In-place operation will happen if sendbuff == recvbuff. 
*/ -ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); + ncclComm_t comm, cudaStream_t stream); /* * All-Reduce @@ -294,10 +308,10 @@ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, * * In-place operation will happen if sendbuff == recvbuff. */ -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + ncclComm_t comm, cudaStream_t stream); /* * Reduce-Scatter @@ -310,12 +324,10 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, * * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. */ -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); -ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); /* * All-Gather @@ -327,10 +339,10 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, * * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. */ -ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, + ncclComm_t comm, cudaStream_t stream); /* * Send @@ -344,10 +356,10 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. 
*/ -ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); /* * Receive @@ -361,10 +373,10 @@ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t dataty * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. */ -ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); /* All-To-All * @@ -374,10 +386,10 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, in * * In-place operation will happen if sendbuff == recvbuff. */ -ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, + cudaStream_t stream); /*! @brief Opaque handle to MSCCL algorithm */ typedef int mscclAlgoHandle_t; @@ -387,8 +399,8 @@ typedef int mscclAlgoHandle_t; * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL * scheduler instead of end users. */ -ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); -ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); +ncclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank); +ncclResult_t pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank); /*! @brief MSCCL Run Algorithm * @@ -397,16 +409,14 @@ ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *ms * is a general-purposed API. This API is expected to be called by MSCCL * scheduler instead of end users. 
*/ -ncclResult_t mscclRunAlgo( - const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], - void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], - size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, - mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pmscclRunAlgo( - const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], - void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], - size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, - mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream); +ncclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], void* recvBuff, + const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, + int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pmscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], void* recvBuff, + const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, + int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, + cudaStream_t stream); /*! @brief MSCCL Load Algorithm * @@ -445,7 +455,7 @@ ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); * a single NCCL operation. Nothing will be started on the CUDA stream until * ncclGroupEnd. */ -ncclResult_t ncclGroupStart(); +ncclResult_t ncclGroupStart(); ncclResult_t pncclGroupStart(); /* @@ -455,11 +465,11 @@ ncclResult_t pncclGroupStart(); * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations * need to be called after ncclGroupEnd. 
*/ -ncclResult_t ncclGroupEnd(); +ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); #ifdef __cplusplus -} // end extern "C" +} // end extern "C" #endif -#endif // end include guard +#endif // end include guard diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp index 418ee0695..35c2b60c4 100644 --- a/apps/nccl/src/allgather.hpp +++ b/apps/nccl/src/allgather.hpp @@ -14,8 +14,8 @@ template __global__ void __launch_bounds__(1024, 1) - allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, size_t rank, - [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, + size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; const size_t lid = tid % WARP_SIZE; const size_t wid = tid / WARP_SIZE; @@ -53,8 +53,10 @@ __global__ void __launch_bounds__(1024, 1) char* src = reinterpret_cast(smChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } @@ -69,8 +71,10 @@ __global__ void __launch_bounds__(1024, 1) char* src = reinterpret_cast(smChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } @@ -89,8 +93,10 @@ __global__ void __launch_bounds__(1024, 1) char* dst = reinterpret_cast(smChans[peerIdx].dst_); char* src = reinterpret_cast(smChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); - smChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); - smChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); + smChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); + smChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); } else { smChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE); } @@ -100,11 +106,11 @@ __global__ void __launch_bounds__(1024, 1) template cudaError_t allgather(T* buff, 
[[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff, - mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, int rank, int nRanksPerNode, int worldSize, - size_t nelems, cudaStream_t stream) { - allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize, nRanksPerNode, - nelems * sizeof(T) / sizeof(int)); + mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, int rank, + int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { + allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize, + nRanksPerNode, nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); } -#endif // ALLGATHER_HPP_ +#endif // ALLGATHER_HPP_ diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 078634c5d..23fd0c243 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -132,8 +132,8 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { template __global__ void __launch_bounds__(32, 1) allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - size_t channelDataOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, - uint32_t flag) { + size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, + size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; if (sizeof(T) == 2) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); @@ -142,11 +142,9 @@ __global__ void __launch_bounds__(32, 1) const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int tid = threadIdx.x + localBlockIdx * blockDim.x; const int peerIdx = blockIdx.x / nBlocksPerPeer; - // Double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : 4 * worldSize * nelems * sizeof(mscclpp::LL8Packet); size_t srcOffset = channelDataOffset; - size_t scratchOffset = scratchBaseOffset + rank * nelems * sizeof(mscclpp::LL8Packet); - void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); + size_t scratchOffset = channelScratchOffset + rank * nelems * sizeof(mscclpp::LL8Packet); + void* scratchBuff = (void*)((char*)scratch + channelScratchOffset); uint32_t* src = (uint32_t*)((char*)buff); uint32_t* dst = (uint32_t*)((char*)resultBuff); @@ -178,7 +176,8 @@ __global__ void __launch_bounds__(32, 1) template __global__ void __launch_bounds__(1024, 1) allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - size_t channelDataOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { + size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, + size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; nelems = nelems / (sizeof(int) / sizeof(T)); @@ -192,12 +191,9 @@ __global__ void __launch_bounds__(1024, 1) const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; const int tid = threadIdx.x + localBlockIdx * blockDim.x; - // double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LL8Packet); - void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); - size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet); - size_t scratchResultOffset = - (flag & 1) ? 
2 * nPkts * sizeof(mscclpp::LL8Packet) : 3 * nPkts * sizeof(mscclpp::LL8Packet); + void* scratchBuff = (void*)((char*)scratch + channelScratchOffset); + size_t scratchOffset = channelScratchOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet); + size_t scratchResultOffset = channelScratchOffset + 2 * nPkts * sizeof(mscclpp::LL8Packet); size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int) + channelDataOffset; uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int)); uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); @@ -246,8 +242,8 @@ __global__ void __launch_bounds__(1024, 1) template __global__ void __launch_bounds__(512, 1) allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, size_t channelOutDataOffset, int rank, - int nRanksPerNode, int worldSize, size_t nelems) { + mscclpp::DeviceHandle* smOutChannels, size_t channelOutDataOffset, + size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems) { const int nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) @@ -257,7 +253,7 @@ __global__ void __launch_bounds__(512, 1) auto smOutChans = smOutChannels + chanOffset; int4* buff4 = reinterpret_cast(buff); - int4* scratch4 = reinterpret_cast(scratch); + int4* scratch4 = reinterpret_cast((char*)scratch + channelScratchOffset); int4* resultBuff4 = reinterpret_cast(resultBuff); // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` @@ -278,6 +274,7 @@ __global__ void __launch_bounds__(512, 1) const size_t chunkSizePerRank = nNeededBlocks * nInt4PerChunk; const size_t blockOffset = nInt4PerChunk * blockIdx.x; const size_t scratchChunkRankOffset = chunkSizePerRank * rank; + const size_t scratchBaseOffsetInt4 = channelScratchOffset / sizeof(int4); __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; @@ -301,7 +298,7 @@ __global__ void __launch_bounds__(512, 1) const int peerIdx = (i + blockIdx.x) % nPeer; const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock]; - channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val); + channels[peerIdx].write(scratchBaseOffsetInt4 + scratchChunkRankOffset + blockOffset + idx, val); } } @@ -338,7 +335,7 @@ __global__ void __launch_bounds__(512, 1) const int peerIdx = (i + blockIdx.x) % nPeer; const int remoteRank = (peerIdx < rank) ? 
peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock]; - channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val); + channels[peerIdx].write(scratchBaseOffsetInt4 + scratchChunkRankOffset + blockOffset + idx, val); } } @@ -367,15 +364,16 @@ __global__ void __launch_bounds__(512, 1) template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, mscclpp::DeviceHandle* smOutChannels, size_t channelInOffset, - size_t channelOutOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, - cudaStream_t stream) { + size_t channelOutOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, + size_t nelems, cudaStream_t stream) { static uint32_t flag = 1; if (sizeof(T) * nelems < worldSize * sizeof(int)) { int nBlocks = 7; int nThreadsPerBlock = 32; allreduceAllToAll<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, - rank, nRanksPerNode, worldSize, nelems, flag++); + channelScratchOffset, rank, nRanksPerNode, worldSize, + nelems, flag++); } else if (sizeof(T) * nelems <= (1 << 20)) { int nBlocks = 28; int nThreadsPerBlock = 1024; @@ -383,13 +381,15 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< nBlocks = 56; nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; } - allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, rank, - nRanksPerNode, worldSize, nelems, flag++); + allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, + channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, + flag++); } else { int nBlocks = 35; int nThreadsPerBlock = 512; allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, - channelOutOffset, rank, nRanksPerNode, worldSize, nelems); + channelOutOffset, channelScratchOffset, rank, nRanksPerNode, + worldSize, nelems); } return cudaGetLastError(); diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp index cddc69625..25c74e71b 100644 --- a/apps/nccl/src/common.hpp +++ b/apps/nccl/src/common.hpp @@ -12,6 +12,6 @@ #endif constexpr int NRANKS_PER_NODE = 8; -constexpr int SCRATCH_SIZE = 1024 * 1024 * 70; // 35 thread-blocks * 8 ranks * 256KB = 70MB +constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB -#endif // NCCL_COMMON_HPP_ +#endif // NCCL_COMMON_HPP_ diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index d312ee66b..01bc56e0b 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -33,9 +33,7 @@ struct channelKey { const void* buff; size_t bytes; - bool operator==(const channelKey& other) const { - return buff == other.buff && bytes == other.bytes; - } + bool operator==(const channelKey& other) const { return buff == other.buff && bytes == other.bytes; } }; namespace std { @@ -62,6 +60,9 @@ struct ncclComm { std::unordered_map channelScratchInfos; std::shared_ptr scratchBuff; std::vector remoteScratchRegMemories; + + uint32_t numScratchBuff; + uint32_t buffFlag; }; static size_t ncclTypeSize(ncclDataType_t type) { @@ -146,7 +147,7 @@ static std::shared_ptr> setupSmChannel std::shared_ptr> ptr = mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); mscclpp::memcpyCuda>(ptr.get(), smChannelDeviceHandles.data(), - smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); + smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); return ptr; } @@ -164,8 +165,7 @@ NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { return ncclSuccess; } -NCCL_API ncclResult_t 
ncclCommInitRankConfig(ncclComm_t*, int, ncclUniqueId, int, - ncclConfig_t*) { +NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t*, int, ncclUniqueId, int, ncclConfig_t*) { // TODO: implement this function return ncclInternalError; } @@ -206,6 +206,8 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI commPtr->comm = mscclppComm; commPtr->connections = std::move(connections); commPtr->smSemaphores = std::move(smSemaphores); + commPtr->buffFlag = 0; + commPtr->numScratchBuff = 2; commPtr->scratchBuff = mscclpp::allocExtSharedCuda(SCRATCH_SIZE); commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc); @@ -292,8 +294,7 @@ NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { return ncclSuccess; } -NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, - ncclScalarResidence_t, ncclComm_t) { +NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, ncclScalarResidence_t, ncclComm_t) { // TODO: implement this function return ncclInternalError; } @@ -303,20 +304,18 @@ NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) { return ncclInternalError; } -NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, - ncclRedOp_t, int, ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, + cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclBcast(void*, size_t, ncclDataType_t, int, ncclComm_t, - cudaStream_t) { +NCCL_API ncclResult_t ncclBcast(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, - int, ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } @@ -324,7 +323,8 @@ NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t comm, cudaStream_t stream) { // Checking if the parameters are valids - if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) return ncclInvalidArgument; + if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) + return ncclInvalidArgument; // Declarating variables size_t sendBytes, recvBytes; @@ -333,6 +333,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)recvbuff)); size_t offsetIn = (char*)sendbuff - (char*)sendBasePtr; size_t offsetOut = (char*)recvbuff - (char*)recvBasePtr; + uint32_t scratchBuffIdx = (++(comm->buffFlag)) % comm->numScratchBuff; + size_t offsetScratch = (SCRATCH_SIZE / comm->numScratchBuff) * scratchBuffIdx; int rank = comm->comm->bootstrap()->getRank(); channelKey sendKey{(void*)sendBasePtr, sendBytes}; channelKey recvKey{(void*)recvBasePtr, recvBytes}; @@ -340,21 +342,23 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t mscclpp::DeviceHandle* smOutChannels = nullptr; // Creating 
the channels - if(count * ncclTypeSize(datatype) <= (1 << 20)) { + if (count * ncclTypeSize(datatype) <= (1 << 20)) { auto sendIt = comm->channelScratchInfos.find(sendKey); if (sendIt == comm->channelScratchInfos.end()) { - std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + std::vector channels = + setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; sendIt = comm->channelScratchInfos.emplace(sendKey, channelInfo).first; } + smChannels = sendIt->second.smChannelDeviceHandles.get(); - } - else { + } else { std::vector remoteMemories; auto sendIt = comm->channelInInfos.find(sendKey); if (sendIt == comm->channelInInfos.end()) { - std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + std::vector channels = + setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; sendIt = comm->channelInInfos.emplace(sendKey, channelInfo).first; } @@ -363,7 +367,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t if (recvIt == comm->channelOutInfos.end()) { remoteMemories = setupRemoteMemories(comm->comm, rank, (void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc); - std::vector outChannels = setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + std::vector outChannels = + setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); ChannelInfo channelInfo{outChannels, setupSmChannelDeviceHandles(outChannels)}; recvIt = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } @@ -374,19 +379,20 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t switch (datatype) { case ncclFloat16: - CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, smOutChannels, offsetIn, offsetOut, - rank, NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); + CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, smOutChannels, + offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, + comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, smChannels, - smOutChannels, offsetIn, offsetOut, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, - comm->comm->bootstrap()->getNranks(), count, stream)); + smOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), + NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclInt32: case ncclUint32: - CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, offsetIn, offsetOut, - comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), - count, stream)); + CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, + offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, + comm->comm->bootstrap()->getNranks(), count, stream)); break; default: return ncclInvalidArgument; @@ -394,8 +400,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t return ncclSuccess; } -NCCL_API ncclResult_t 
ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, - ncclRedOp_t, ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t, + cudaStream_t) { // TODO: implement this function return ncclInternalError; } @@ -405,7 +411,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t size_t bytes = sendcount * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; - // Declarating variables + // Declarating variables size_t recvBytes; CUdeviceptr recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)recvbuff)); @@ -417,9 +423,8 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t auto it = comm->channelOutInfos.find(recvKey); if (it == comm->channelOutInfos.end()) { - std::vector remoteMemories = - setupRemoteMemories(comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, - mscclpp::Transport::CudaIpc); + std::vector remoteMemories = setupRemoteMemories( + comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, mscclpp::Transport::CudaIpc); std::vector channels = setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); std::vector> smChannelDeviceHandles; @@ -431,29 +436,26 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t smChannels = it->second.smChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { - CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut, - rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); + CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } else { - CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut, - rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); + CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } return ncclSuccess; } -NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, - cudaStream_t) { +NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, - cudaStream_t) { +NCCL_API ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclAllToAll(const void*, void*, size_t, ncclDataType_t, - ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclAllToAll(const void*, void*, size_t, ncclDataType_t, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } diff --git a/cmake/AddFormatTargets.cmake b/cmake/AddFormatTargets.cmake index b95ad447b..9829bd135 100644 --- a/cmake/AddFormatTargets.cmake +++ b/cmake/AddFormatTargets.cmake @@ -9,7 +9,7 @@ add_custom_target(format) find_program(CLANG_FORMAT clang-format) if(CLANG_FORMAT) message(STATUS "Found clang-format: ${CLANG_FORMAT}") - set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test) + set(FIND_DIRS 
${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps) add_custom_target(check-format-cpp ALL COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` )
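
For reference, a minimal host-side sketch of driving the NCCL-compatible API declared in apps/nccl/include/nccl.h. The shim in this diff implements ncclGetUniqueId, ncclCommInitRank, ncclAllReduce, ncclAllGather, and ncclCommDestroy; several other entry points still return ncclInternalError. This is an illustration only: the rank/size plumbing comes from argv, the out-of-band exchange of ncclUniqueId is only indicated by a comment, a single node is assumed, and error handling is abbreviated.

// sketch: one process per GPU, rank and nranks supplied by the launcher via argv
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include "nccl.h"

int main(int argc, char** argv) {
  if (argc < 3) return 1;
  int rank = std::atoi(argv[1]);    // this process's rank, 0..nranks-1
  int nranks = std::atoi(argv[2]);  // total number of ranks

  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);
  // ...share `id` from rank 0 with every other rank out of band (MPI, a socket, a file)...

  cudaSetDevice(rank);  // assumes a single node with one GPU per rank
  ncclComm_t comm;
  ncclResult_t res = ncclCommInitRank(&comm, nranks, id, rank);
  if (res != ncclSuccess) {
    std::printf("ncclCommInitRank failed: %s\n", ncclGetErrorString(res));
    return 1;
  }

  const size_t count = 1 << 20;  // elements per rank
  float *sendbuff = nullptr, *recvbuff = nullptr;
  cudaMalloc(&sendbuff, count * sizeof(float));
  cudaMalloc(&recvbuff, count * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Sum-reduce `count` floats across all ranks; the result lands in recvbuff on every rank.
  ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, ncclSum, comm, stream);
  cudaStreamSynchronize(stream);

  cudaFree(sendbuff);
  cudaFree(recvbuff);
  cudaStreamDestroy(stream);
  ncclCommDestroy(comm);
  return 0;
}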
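
The header documents that ncclAllGather runs in place when sendbuff == recvbuff + rank * sendcount, and the ncclAllGather path in nccl.cu checks for exactly that aliasing. A small sketch of the convention follows, assuming a communicator and stream set up as in the previous sketch; allGatherInPlace is an illustrative helper, not part of the library.

#include <cuda_runtime.h>
#include "nccl.h"

// In-place all-gather: rank i contributes the slice it already owns inside the gathered
// buffer, so its slot doubles as sendbuff and no separate send buffer is needed.
ncclResult_t allGatherInPlace(float* gathered, size_t sendcount, int rank, ncclComm_t comm, cudaStream_t stream) {
  float* myChunk = gathered + rank * sendcount;  // sendbuff == recvbuff + rank * sendcount
  return ncclAllGather(myChunk, gathered, sendcount, ncclFloat32, comm, stream);
}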
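
The functional change in this diff moves scratch double buffering out of the kernels: SCRATCH_SIZE grows to 2 x 70 MB so that two scratch regions fit, and ncclAllReduce selects one half per call by bumping comm->buffFlag, passing the chosen offset down as channelScratchOffset instead of deriving it from (flag & 1) inside allreduceAllToAll and allreduce7. The rotation is shown in isolation below, with names that mirror the diff; this is an illustration, not the library code itself.

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kScratchSize = 2ull * 1024 * 1024 * 70;  // two 70 MB halves, as in common.hpp

struct CommScratchState {
  uint32_t numScratchBuff = 2;  // number of rotating scratch regions (nccl.cu sets 2)
  uint32_t buffFlag = 0;        // bumped once per collective call
};

// Mirrors nccl.cu: scratchBuffIdx = (++buffFlag) % numScratchBuff;
//                  offsetScratch  = (SCRATCH_SIZE / numScratchBuff) * scratchBuffIdx;
size_t nextScratchOffset(CommScratchState& s) {
  uint32_t idx = (++s.buffFlag) % s.numScratchBuff;
  return (kScratchSize / s.numScratchBuff) * idx;
}

int main() {
  CommScratchState s;
  // Successive calls alternate between the two halves of the scratch buffer.
  for (int call = 0; call < 4; ++call) {
    std::printf("call %d -> channelScratchOffset = %zu bytes\n", call, nextScratchOffset(s));
  }
  return 0;
}

As with double buffering in general, the point is that back-to-back collectives write their LL packets into disjoint halves of the scratch buffer, so call N cannot read stale packets left behind by call N-1.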
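
For orientation, the host-side allreduce() in allreduce.hpp still dispatches by message size; the thresholds below are copied from the diff, and the helper is a hypothetical restatement of that branch, not an exported function.

#include <cstddef>

// allreduceAllToAll for tiny messages, allreduce7 (LL8 packets) up to 1 MiB, allreduce8 otherwise.
enum class AllreduceKernel { AllToAll, LL8 /* allreduce7 */, Chunked /* allreduce8 */ };

AllreduceKernel pickAllreduceKernel(size_t bytes, int worldSize) {
  if (bytes < static_cast<size_t>(worldSize) * sizeof(int)) return AllreduceKernel::AllToAll;
  if (bytes <= (1 << 20)) return AllreduceKernel::LL8;
  return AllreduceKernel::Chunked;
}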