diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 0c1babcdd..4dee63996 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,7 +20,7 @@ jobs: - name: Run cpplint run: | - CPPSOURCES=$(find ./src ./include ./python ./test -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') + CPPSOURCES=$(find ./src ./include ./python ./test ./apps -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES} pylint: diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h index b6e91975a..7f507927b 100644 --- a/apps/nccl/include/nccl.h +++ b/apps/nccl/include/nccl.h @@ -17,18 +17,22 @@ typedef struct ncclComm* ncclComm_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 -typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; +typedef struct { + char internal[NCCL_UNIQUE_ID_BYTES]; +} ncclUniqueId; /* Error type */ -typedef enum { ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidArgument = 4, - ncclInvalidUsage = 5, - ncclRemoteError = 6, - ncclInProgress = 7, - ncclNumResults = 8 } ncclResult_t; +typedef enum { + ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6, + ncclInProgress = 7, + ncclNumResults = 8 +} ncclResult_t; #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL @@ -46,40 +50,41 @@ typedef struct ncclConfig_v21700 { int cgaClusterSize; int minCTAs; int maxCTAs; - const char *netName; + const char* netName; int splitShare; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. * Not initialized config will result in NCCL error. */ -#define NCCL_CONFIG_INITIALIZER { \ - sizeof(ncclConfig_t), /* size */ \ - 0xcafebeef, /* magic */ \ - NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ - NCCL_CONFIG_UNDEF_INT, /* blocking */ \ - NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ - NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ - NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ - NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ -} +#define NCCL_CONFIG_INITIALIZER \ + { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + } /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library */ -ncclResult_t ncclGetVersion(int *version); -ncclResult_t pncclGetVersion(int *version); +ncclResult_t ncclGetVersion(int* version); +ncclResult_t pncclGetVersion(int* version); /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be * called once and the Id should be distributed to all ranks in the * communicator before calling ncclCommInitRank. */ -ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); /* Create a new communicator (multi thread/process version) with a configuration * set by users. 
*/ -ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); /* Creates a new communicator (multi thread/process version). @@ -88,7 +93,7 @@ ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId * ncclCommInitRank. * ncclCommInitRank implicitly syncronizes with other ranks, so it must be * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ -ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); /* Creates a clique of communicators (single process version). @@ -97,7 +102,7 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). * If devlist is NULL, the first ndev CUDA devices are used. * Order of devlist defines user-order of processors within the communicator. */ -ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); /* Finalize a communicator. ncclCommFinalize flushes all issued communications, @@ -105,16 +110,16 @@ ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); * when the communicator is globally quiescent and related resources are freed; then, * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator * itself) without blocking. */ -ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t ncclCommFinalize(ncclComm_t comm); ncclResult_t pncclCommFinalize(ncclComm_t comm); /* Frees local resources associated with communicator object. */ -ncclResult_t ncclCommDestroy(ncclComm_t comm); +ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); /* Frees resources associated with communicator object and aborts any operations * that might still be running on the device. */ -ncclResult_t ncclCommAbort(ncclComm_t comm); +ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); /* Creates one or more communicators from an existing one. @@ -124,74 +129,81 @@ ncclResult_t pncclCommAbort(ncclComm_t comm); * and will therefore return a NULL communicator. * If config is NULL, the new communicator will inherit the original communicator's * configuration*/ -ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); -ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config); +ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config); /* Returns a string for each error code. 
*/ -const char* ncclGetErrorString(ncclResult_t result); +const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); /* Returns a human-readable message of the last error that occurred. * comm is currently unused and can be set to NULL */ -const char* ncclGetLastError(ncclComm_t comm); +const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); /* Checks whether the comm has encountered any asynchronous errors */ -ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); -ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); +ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); /* Gets the number of ranks in the communicator clique. */ -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); /* Returns the cuda device number associated with the communicator. */ -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); /* Returns the user-ordered "rank" associated with the communicator. */ -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); /* Reduction operation selector */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; -typedef enum { ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - ncclAvg = 4, - /* ncclNumOps: The number of built-in ncclRedOp_t values. Also - * serves as the least possible value for dynamic ncclRedOp_t's - * as constructed by ncclRedOpCreate*** functions. */ - ncclNumOps = 5, - /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(ncclRedOp_t) when compared to previous NCCL versions to - * maintain ABI compatibility. */ - ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) - } ncclRedOp_t; +typedef enum { + ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclAvg = 4, + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. 
*/ + ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) +} ncclRedOp_t; /* Data types */ -typedef enum { ncclInt8 = 0, ncclChar = 0, - ncclUint8 = 1, - ncclInt32 = 2, ncclInt = 2, - ncclUint32 = 3, - ncclInt64 = 4, - ncclUint64 = 5, - ncclFloat16 = 6, ncclHalf = 6, - ncclFloat32 = 7, ncclFloat = 7, - ncclFloat64 = 8, ncclDouble = 8, +typedef enum { + ncclInt8 = 0, + ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, + ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, + ncclHalf = 6, + ncclFloat32 = 7, + ncclFloat = 7, + ncclFloat64 = 8, + ncclDouble = 8, #if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__) - ncclBfloat16 = 9, - ncclFp8E4M3 = 10, - ncclFp8E5M2 = 11, - ncclNumTypes = 12 + ncclBfloat16 = 9, + ncclFp8E4M3 = 10, + ncclFp8E5M2 = 11, + ncclNumTypes = 12 #elif defined(__CUDA_BF16_TYPES_EXIST__) - ncclBfloat16 = 9, - ncclNumTypes = 10 + ncclBfloat16 = 9, + ncclNumTypes = 10 #else - ncclNumTypes = 9 + ncclNumTypes = 9 #endif } ncclDataType_t; @@ -216,8 +228,10 @@ typedef enum { * will be dereferenced. Upon return, the newly created operator's handle * is stored in *op*. */ -ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); -ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, + ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, + ncclScalarResidence_t residence, ncclComm_t comm); /* * ncclRedOpDestroy @@ -253,10 +267,10 @@ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); * * In-place operation will happen if sendbuff == recvbuff. */ -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + int root, ncclComm_t comm, cudaStream_t stream); /* * (deprecated) Broadcast (in-place) @@ -267,10 +281,10 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc * * This operation is implicitly in place. */ -ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream); /* * Broadcast @@ -281,10 +295,10 @@ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int r * * In-place operation will happen if sendbuff == recvbuff. 
*/ -ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); + ncclComm_t comm, cudaStream_t stream); /* * All-Reduce @@ -294,10 +308,10 @@ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, * * In-place operation will happen if sendbuff == recvbuff. */ -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, + ncclComm_t comm, cudaStream_t stream); /* * Reduce-Scatter @@ -310,12 +324,10 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, * * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. */ -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); -ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); /* * All-Gather @@ -327,10 +339,10 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, * * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. */ -ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, + ncclComm_t comm, cudaStream_t stream); /* * Send @@ -344,10 +356,10 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. 
*/ -ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); /* * Receive @@ -361,10 +373,10 @@ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t dataty * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ * ncclGroupEnd section. */ -ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream); /* All-To-All * @@ -374,10 +386,10 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, in * * In-place operation will happen if sendbuff == recvbuff. */ -ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, + cudaStream_t stream); /*! @brief Opaque handle to MSCCL algorithm */ typedef int mscclAlgoHandle_t; @@ -387,8 +399,8 @@ typedef int mscclAlgoHandle_t; * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL * scheduler instead of end users. */ -ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); -ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); +ncclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank); +ncclResult_t pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, int rank); /*! @brief MSCCL Run Algorithm * @@ -397,16 +409,14 @@ ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *ms * is a general-purposed API. This API is expected to be called by MSCCL * scheduler instead of end users. 
*/ -ncclResult_t mscclRunAlgo( - const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], - void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], - size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, - mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pmscclRunAlgo( - const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], - void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], - size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, - mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream); +ncclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], void* recvBuff, + const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, + int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pmscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], void* recvBuff, + const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, + int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, + cudaStream_t stream); /*! @brief MSCCL Load Algorithm * @@ -445,7 +455,7 @@ ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); * a single NCCL operation. Nothing will be started on the CUDA stream until * ncclGroupEnd. */ -ncclResult_t ncclGroupStart(); +ncclResult_t ncclGroupStart(); ncclResult_t pncclGroupStart(); /* @@ -455,11 +465,11 @@ ncclResult_t pncclGroupStart(); * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations * need to be called after ncclGroupEnd. 
*/ -ncclResult_t ncclGroupEnd(); +ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); #ifdef __cplusplus -} // end extern "C" +} // end extern "C" #endif -#endif // end include guard +#endif // end include guard diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp index 418ee0695..35c2b60c4 100644 --- a/apps/nccl/src/allgather.hpp +++ b/apps/nccl/src/allgather.hpp @@ -14,8 +14,8 @@ template __global__ void __launch_bounds__(1024, 1) - allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, size_t rank, - [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, + size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; const size_t lid = tid % WARP_SIZE; const size_t wid = tid / WARP_SIZE; @@ -53,8 +53,10 @@ __global__ void __launch_bounds__(1024, 1) char* src = reinterpret_cast(smChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } @@ -69,8 +71,10 @@ __global__ void __launch_bounds__(1024, 1) char* src = reinterpret_cast(smChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } @@ -89,8 +93,10 @@ __global__ void __launch_bounds__(1024, 1) char* dst = reinterpret_cast(smChans[peerIdx].dst_); char* src = reinterpret_cast(smChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); - smChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); - smChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); + smChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); + smChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); } else { smChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE); } @@ -100,11 +106,11 @@ __global__ void __launch_bounds__(1024, 1) template cudaError_t allgather(T* buff, 
[[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff, - mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, int rank, int nRanksPerNode, int worldSize, - size_t nelems, cudaStream_t stream) { - allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize, nRanksPerNode, - nelems * sizeof(T) / sizeof(int)); + mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, int rank, + int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { + allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize, + nRanksPerNode, nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); } -#endif // ALLGATHER_HPP_ +#endif // ALLGATHER_HPP_ diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 078634c5d..23fd0c243 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -132,8 +132,8 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { template __global__ void __launch_bounds__(32, 1) allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - size_t channelDataOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, - uint32_t flag) { + size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, + size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; if (sizeof(T) == 2) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); @@ -142,11 +142,9 @@ __global__ void __launch_bounds__(32, 1) const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int tid = threadIdx.x + localBlockIdx * blockDim.x; const int peerIdx = blockIdx.x / nBlocksPerPeer; - // Double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : 4 * worldSize * nelems * sizeof(mscclpp::LL8Packet); size_t srcOffset = channelDataOffset; - size_t scratchOffset = scratchBaseOffset + rank * nelems * sizeof(mscclpp::LL8Packet); - void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); + size_t scratchOffset = channelScratchOffset + rank * nelems * sizeof(mscclpp::LL8Packet); + void* scratchBuff = (void*)((char*)scratch + channelScratchOffset); uint32_t* src = (uint32_t*)((char*)buff); uint32_t* dst = (uint32_t*)((char*)resultBuff); @@ -178,7 +176,8 @@ __global__ void __launch_bounds__(32, 1) template __global__ void __launch_bounds__(1024, 1) allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - size_t channelDataOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { + size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, + size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; nelems = nelems / (sizeof(int) / sizeof(T)); @@ -192,12 +191,9 @@ __global__ void __launch_bounds__(1024, 1) const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; const int tid = threadIdx.x + localBlockIdx * blockDim.x; - // double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LL8Packet); - void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); - size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet); - size_t scratchResultOffset = - (flag & 1) ? 
2 * nPkts * sizeof(mscclpp::LL8Packet) : 3 * nPkts * sizeof(mscclpp::LL8Packet); + void* scratchBuff = (void*)((char*)scratch + channelScratchOffset); + size_t scratchOffset = channelScratchOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet); + size_t scratchResultOffset = channelScratchOffset + 2 * nPkts * sizeof(mscclpp::LL8Packet); size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int) + channelDataOffset; uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int)); uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); @@ -246,8 +242,8 @@ __global__ void __launch_bounds__(1024, 1) template __global__ void __launch_bounds__(512, 1) allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, size_t channelOutDataOffset, int rank, - int nRanksPerNode, int worldSize, size_t nelems) { + mscclpp::DeviceHandle* smOutChannels, size_t channelOutDataOffset, + size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems) { const int nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) @@ -257,7 +253,7 @@ __global__ void __launch_bounds__(512, 1) auto smOutChans = smOutChannels + chanOffset; int4* buff4 = reinterpret_cast(buff); - int4* scratch4 = reinterpret_cast(scratch); + int4* scratch4 = reinterpret_cast((char*)scratch + channelScratchOffset); int4* resultBuff4 = reinterpret_cast(resultBuff); // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` @@ -278,6 +274,7 @@ __global__ void __launch_bounds__(512, 1) const size_t chunkSizePerRank = nNeededBlocks * nInt4PerChunk; const size_t blockOffset = nInt4PerChunk * blockIdx.x; const size_t scratchChunkRankOffset = chunkSizePerRank * rank; + const size_t scratchBaseOffsetInt4 = channelScratchOffset / sizeof(int4); __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; @@ -301,7 +298,7 @@ __global__ void __launch_bounds__(512, 1) const int peerIdx = (i + blockIdx.x) % nPeer; const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock]; - channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val); + channels[peerIdx].write(scratchBaseOffsetInt4 + scratchChunkRankOffset + blockOffset + idx, val); } } @@ -338,7 +335,7 @@ __global__ void __launch_bounds__(512, 1) const int peerIdx = (i + blockIdx.x) % nPeer; const int remoteRank = (peerIdx < rank) ? 
peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock]; - channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val); + channels[peerIdx].write(scratchBaseOffsetInt4 + scratchChunkRankOffset + blockOffset + idx, val); } } @@ -367,15 +364,16 @@ __global__ void __launch_bounds__(512, 1) template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, mscclpp::DeviceHandle* smOutChannels, size_t channelInOffset, - size_t channelOutOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, - cudaStream_t stream) { + size_t channelOutOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, + size_t nelems, cudaStream_t stream) { static uint32_t flag = 1; if (sizeof(T) * nelems < worldSize * sizeof(int)) { int nBlocks = 7; int nThreadsPerBlock = 32; allreduceAllToAll<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, - rank, nRanksPerNode, worldSize, nelems, flag++); + channelScratchOffset, rank, nRanksPerNode, worldSize, + nelems, flag++); } else if (sizeof(T) * nelems <= (1 << 20)) { int nBlocks = 28; int nThreadsPerBlock = 1024; @@ -383,13 +381,15 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< nBlocks = 56; nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; } - allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, rank, - nRanksPerNode, worldSize, nelems, flag++); + allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, + channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, + flag++); } else { int nBlocks = 35; int nThreadsPerBlock = 512; allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, - channelOutOffset, rank, nRanksPerNode, worldSize, nelems); + channelOutOffset, channelScratchOffset, rank, nRanksPerNode, + worldSize, nelems); } return cudaGetLastError(); diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp index cddc69625..25c74e71b 100644 --- a/apps/nccl/src/common.hpp +++ b/apps/nccl/src/common.hpp @@ -12,6 +12,6 @@ #endif constexpr int NRANKS_PER_NODE = 8; -constexpr int SCRATCH_SIZE = 1024 * 1024 * 70; // 35 thread-blocks * 8 ranks * 256KB = 70MB +constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB -#endif // NCCL_COMMON_HPP_ +#endif // NCCL_COMMON_HPP_ diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index d312ee66b..01bc56e0b 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -33,9 +33,7 @@ struct channelKey { const void* buff; size_t bytes; - bool operator==(const channelKey& other) const { - return buff == other.buff && bytes == other.bytes; - } + bool operator==(const channelKey& other) const { return buff == other.buff && bytes == other.bytes; } }; namespace std { @@ -62,6 +60,9 @@ struct ncclComm { std::unordered_map channelScratchInfos; std::shared_ptr scratchBuff; std::vector remoteScratchRegMemories; + + uint32_t numScratchBuff; + uint32_t buffFlag; }; static size_t ncclTypeSize(ncclDataType_t type) { @@ -146,7 +147,7 @@ static std::shared_ptr> setupSmChannel std::shared_ptr> ptr = mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); mscclpp::memcpyCuda>(ptr.get(), smChannelDeviceHandles.data(), - smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); + smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); return ptr; } @@ -164,8 +165,7 @@ NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { return ncclSuccess; } -NCCL_API ncclResult_t 
ncclCommInitRankConfig(ncclComm_t*, int, ncclUniqueId, int, - ncclConfig_t*) { +NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t*, int, ncclUniqueId, int, ncclConfig_t*) { // TODO: implement this function return ncclInternalError; } @@ -206,6 +206,8 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI commPtr->comm = mscclppComm; commPtr->connections = std::move(connections); commPtr->smSemaphores = std::move(smSemaphores); + commPtr->buffFlag = 0; + commPtr->numScratchBuff = 2; commPtr->scratchBuff = mscclpp::allocExtSharedCuda(SCRATCH_SIZE); commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc); @@ -292,8 +294,7 @@ NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { return ncclSuccess; } -NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, - ncclScalarResidence_t, ncclComm_t) { +NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, ncclScalarResidence_t, ncclComm_t) { // TODO: implement this function return ncclInternalError; } @@ -303,20 +304,18 @@ NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) { return ncclInternalError; } -NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, - ncclRedOp_t, int, ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, + cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclBcast(void*, size_t, ncclDataType_t, int, ncclComm_t, - cudaStream_t) { +NCCL_API ncclResult_t ncclBcast(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, - int, ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } @@ -324,7 +323,8 @@ NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t comm, cudaStream_t stream) { // Checking if the parameters are valids - if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) return ncclInvalidArgument; + if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) + return ncclInvalidArgument; // Declarating variables size_t sendBytes, recvBytes; @@ -333,6 +333,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)recvbuff)); size_t offsetIn = (char*)sendbuff - (char*)sendBasePtr; size_t offsetOut = (char*)recvbuff - (char*)recvBasePtr; + uint32_t scratchBuffIdx = (++(comm->buffFlag)) % comm->numScratchBuff; + size_t offsetScratch = (SCRATCH_SIZE / comm->numScratchBuff) * scratchBuffIdx; int rank = comm->comm->bootstrap()->getRank(); channelKey sendKey{(void*)sendBasePtr, sendBytes}; channelKey recvKey{(void*)recvBasePtr, recvBytes}; @@ -340,21 +342,23 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t mscclpp::DeviceHandle* smOutChannels = nullptr; // Creating 
the channels - if(count * ncclTypeSize(datatype) <= (1 << 20)) { + if (count * ncclTypeSize(datatype) <= (1 << 20)) { auto sendIt = comm->channelScratchInfos.find(sendKey); if (sendIt == comm->channelScratchInfos.end()) { - std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + std::vector channels = + setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; sendIt = comm->channelScratchInfos.emplace(sendKey, channelInfo).first; } + smChannels = sendIt->second.smChannelDeviceHandles.get(); - } - else { + } else { std::vector remoteMemories; auto sendIt = comm->channelInInfos.find(sendKey); if (sendIt == comm->channelInInfos.end()) { - std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + std::vector channels = + setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; sendIt = comm->channelInInfos.emplace(sendKey, channelInfo).first; } @@ -363,7 +367,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t if (recvIt == comm->channelOutInfos.end()) { remoteMemories = setupRemoteMemories(comm->comm, rank, (void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc); - std::vector outChannels = setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + std::vector outChannels = + setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); ChannelInfo channelInfo{outChannels, setupSmChannelDeviceHandles(outChannels)}; recvIt = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } @@ -374,19 +379,20 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t switch (datatype) { case ncclFloat16: - CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, smOutChannels, offsetIn, offsetOut, - rank, NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); + CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, smOutChannels, + offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, + comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, smChannels, - smOutChannels, offsetIn, offsetOut, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, - comm->comm->bootstrap()->getNranks(), count, stream)); + smOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), + NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclInt32: case ncclUint32: - CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, offsetIn, offsetOut, - comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), - count, stream)); + CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, + offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, + comm->comm->bootstrap()->getNranks(), count, stream)); break; default: return ncclInvalidArgument; @@ -394,8 +400,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t return ncclSuccess; } -NCCL_API ncclResult_t 
ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, - ncclRedOp_t, ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t, + cudaStream_t) { // TODO: implement this function return ncclInternalError; } @@ -405,7 +411,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t size_t bytes = sendcount * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; - // Declarating variables + // Declarating variables size_t recvBytes; CUdeviceptr recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)recvbuff)); @@ -417,9 +423,8 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t auto it = comm->channelOutInfos.find(recvKey); if (it == comm->channelOutInfos.end()) { - std::vector remoteMemories = - setupRemoteMemories(comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, - mscclpp::Transport::CudaIpc); + std::vector remoteMemories = setupRemoteMemories( + comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, mscclpp::Transport::CudaIpc); std::vector channels = setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); std::vector> smChannelDeviceHandles; @@ -431,29 +436,26 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t smChannels = it->second.smChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { - CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut, - rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); + CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } else { - CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut, - rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); + CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } return ncclSuccess; } -NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, - cudaStream_t) { +NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, - cudaStream_t) { +NCCL_API ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclAllToAll(const void*, void*, size_t, ncclDataType_t, - ncclComm_t, cudaStream_t) { +NCCL_API ncclResult_t ncclAllToAll(const void*, void*, size_t, ncclDataType_t, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } diff --git a/cmake/AddFormatTargets.cmake b/cmake/AddFormatTargets.cmake index b95ad447b..9829bd135 100644 --- a/cmake/AddFormatTargets.cmake +++ b/cmake/AddFormatTargets.cmake @@ -9,7 +9,7 @@ add_custom_target(format) find_program(CLANG_FORMAT clang-format) if(CLANG_FORMAT) message(STATUS "Found clang-format: ${CLANG_FORMAT}") - set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test) + set(FIND_DIRS 
${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test ${PROJECT_SOURCE_DIR}/apps) add_custom_target(check-format-cpp ALL COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` )
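
For reference, a minimal host-side sketch of driving the NCCL-compatible API declared in apps/nccl/include/nccl.h. The shim in this diff implements ncclGetUniqueId, ncclCommInitRank, ncclAllReduce, ncclAllGather, and ncclCommDestroy; several other entry points still return ncclInternalError. This is an illustration only: the rank/size plumbing comes from argv, the out-of-band exchange of ncclUniqueId is only indicated by a comment, a single node is assumed, and error handling is abbreviated.

// sketch: one process per GPU, rank and nranks supplied by the launcher via argv
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include "nccl.h"

int main(int argc, char** argv) {
  if (argc < 3) return 1;
  int rank = std::atoi(argv[1]);    // this process's rank, 0..nranks-1
  int nranks = std::atoi(argv[2]);  // total number of ranks

  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);
  // ...share `id` from rank 0 with every other rank out of band (MPI, a socket, a file)...

  cudaSetDevice(rank);  // assumes a single node with one GPU per rank
  ncclComm_t comm;
  ncclResult_t res = ncclCommInitRank(&comm, nranks, id, rank);
  if (res != ncclSuccess) {
    std::printf("ncclCommInitRank failed: %s\n", ncclGetErrorString(res));
    return 1;
  }

  const size_t count = 1 << 20;  // elements per rank
  float *sendbuff = nullptr, *recvbuff = nullptr;
  cudaMalloc(&sendbuff, count * sizeof(float));
  cudaMalloc(&recvbuff, count * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Sum-reduce `count` floats across all ranks; the result lands in recvbuff on every rank.
  ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, ncclSum, comm, stream);
  cudaStreamSynchronize(stream);

  cudaFree(sendbuff);
  cudaFree(recvbuff);
  cudaStreamDestroy(stream);
  ncclCommDestroy(comm);
  return 0;
}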
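
The header documents that ncclAllGather runs in place when sendbuff == recvbuff + rank * sendcount, and the ncclAllGather path in nccl.cu checks for exactly that aliasing. A small sketch of the convention follows, assuming a communicator and stream set up as in the previous sketch; allGatherInPlace is an illustrative helper, not part of the library.

#include <cuda_runtime.h>
#include "nccl.h"

// In-place all-gather: rank i contributes the slice it already owns inside the gathered
// buffer, so its slot doubles as sendbuff and no separate send buffer is needed.
ncclResult_t allGatherInPlace(float* gathered, size_t sendcount, int rank, ncclComm_t comm, cudaStream_t stream) {
  float* myChunk = gathered + rank * sendcount;  // sendbuff == recvbuff + rank * sendcount
  return ncclAllGather(myChunk, gathered, sendcount, ncclFloat32, comm, stream);
}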
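
The functional change in this diff moves scratch double buffering out of the kernels: SCRATCH_SIZE grows to 2 x 70 MB so that two scratch regions fit, and ncclAllReduce selects one half per call by bumping comm->buffFlag, passing the chosen offset down as channelScratchOffset instead of deriving it from (flag & 1) inside allreduceAllToAll and allreduce7. The rotation is shown in isolation below, with names that mirror the diff; this is an illustration, not the library code itself.

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kScratchSize = 2ull * 1024 * 1024 * 70;  // two 70 MB halves, as in common.hpp

struct CommScratchState {
  uint32_t numScratchBuff = 2;  // number of rotating scratch regions (nccl.cu sets 2)
  uint32_t buffFlag = 0;        // bumped once per collective call
};

// Mirrors nccl.cu: scratchBuffIdx = (++buffFlag) % numScratchBuff;
//                  offsetScratch  = (SCRATCH_SIZE / numScratchBuff) * scratchBuffIdx;
size_t nextScratchOffset(CommScratchState& s) {
  uint32_t idx = (++s.buffFlag) % s.numScratchBuff;
  return (kScratchSize / s.numScratchBuff) * idx;
}

int main() {
  CommScratchState s;
  // Successive calls alternate between the two halves of the scratch buffer.
  for (int call = 0; call < 4; ++call) {
    std::printf("call %d -> channelScratchOffset = %zu bytes\n", call, nextScratchOffset(s));
  }
  return 0;
}

As with double buffering in general, the point is that back-to-back collectives write their LL packets into disjoint halves of the scratch buffer, so call N cannot read stale packets left behind by call N-1.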
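
For orientation, the host-side allreduce() in allreduce.hpp still dispatches by message size; the thresholds below are copied from the diff, and the helper is a hypothetical restatement of that branch, not an exported function.

#include <cstddef>

// allreduceAllToAll for tiny messages, allreduce7 (LL8 packets) up to 1 MiB, allreduce8 otherwise.
enum class AllreduceKernel { AllToAll, LL8 /* allreduce7 */, Chunked /* allreduce8 */ };

AllreduceKernel pickAllreduceKernel(size_t bytes, int worldSize) {
  if (bytes < static_cast<size_t>(worldSize) * sizeof(int)) return AllreduceKernel::AllToAll;
  if (bytes <= (1 << 20)) return AllreduceKernel::LL8;
  return AllreduceKernel::Chunked;
}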