"...git@developer.sourcefind.cn:OpenDAS/TransformerEngine.git" did not exist on "dfbf4ddecaab3c09a933c5bcb64048281b5fd7bf"
Unverified Commit 08e5e4b1 authored by Pavel Shamis (Pasha), committed by GitHub

[UB] Adding configurable timeout for userbuffer and improving error reporting for potential hangs (#757)

* Improving error reporting and hang detection logic

* Adding verbose error reporting in case of UB hang
* Adding CE hang detector
* Replacing hard-coded timeout with configurable one
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

* Cleaning up warnings in the code
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

* Removing unused code
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

* Fixing styling issues reported on GitHub
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

* Addressing lint newline and casting warnings
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

* Addressing lint warning about the usage of `unsigned long long`
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

* Removing unused case causing build issues on multi-arch setup
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

* Post GDRCopy removal cleanup

* Remove cmake check
* Remove unused includes
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>

---------
Signed-off-by: Pasha (Pavel) Shamis <pasharesearch@gmail.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent cd54a8cd
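
The core of the change is the timeout conversion added to create_communicator_grouped2 below: the wait budget comes from the UB_TIMEOUT environment variable in seconds (default 110), and cudaDevAttrClockRate reports the SM clock in kHz, so the cycle budget is 1000 * clock_khz * seconds. A minimal standalone sketch of that conversion (the helper name is illustrative; the CUDA call and UB_TIMEOUT handling mirror the patch):

    #include <cstdint>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Convert the UB_TIMEOUT env var (seconds) into GPU clock cycles,
    // following the logic this patch adds to create_communicator_grouped2.
    static uint64_t ub_timeout_cycles(int device) {
      int clock_khz = 0;  // cudaDevAttrClockRate is reported in kHz
      cudaDeviceGetAttribute(&clock_khz, cudaDevAttrClockRate, device);
      const char *env = getenv("UB_TIMEOUT");
      const int sec_timeout = env ? atoi(env) : 110;  // 110 s default, as below
      return 1000ull * clock_khz * sec_timeout;  // kHz * 1000 = cycles/second
    }

Running with UB_TIMEOUT=30 in the environment would then give the device-side spin loops a 30-second budget before they print a diagnostic and bail.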
@@ -11,17 +11,11 @@ target_include_directories(transformer_engine_userbuffers PUBLIC
# Configure dependencies
find_package(MPI REQUIRED)
find_library(GDRCOPY_LIBRARY gdrapi
HINTS "${GDRCOPY_LIBRARY_DIR}" "$ENV{GDRCOPY_LIBRARY_DIR}")
if(NOT GDRCOPY_LIBRARY)
message(FATAL_ERROR "Could not find GDRCopy, please set GDRCOPY_LIBRARY_DIR")
endif()
message(STATUS "Found GDRCopy: ${GDRCOPY_LIBRARY}")
target_link_libraries(transformer_engine_userbuffers PUBLIC
CUDA::cudart
CUDA::cuda_driver
MPI::MPI_CXX
${GDRCOPY_LIBRARY})
)
target_include_directories(transformer_engine_userbuffers PRIVATE
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
@@ -11,7 +11,6 @@
#include <chrono>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <immintrin.h>
#include <iostream>
#include <math.h>
#include <mpi.h>
@@ -19,7 +18,6 @@
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <x86intrin.h>
#define MULTICAST_GB_TOTAL 512
static int oob_bcast(void *comm_context, void *buf, int size, int root) {
@@ -123,11 +121,20 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode
(*comm)->basecounter[i] = 0;
(*comm)->head = 0;
(*comm)->tail = 0;
(*comm)->activeproxy = 1;
(*comm)->active_nreqs = 0;
for (int i = 0; i < userbuffers_op_types; i++)
(*comm)->active_req[i].active = -1;
int device_clock = 0;
// 110 sec wait time by default
int sec_timeout = getenv("UB_TIMEOUT") ? atoi(getenv("UB_TIMEOUT")) : 110;
CUDACHECK(cudaDeviceGetAttribute(&device_clock, cudaDevAttrClockRate, cur_dev));
(*comm)->ub_timeout = 1000ull * device_clock * sec_timeout;
if ((*comm)->myrank == 0) {
printf("UB_TIMEOUT is set to %d sec, %" PRIu64 " cycles, freq: %dkhz\n",
sec_timeout, (*comm)->ub_timeout, device_clock);
}
int ret = 0;
// split communicator
char host_name[MPI_MAX_PROCESSOR_NAME];
@@ -232,59 +239,12 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode
(*comm)->num2_nodes = tensornodes;
(*comm)->my2_node = (mynode / datanodes) % tensornodes;
(*comm)->first2_node = mynode - (*comm)->my2_node * datanodes;
char *ib_dev_list;
int ZIONROCE = getenv("NVTE_ZIONROCE") ? atoi(getenv("NVTE_ZIONROCE")) : 0;
int ROCE = getenv("NVTE_ROCE") ? atoi(getenv("NVTE_ROCE")) : 0;
if (ZIONROCE)
ROCE = 1;
int DGX_H100 = device_prop.major == 9;
switch (mylocal) {
case 0:
ib_dev_list = "mlx5_0:1";
break; // NOLINT(*)
case 1:
ib_dev_list = (char *)(DGX_H100 ? "mlx5_3:1" : "mlx5_1:1"); // NOLINT(*)
break; // NOLINT(*)
case 2:
ib_dev_list = (char *)(ZIONROCE ? "mlx5_4:1" : DGX_H100 ? "mlx5_4:1" : "mlx5_2:1"); // NOLINT(*)
break; // NOLINT(*)
case 3:
ib_dev_list = (char *)(DGX_H100 ? "mlx5_5:1" : "mlx5_3:1"); // NOLINT(*)
break; // NOLINT(*)
case 4:
ib_dev_list = (char *)(DGX_H100 ? "mlx5_6:1" : "mlx5_6:1"); // NOLINT(*)
break; // NOLINT(*)
case 5:
ib_dev_list = (char *)(DGX_H100 ? "mlx5_9:1" : "mlx5_7:1"); // NOLINT(*)
break; // NOLINT(*)
case 6:
ib_dev_list = (char *)(ZIONROCE ? "mlx5_10:1" : DGX_H100 ? "mlx5_10:1" : "mlx5_8:1"); // NOLINT(*)
break; // NOLINT(*)
case 7:
ib_dev_list = (char *)(DGX_H100 ? "mlx5_11:1" : "mlx5_9:1"); // NOLINT(*)
break; // NOLINT(*)
default:
break;
}
(*comm)->fifo = reinterpret_cast<ub_request *>(malloc(sizeof(ub_request) * NVTE_MAX_REQUESTS));
(*comm)->nblocks = 8;
(*comm)->alignblock = 1024 * 512;
(*comm)->minblock = 1024 * 2 * 1024;
(*comm)->asyncblocks = 16;
CUDACHECK(cudaMallocHost((void **)&(*comm)->hostflags, // NOLINT(*)
(NVTE_MAX_SMS + 100) * sizeof(int)));
for (int i = 0; i < 100 + NVTE_MAX_SMS; i++)
(*comm)->hostflags[i] = 0;
_mm_mfence();
sleep(1);
// init_p2p_transport();
(*comm)->ibnvsize = (*comm)->nvsize;
#define NBUF 2
if ((*comm)->sm_arch >= 9 && (*comm)->ar2_nvsize > 1 &&
!getenv("UB_SKIPMC")) { // multicast init only for TP ops (____2 operations)
@@ -374,6 +334,7 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE - 1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
CUDACHECK(cudaMalloc(&(*comm)->flags, 2 * GPU_PAGE_SIZE));
unsigned int flag = 1;
CUDACHECK(cudaMemset((*comm)->flags, 0, 2 * GPU_PAGE_SIZE));
@@ -381,23 +342,6 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode
reinterpret_cast<int *>(((CUdeviceptr)(*comm)->flags + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK);
using namespace std;
(*comm)->g = gdr_open();
if ((*comm)->g == NULL) {
fprintf(stderr, "gdrcopy open failed\n");
return -1;
}
gdr_mh_t mh;
ret = gdr_pin_buffer((*comm)->g, (CUdeviceptr)(*comm)->flags, GPU_PAGE_SIZE, 0, 0, &mh);
if (ret) {
fprintf(stderr, "gdr_pin_buffer failed\n");
return -1;
}
ret = gdr_map((*comm)->g, mh, (void **)&((*comm)->map_flags), GPU_PAGE_SIZE); // NOLINT(*)
if (ret) {
fprintf(stderr, "gdr_map failed\n");
return -1;
}
sched_param param;
pthread_attr_t attr;
pthread_attr_init(&attr);
@@ -426,10 +370,6 @@ int create_communicator(communicator **comm) {
}
void destroy_communicator(communicator *comm) {
comm->activeproxy = 0;
if (!comm->myrank && getenv("NVTE_UBDEBUG"))
printf("waiting for userbuffers proxy thread to exit()\n");
gdr_close(comm->g);
}
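
destroy_communicator now signals shutdown by clearing activeproxy rather than tearing down GDRCopy state. A hedged sketch of the handshake, assuming a proxy loop that polls the flag (the loop body itself is outside this diff):

    // Assumed shape of the userbuffers proxy thread: it services fifo
    // requests while activeproxy is set and exits once it is cleared.
    static void *proxy_progress_thread(void *arg) {
      communicator *comm = reinterpret_cast<communicator *>(arg);
      while (comm->activeproxy) {
        // ... drain comm->fifo and progress outstanding requests ...
      }
      return nullptr;  // observed activeproxy == 0 from destroy_communicator()
    }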
int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *comm, bool alloc) {
@@ -533,7 +473,7 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
CUCHECK(cuMulticastBindMem(comm->mc_handle, comm->mc_offset, comm->uchandles[hndl][myrank],
0 /*memOffset*/, aligned_size, 0));
comm->memflags[hndl] |= UB_MEM_MC_CREATED;
comm->mc_ptr[hndl] = comm->mc_baseptr + comm->mc_offset;
comm->mc_ptr[hndl] = reinterpret_cast<char *>(comm->mc_baseptr) + comm->mc_offset;
comm->mc_offset += aligned_size;
} else if (!comm->myrank) {
printf("UB: warning region %d size %ld MB registered without MC access\n", hndl,
@@ -570,146 +510,3 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
return comm->free_region++;
}
int allreduce_userbuff_inplace_gpu(const int handler, const int offset, const int elements,
const int blocksize, communicator *comm, cudaStream_t stream);
int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset,
const int elements, const int blocksize, communicator *comm,
cudaStream_t stream, int op);
int reducescatter2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset,
const int elements, const int blocksize, communicator *comm,
cudaStream_t stream, int op);
int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset,
const int elements, const int blocksize, communicator *comm,
cudaStream_t stream, int op);
void allreduce_nonsharp_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream, int op) {
if (elements < 64)
NVTE_UB_ERROR("Userbuffer comm for given config not implemented.");
// if(comm->myrank==0) fprintf(stderr,"AR2(%d) user call
// launch_mode=%d\n",op,comm->launch_mode);
const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize;
int blocksize = elements * 2;
int maxcredit = 0;
const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes;
blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) /
comm->nblocks; // FIXME TUNING
blocksize *= comm->alignblock;
if (blocksize < comm->minblock)
blocksize = comm->minblock;
maxcredit = (elements * 2 + blocksize - 1) / blocksize;
size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit
if (blocksize > peerblock * ar_nvsize)
blocksize = peerblock * ar_nvsize;
int sms = allreduce2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm,
stream, op);
if (num_nodes > 1 && comm->launch_mode & NVTE_LAUNCH_CPU) {
if (!sms)
return;
comm->fifo[comm->head].optype = op;
comm->fifo[comm->head].basecounter = comm->basecounter[op];
comm->fifo[comm->head].blocksize = blocksize;
comm->fifo[comm->head].maxcredit = maxcredit;
comm->fifo[comm->head].handler = handler;
comm->fifo[comm->head].offset = offset;
comm->fifo[comm->head].elements = elements;
int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1);
while (newhead == comm->tail) {
}
comm->head = newhead;
comm->basecounter[op] += (elements * 2 + blocksize - 1) / blocksize;
}
}
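
The FIXME TUNING arithmetic above is easier to follow in isolation. A hedged sketch with the initializer values this patch uses (nblocks = 8, alignblock = 512 KiB, minblock = 2 MiB; commbuf_ints stands in for NVTE_REG0_COMMBUFFER):

    // Mirror of the block-sizing above: round the fp16 payload up to
    // alignblock units, split across nblocks, clamp to minblock, then cap
    // by what the per-peer staging buffer can hold.
    static size_t pick_blocksize(size_t elements, size_t commbuf_ints, int ar_nvsize) {
      const size_t nblocks = 8, alignblock = 1024 * 512, minblock = 1024 * 2 * 1024;
      const size_t bytes = elements * 2;  // fp16 payload
      size_t blocksize =
          (nblocks - 1 + (alignblock - 1 + bytes) / alignblock) / nblocks * alignblock;
      if (blocksize < minblock) blocksize = minblock;
      const size_t maxcredit = (bytes + blocksize - 1) / blocksize;
      const size_t peerblock = sizeof(int) * commbuf_ints / maxcredit;
      if (blocksize > peerblock * ar_nvsize) blocksize = peerblock * ar_nvsize;
      // e.g. elements = 1 << 20 yields 2 MiB (maxcredit = 1) before the staging cap
      return blocksize;
    }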
void allreduce2_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream) {
allreduce_nonsharp_inplace(handler, offset, elements, comm, stream,
userbuffers_allreduceop_nonsharp2);
}
void allreduce_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream) {
if (elements < 64)
NVTE_UB_ERROR("Userbuffer comm for given config not implemented.");
allreduce_nonsharp_inplace(handler, offset, elements, comm, stream,
userbuffers_allreduceop_nonsharp);
return;
}
void reducescatter_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream) {
if (elements < 64)
NVTE_UB_ERROR("Userbuffer comm for given config not implemented.");
int op = userbuffers_allreduceop_nonsharp;
const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize;
int blocksize = elements * 2;
int maxcredit = 0;
const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes;
blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) /
comm->nblocks; // FIXME TUNING
blocksize *= comm->alignblock;
if (blocksize < comm->minblock)
blocksize = comm->minblock;
maxcredit = (elements * 2 + blocksize - 1) / blocksize;
size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit
if (blocksize > peerblock * ar_nvsize)
blocksize = peerblock * ar_nvsize;
int sms = reducescatter2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize,
comm, stream, op);
if (num_nodes > 1 && comm->launch_mode & NVTE_LAUNCH_CPU) {
if (!sms)
return;
comm->fifo[comm->head].optype = op;
comm->fifo[comm->head].basecounter = comm->basecounter[op];
comm->fifo[comm->head].blocksize = blocksize;
comm->fifo[comm->head].maxcredit = maxcredit;
comm->fifo[comm->head].handler = handler;
comm->fifo[comm->head].offset = offset;
comm->fifo[comm->head].elements = elements;
int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1);
while (newhead == comm->tail) {
}
comm->head = newhead;
comm->basecounter[op] += (elements * 2 + blocksize - 1) / blocksize;
}
}
void allgather_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream) {
if (elements < 64)
NVTE_UB_ERROR("Userbuffer comm for given config not implemented.");
int op = userbuffers_allreduceop_nonsharp;
const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize;
int blocksize = elements * 2;
int maxcredit = 0;
const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes;
blocksize = (comm->nblocks - 1 + (comm->alignblock - 1 + elements * 2) / comm->alignblock) /
comm->nblocks; // FIXME TUNING
blocksize *= comm->alignblock;
if (blocksize < comm->minblock)
blocksize = comm->minblock;
maxcredit = (elements * 2 + blocksize - 1) / blocksize;
size_t peerblock = sizeof(int) * NVTE_REG0_COMMBUFFER / maxcredit; // max size we can fit
if (blocksize > peerblock * ar_nvsize)
blocksize = peerblock * ar_nvsize;
int sms = allgather2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm,
stream, op);
}
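
When num_nodes > 1 and the CPU launch mode is active, the functions above enqueue work for the proxy thread through a single-producer single-consumer ring indexed by head and tail. A condensed sketch of that push (ub_request and NVTE_MAX_REQUESTS are the types and constants used above; NVTE_MAX_REQUESTS must be a power of two for the mask to work):

    // SPSC ring push, as in the CPU-launch branches above: stage the
    // request at head, spin while the ring is full, then publish head.
    static void fifo_push(communicator *comm, const ub_request &req) {
      comm->fifo[comm->head] = req;
      const int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1);
      while (newhead == comm->tail) {
        // ring full: wait for the proxy thread to advance tail
      }
      comm->head = newhead;
    }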
@@ -4,12 +4,8 @@
* See LICENSE for license information.
************************************************************************/
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp8.h>
#if __CUDA_ARCH__ >= 800
#include <cuda_bf16.h>
@@ -20,8 +16,12 @@
#include "userbuffers.h"
#include <unistd.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_fp8.h>
#define MAX_THREADS 1024
#define TIMEOUT 200000000000ull
#define CUDACHECK(cmd) \
do { \
@@ -35,8 +35,7 @@
#define ATOMIC_CONSUMER(chunk) \
if (counters) { \
if (threadIdx.x == 0 && blockIdx.x == 0) { \
int old_val; \
while (0 != (old_val = atomicCAS(((unsigned int *)counters) + chunk, 0, 0))) { \
while (0 != (atomicCAS(((unsigned int *)counters) + chunk, 0, 0))) { \
} \
((unsigned int *)counters)[chunk] = 1; \
asm volatile("fence.sc.gpu;\n"); \
@@ -54,11 +53,32 @@
// If we expect that producer will be 2B+ messages behind consumer
#define CHECK_IDS(producer, consumer) (((unsigned)(producer) - (unsigned)(consumer)) & (~INT_MAX))
// Strip the path from a full filename
#define FILENAME(file) ({ \
const char* filename = file; \
const char* basename = filename; \
for (const char* ptr = filename; *ptr != '\0'; ptr++) { \
if (*ptr == '/' || *ptr == '\\') { \
basename = ptr + 1; \
} \
} \
basename; \
})
// Printf to provide enough information so it is easier to attribute failures
#define UB_PRINT(message, ...) printf("[%s:%s:%d] " message "\n", FILENAME(__FILE__), \
__FUNCTION__, \
__LINE__, __VA_ARGS__)
// Report an error on timeout
#define CHECK_TIMEOUT(t, timeout) ((clock64() - (t)) > timeout)
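
Together these macros form the spin-wait pattern repeated in every kernel below; a condensed device-side sketch (flag, reduce_id, and ub_timeout stand in for the kernel-local variables):

    // Condensed hang-detection loop: poll the peer flag, and after
    // ub_timeout GPU cycles print an attributable diagnostic and bail
    // instead of spinning forever.
    __device__ void wait_for_flag(volatile int *flag, int reduce_id,
                                  uint64_t ub_timeout, int myrank) {
      clock_t s = clock64();
      while (CHECK_IDS(*flag, reduce_id)) {
        if (CHECK_TIMEOUT(s, ub_timeout)) {
          UB_PRINT("[%d] expecting %d got %d", myrank, reduce_id, *flag);
          break;
        }
      }
    }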
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rw(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep, const int lineoffset,
const int numlines, void **commbuff, const int handleridx) {
const int numlines, void **commbuff, const int handleridx,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
@@ -78,9 +98,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allreduce reduce-scatter: SM %d [%d]: expecting %d got %d", myrank,
blockIdx.x, threadIdx.x, reduce_id, *flag);
break;
}
}
@@ -132,9 +152,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)&myptr[targetgpu];
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) {
printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allreduce Gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
}
@@ -147,7 +167,8 @@ template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rr(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep, const int lineoffset,
const int numlines, void **commbuff, const int handleridx) {
const int numlines, void **commbuff, const int handleridx,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
int *flagptr, physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
@@ -166,9 +187,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d ]Allreduce reduce-scatter:SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
}
@@ -215,9 +236,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)&myptr[targetgpu];
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) {
printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allreduce gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
}
@@ -258,7 +279,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rr_rs(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep,
const int mylineoffset, const int totallines,
void **commbuff, const int handleridx) {
void **commbuff, const int handleridx,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
@@ -277,9 +299,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
}
@@ -333,7 +355,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
const int gpustep, const int mylineoffset,
const int totallines, const int rowlines,
const int skiplines, void **commbuff,
const int handleridx, void *outbuf) {
const int handleridx, void *outbuf,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
@@ -352,9 +375,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
}
@@ -427,8 +450,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
UB_PRINT("Reduce-scatter: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x,
reduce_id, *flag);
break;
}
}
@@ -495,7 +518,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) {
printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
UB_PRINT("Allgather: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x, reduce_id,
*flag);
break;
}
@@ -510,7 +533,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_rs(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep,
const int mylineoffset, const int totallines,
void **commbuff, const int handleridx, float4 *mc_ptr) {
void **commbuff, const int handleridx, float4 *mc_ptr,
const uint64_t ub_timeout) {
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
@@ -529,10 +553,10 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)&(myptr[targetgpu]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
break;
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
}
}
@@ -596,7 +620,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
const int gpustep, const int mylineoffset,
const int totallines, const int rowlines,
const int skiplines, void **commbuff,
const int handleridx, void *outbuf, float4 *mc_ptr) {
const int handleridx, void *outbuf, float4 *mc_ptr,
const uint64_t ub_timeout) {
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
@@ -614,9 +639,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)&(myptr[targetgpu]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
}
@@ -680,7 +705,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_ag(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep,
const int mylineoffset, const int totallines,
void **commbuff, const int handleridx, uint4 *mc_ptr) {
void **commbuff, const int handleridx, uint4 *mc_ptr,
const uint64_t ub_timeout) {
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
@@ -744,10 +770,10 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)&myptr[targetgpu];
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) {
printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
break;
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag);
break;
}
}
}
@@ -764,26 +790,32 @@ template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rs_oop(
const int op, const int flagoffset, const int firstrank, const int myrank, const int gpustep,
const int mylineoffset, const int totallines, const int rowlines, const int skiplines,
void **commbuff, const int handleridx, void *outbuf, float4 *mc_ptr) {}
void **commbuff, const int handleridx, void *outbuf, float4 *mc_ptr,
const uint64_t ub_timeout) {}
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_ag(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep,
const int mylineoffset, const int totallines,
void **commbuff, const int handleridx, uint4 *mc_ptr) {}
void **commbuff, const int handleridx, uint4 *mc_ptr,
const uint64_t ub_timeout) {}
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_rs(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep,
const int mylineoffset, const int totallines,
void **commbuff, const int handleridx, float4 *mc_ptr) {}
void **commbuff, const int handleridx, float4 *mc_ptr,
const uint64_t ub_timeout) {}
#endif
template <int RANKS, typename fp8type>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_fp8(
const int op, const int flagoffset, const int firstrank, const int myrank, const int gpustep,
const int mylineoffset, const int totallines, const int rowlines, const int skiplines,
void **commbuff, const int handleridx, void *outbuf, float *scale) {
void **commbuff, const int handleridx, void *outbuf, float *scale,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
@@ -804,8 +836,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x,
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
@@ -862,7 +894,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
const int op, const int flagoffset, const int firstrank, const int myrank,
const int gpustep, const int mylineoffset, const int totallines, const int rowlines,
const int skiplines_out, const int skiplines_in, void **commbuff, const int handleridx,
void *outbuf, float *scale, void *counters, const int numchunks, const int atomicindex) {
void *outbuf, float *scale, void *counters, const int numchunks, const int atomicindex,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
@@ -892,8 +925,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x,
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
@@ -959,7 +992,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
const int gpustep, const int mylineoffset,
const int totallines, const int rowlines,
const int skiplines, void **commbuff,
const int handleridx, void *outbuf) {
const int handleridx, void *outbuf,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
@@ -979,8 +1013,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x,
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
@@ -1030,123 +1064,22 @@ __global__ void __launch_bounds__(MAX_THREADS)
*reduceidptr = reduce_id;
} // fp16 reduce-scatter kernel (out of place) fp16
#if 0
template<int RANKS, typename fp8type>
__global__ void
__launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic_fp8(
const int op, const int flagoffset, const int firstrank, const int myrank, const int gpustep,
const int mylineoffset, const int totallines, const int rowlines, const int skiplines,
const int numchunks, void **commbuff, const int handleridx, void* outbuf, void *counters,
float* scale) {
if (counters) {
if ( threadIdx.x == 0 ) {
// spin-lock on counter from producer
int old_val;
while (0 != (old_val = atomicCAS(((unsigned int*)counters), 0, 0) )) {}
// make sure all threadblocks have read/waited on counters.
int old_val2;
atomicInc(((unsigned int *)counters)+numchunks, gridDim.x-1);
while (0 != (old_val2 = atomicCAS(((unsigned int*)counters)+numchunks, 0, 0) )) {}
// reset counter for next producer.
((unsigned int*)counters)[0] = 1;
asm volatile ("fence.sc.gpu;\n");
}
}
__syncthreads();
__shared__ int4* userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
int *reduceidptr, reduce_id;
int lastSM = 0;
half hscale = (half) *scale;
if (threadIdx.x < RANKS) {
physgpu = myrank*gpustep+firstrank;
targetgpu = threadIdx.x*gpustep+firstrank;
myptr = (reinterpret_cast<int*>(commbuff[physgpu])) + flagoffset;
reduceidptr = myptr-NVTE_MAX_OPS; // +op;
reduce_id =(*reduceidptr)+1;
flagptr = (reinterpret_cast<int *>(commbuff[targetgpu])) + flagoffset;
if (blockIdx.x == 0) flagptr[physgpu] = reduce_id;
volatile int* flag = (volatile int*)&(myptr[targetgpu]);
userptr[threadIdx.x] = reinterpret_cast<int4*>(commbuff[targetgpu+handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64()-s > TIMEOUT) {
printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n",
myrank, blockIdx.x, threadIdx.x, reduce_id, *flag);
break;
}
}
}
__syncthreads();
if (threadIdx.x == 0) {
const int adder = blockIdx.x == 0 ? NVTE_MAX_SMS-gridDim.x+1 : 1;
int old_val = atomicAdd(myptr+(NVTE_MAX_NVLINK*2), adder);
if (old_val+adder == NVTE_MAX_SMS*reduce_id) lastSM = 1;
}
int warp = blockIdx.x+(threadIdx.x>>5);
int dest[RANKS];
#pragma unroll
for (int i = 0; i < RANKS; i++)
dest[i] = (i+myrank+warp)&(RANKS-1);
for (int line = threadIdx.x+blockDim.x*blockIdx.x;
line < totallines; line+=blockDim.x*gridDim.x) {
int4 val[RANKS];
int index_in = mylineoffset + myrank*(totallines*skiplines/rowlines/2) +
(line/rowlines)*skiplines/2+(line%rowlines);
#pragma unroll
for (int i = 0; i < RANKS; i++) {
val[i] = userptr[dest[i]][index_in];
}
int4 sum[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
half *s = reinterpret_cast<half*>(&sum);
#pragma unroll
for (int i = 0; i < RANKS; i++) {
fp8type *x = reinterpret_cast<fp8type*>(&val[i]);
#pragma unroll
for (int j=0; j < sizeof(int4)/sizeof(fp8type); j++) s[j] += hscale * (half)(x[j]);
}
int hline = 2*line;
int index_out1 = (hline/rowlines)*skiplines+(hline%rowlines);
(reinterpret_cast<int4*>(outbuf))[index_out1] = sum[0];
hline++;
int index_out2 = (hline/rowlines)*skiplines+(hline%rowlines);
(reinterpret_cast<int4*>(outbuf))[index_out2] = sum[1];
}
if (threadIdx.x == 0 && lastSM) *reduceidptr = reduce_id;
} // fp16 reduce-scatter kernel (out of place) fp16
#endif
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic(
const int op, const int flagoffset, const int firstrank, const int myrank,
const int gpustep, const int mylineoffset, const int totallines, const int rowlines,
const int skiplines, const int numchunks, void **commbuff, const int handleridx,
void *outbuf, void *counters) {
void *outbuf, void *counters, const uint64_t ub_timeout) {
if (counters) {
if (threadIdx.x == 0) {
// spin-lock on counter from producer
int old_val;
while (0 != (old_val = atomicCAS(((unsigned int *)counters), 0, 0))) {
while (0 != (atomicCAS(((unsigned int *)counters), 0, 0))) {
}
// make sure all threadblocks have read/waited on counters.
int old_val2;
atomicInc(((unsigned int *)counters) + numchunks, gridDim.x - 1);
while (0 != (old_val2 = atomicCAS(((unsigned int *)counters) + numchunks, 0, 0))) {
while (0 != (atomicCAS(((unsigned int *)counters) + numchunks, 0, 0))) {
}
// reset counter for next producer.
@@ -1175,8 +1108,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x,
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
@@ -1232,20 +1165,17 @@ __global__ void __launch_bounds__(MAX_THREADS)
const int op, const int flagoffset, const int firstrank, const int myrank,
const int gpustep, const int mylineoffset, const int totallines, const int rowlines,
const int skiplines, const int numchunks, void **commbuff, const int handleridx,
void *outbuf, void *counters) {
void *outbuf, void *counters, const uint64_t ub_timeout) {
for (int chunk_i = 0; chunk_i < numchunks; chunk_i++) {
if (counters) {
if (threadIdx.x == 0) {
// spin-lock on counter from producer
int old_val;
while (0 != (old_val = atomicCAS(((unsigned int *)counters) + chunk_i, 0, 0))) {
while (0 != (atomicCAS(((unsigned int *)counters) + chunk_i, 0, 0))) {
}
// make sure all threadblocks have read/waited on counters.
int old_val2;
atomicInc(((unsigned int *)counters) + numchunks + chunk_i, gridDim.x - 1);
while (0 !=
(old_val2 = atomicCAS(((unsigned int *)counters) + numchunks + chunk_i, 0, 0))) {
while (0 != (atomicCAS(((unsigned int *)counters) + numchunks + chunk_i, 0, 0))) {
}
// reset counter for next producer.
@@ -1274,8 +1204,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d] NVONLY RSBAR:SM %d [%d]:expecting %d got %d\n", myrank, blockIdx.x,
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
@@ -1330,7 +1260,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rr_ag(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep,
const int mylineoffset, const int totallines,
void **commbuff, const int handleridx) {
void **commbuff, const int handleridx,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
@@ -1342,7 +1273,6 @@ __global__ void __launch_bounds__(MAX_THREADS)
reduceidptr = myptr - NVTE_MAX_OPS; // +op;
reduce_id = (*reduceidptr) + 1;
flagptr = (reinterpret_cast<int *>(commbuff[targetgpu])) + flagoffset;
volatile int *flag = (volatile int *)&(myptr[targetgpu]);
userptr[threadIdx.x] = reinterpret_cast<int4 *>(commbuff[targetgpu + handleridx]);
clock_t s = clock64();
}
@@ -1393,9 +1323,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)&myptr[targetgpu];
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) {
printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag);
break;
}
}
@@ -1407,7 +1337,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rw_ag(const int op, const int flagoffset, const int firstrank,
const int myrank, const int gpustep,
const int mylineoffset, const int totallines,
void **commbuff, const int handleridx) {
void **commbuff, const int handleridx,
const uint64_t ub_timeout) {
__shared__ int4 *userptr[RANKS];
volatile int *flagptr;
int physgpu, targetgpu, *myptr;
@@ -1490,784 +1421,15 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)&myptr[targetgpu];
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) {
printf("NVONLY AGBAR:SM %d [%d]:expecting %d got %d\n", blockIdx.x, threadIdx.x, reduce_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag);
break;
}
}
}
} // fp16 inplace allgather kernel (Volta,Hopper)
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_rr_blocked(const int op, const int flagoffset,
const int firstrank, const int myrank,
const int lineoffset, const int numlines,
void **commbuff, const int handleridx,
const int peerblocklines, int *hostflags,
int *gpuflag, const int numblocks) {
const int basecounter = gpuflag[NVTE_GF_STATE + op];
#define REDUCETHREADS (blockDim.x - 32)
if (threadIdx.x < 32) {
int *flagptr;
if (threadIdx.x < RANKS) {
if (!blockIdx.x) {
flagptr = reinterpret_cast<int *>(commbuff[threadIdx.x + firstrank]);
flagptr[flagoffset + myrank + firstrank] = basecounter;
}
volatile int *flag = (volatile int *)&((reinterpret_cast<int *>(
commbuff[myrank + firstrank]))[flagoffset + threadIdx.x + firstrank]);
while (CHECK_IDS(*flag, basecounter)) {
}
}
__syncthreads();
int startblock = 0, endblock = numblocks;
for (int nblock = 0; nblock < endblock; nblock++) {
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
if (threadIdx.x == 0) {
__threadfence();
if (blockIdx.x)
gpuflag[op * NVTE_MAX_SMS * 2 + blockIdx.x] = nblock + basecounter + 1;
} else if (blockIdx.x == 0) {
int expecting = (basecounter + nblock + 1);
if (threadIdx.x < gridDim.x)
while (((volatile int *)gpuflag)[op * NVTE_MAX_SMS * 2 + threadIdx.x] < expecting) {
}
}
if (!blockIdx.x) {
asm volatile("bar.sync 15, %0;" ::"r"(32));
if (!threadIdx.x)
hostflags[0] = nblock + basecounter + 1;
}
}
int cachedflag = basecounter;
#define ALLGATHERFLAG NVTE_GF_IBSHARPDONE
if (blockIdx.x == 0 && threadIdx.x < RANKS) {
while (cachedflag < basecounter + numblocks) {
int newflag = ((volatile int *)gpuflag)[ALLGATHERFLAG];
if (newflag == cachedflag)
continue;
cachedflag = newflag;
flagptr[flagoffset + myrank + 32 + firstrank] = cachedflag;
}
}
if (blockIdx.x == 0 && threadIdx.x == 0)
gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks;
} else {
const int warp = blockIdx.x + (threadIdx.x >> 5);
int4 *userptr[RANKS];
int4 *userptrmyrank;
#pragma unroll
for (int i = 0; i < RANKS; i++)
userptr[i] = reinterpret_cast<int4 *>(
commbuff[((i + myrank + warp) & (RANKS - 1)) + handleridx + firstrank]);
userptrmyrank = reinterpret_cast<int4 *>(commbuff[myrank + handleridx + firstrank]);
__syncthreads();
int blocklineoffset = 0;
while (blocklineoffset < numlines) {
const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
const int blockstart = lineoffset + blocklineoffset + blocklines * myrank;
for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines;
line += REDUCETHREADS * gridDim.x) {
int4 val[RANKS];
#pragma unroll
for (int i = 0; i < RANKS; i++) {
val[i] = userptr[i][blockstart + line];
}
int4 sum = val[0];
half *s = reinterpret_cast<half *>(&sum);
#pragma unroll
for (int i = 1; i < RANKS; i++) {
half *x = reinterpret_cast<half *>(&val[i]);
#pragma unroll
for (int j = 0; j < sizeof(int4) / sizeof(half); j++)
s[j] += x[j];
}
userptrmyrank[blockstart + line] = sum;
} // single block loop
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
blocklineoffset += peerblocklines * RANKS;
} // block loop NVLINK-REDUCESCATTER
const int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1);
const int myblockDim = nwarps << 5;
const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1);
const int maxthreadIdx = myblockDim * (RANKS - 1) + 32;
const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1);
const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31);
volatile int *flag = (volatile int *)&((reinterpret_cast<int *>(
commbuff[myrank + firstrank]))[flagoffset + mydest + 32 + firstrank]);
int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)];
blocklineoffset = 0;
int gathercounter = basecounter + 1;
while (blocklineoffset < numlines) {
const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
const int blockstart = lineoffset + blocklineoffset;
#define UNROLL 6
int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest];
int4 *peerptr = &userptrmydest[blockstart + blocklines * mydest];
if (threadIdx.x < maxthreadIdx) {
const int start_elem = mythreadIdx + myblockDim * blockIdx.x;
const int end_elem = max(start_elem, blocklines);
const int aligned_elem = ((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) *
(myblockDim * gridDim.x * UNROLL);
const int end_aligned = start_elem + aligned_elem;
if (mythreadIdx == 0) {
while (CHECK_IDS(*flag, gathercounter)) {
}
gathercounter++;
}
asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim));
for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) {
int4 val[UNROLL];
#pragma unroll
for (int i = 0; i < UNROLL; i++)
val[i] = peerptr[line + i * myblockDim * gridDim.x];
#pragma unroll
for (int i = 0; i < UNROLL; i++)
myptr[line + i * myblockDim * gridDim.x] = val[i];
}
for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x)
myptr[line] = peerptr[line];
}
blocklineoffset += peerblocklines * RANKS;
} // block loop for NVLINK-ALLGATHER
} // worker warps else block
} // fp16 inplace reduce kernel with SHARP / in blocks
// threadfence and SMs sync to SM0
#define SMBAR(offset, block) \
asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x)); \
if (threadIdx.x == 0) { \
__threadfence_system(); \
if (blockIdx.x) \
gpuflag[offset + blockIdx.x] = block + basecounter + 1; \
} else if (blockIdx.x == 0) { \
int expecting = (basecounter + block + 1); \
if (threadIdx.x < gridDim.x) \
while (((volatile int *)gpuflag)[offset + threadIdx.x] < expecting) { \
} \
} \
if (blockIdx.x == 0) \
asm volatile("bar.sync 15, %0;" ::"r"(32));
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2(
const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks,
const int commbufoffset, const int flagoffset, const int firstrank, const int myrank,
const int gpustep, const int lineoffset, const int numlines, void **commbuff,
const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag,
const int numblocks) {
const int basecounter = gpuflag[NVTE_GF_STATE + op];
if (threadIdx.x < 32) {
int *flagptr;
volatile int *localflag = (volatile int *)&(
((int *)commbuff[gpustep * myrank + firstrank])[flagoffset]); // NOLINT(*)
// initial intranode barrier - once
if (threadIdx.x < RANKS) {
if (!blockIdx.x) {
flagptr = reinterpret_cast<int *>(commbuff[gpustep * threadIdx.x + firstrank]);
flagptr[flagoffset + gpustep * myrank + firstrank] = basecounter;
}
volatile int *flag = &localflag[gpustep * threadIdx.x + firstrank];
while (CHECK_IDS(*flag, basecounter)) {
}
}
__syncthreads();
for (int nblock = 0; nblock < numblocks + headstart; nblock++) {
if (nblock < numblocks) {
// RS happens here
SMBAR(op * 2 * NVTE_MAX_SMS, nblock);
if (!blockIdx.x && !threadIdx.x)
hostflags[NVTE_HF_NVRSDONE + (op & 1)] = nblock + basecounter + 1;
}
if (nblock >= headstart) {
for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32)
if (ibflag != myibrank)
while (localflag[NVTE_REG0_IBRS + ibflag] < basecounter + nblock - headstart + 1) {
}
asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x));
// REDUCE happens here
SMBAR(op * 2 * NVTE_MAX_SMS + NVTE_MAX_SMS, nblock - headstart);
if (!blockIdx.x && !threadIdx.x)
hostflags[NVTE_HF_NVREDUCEDONE + (op & 1)] = nblock + basecounter + 1 - headstart;
}
}
// final part doing NVAG based on responses from NIC-RMW:IBAG
if (blockIdx.x == 0) {
for (int nblock = 0; nblock < numblocks; nblock++) {
const int expected = basecounter + nblock + 1;
for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32)
if (ibflag != myibrank)
while (localflag[NVTE_REG0_IBAG + ibflag] < expected) {
}
asm volatile("bar.sync 15, %0;" ::"r"(32));
if (threadIdx.x < RANKS)
flagptr[flagoffset + gpustep * myrank + NVTE_MAX_NVLINK + firstrank] = expected;
}
}
if (blockIdx.x == 0 && threadIdx.x == 0)
gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks;
} else { // sync warp
// reducethreads
const int warp = blockIdx.x + (threadIdx.x >> 5);
int4 *userptr[RANKS];
int4 *userptrmyrank;
#pragma unroll
for (int i = 0; i < RANKS; i++)
userptr[i] = reinterpret_cast<int4 *>(
commbuff[((i + myrank + warp) & (RANKS - 1)) * gpustep + handleridx + firstrank]);
userptrmyrank = reinterpret_cast<int4 *>(commbuff[gpustep * myrank + handleridx + firstrank]);
int4 *internalbuf = reinterpret_cast<int4 *>(commbuff[myrank * gpustep + firstrank] +
commbufoffset * sizeof(int));
__syncthreads();
int blocklineoffset = 0, rblocklineoffset = 0;
for (int nblock = 0; nblock < numblocks + headstart; nblock++) {
// NVRS part(only first numblocks steps)
if (blocklineoffset < numlines) {
const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
const int blockstart = lineoffset + blocklineoffset + blocklines * myrank;
if (RANKS > 1) {
for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines;
line += REDUCETHREADS * gridDim.x) {
int4 val[RANKS];
#pragma unroll
for (int i = 0; i < RANKS; i++) {
val[i] = userptr[i][blockstart + line];
}
int4 sum = val[0];
half *s = reinterpret_cast<half *>(&sum);
#pragma unroll
for (int i = 1; i < RANKS; i++) {
half *x = reinterpret_cast<half *>(&val[i]);
#pragma unroll
for (int j = 0; j < sizeof(int4) / sizeof(half); j++)
s[j] += x[j];
}
userptrmyrank[blockstart + line] = sum;
} // single block loop
}
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
blocklineoffset += peerblocklines * RANKS;
}
if (nblock >= headstart) {
#define UNROLLRS 2
const int remainder = min(numlines - rblocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
rblocklineoffset += peerblocklines * RANKS;
const int ibblocklines = blocklines / ibranks;
int4 *tempbufptr = &internalbuf[((nblock - headstart) % maxcredit) * peerblocklines];
const int tempstart = lineoffset + (nblock - headstart) * peerblocklines * RANKS +
myrank * blocklines + ibblocklines * myibrank;
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < ibblocklines;
line += REDUCETHREADS * gridDim.x) {
int4 val[UNROLLRS];
#pragma unroll
for (int i = 0; i < UNROLLRS; i++)
val[i] = i == myibrank ? userptrmyrank[tempstart + line]
: tempbufptr[i * ibblocklines + line];
int4 sum = val[0];
half *s = reinterpret_cast<half *>(&sum);
for (int i = 0; i < ibranks - UNROLLRS; i++) {
val[i % UNROLLRS] = i == myibrank ? userptrmyrank[tempstart + line]
: tempbufptr[i * ibblocklines + line];
half *x = reinterpret_cast<half *>(&val[(i + 1) % UNROLLRS]);
#pragma unroll
for (int j = 0; j < 16; j++)
s[j] += x[j];
}
#pragma unroll
for (int i = 1; i < UNROLLRS; i++) {
half *x = reinterpret_cast<half *>(&val[i]);
#pragma unroll
for (int j = 0; j < 16; j++)
s[j] += x[j];
}
userptrmyrank[tempstart + line] = sum;
}
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
}
} // nblock loop NVLINK-REDUCESCATTER + IBREDUCE LOCAL COMPUTE
if (RANKS != 1) {
const int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1);
const int myblockDim = nwarps << 5;
const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1);
const int maxthreadIdx = myblockDim * (RANKS - 1) + 32;
const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1);
const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31);
volatile int *flag = (volatile int *)&((reinterpret_cast<int *>(
commbuff[gpustep * myrank + firstrank]))[flagoffset + gpustep * mydest + NVTE_MAX_NVLINK +
firstrank]);
int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)];
blocklineoffset = 0;
int gathercounter = basecounter + 1;
while (blocklineoffset < numlines) {
const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
const int blockstart = lineoffset + blocklineoffset;
#define UNROLL 6
int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest];
int4 *peerptr = &userptrmydest[blockstart + blocklines * mydest];
if (threadIdx.x < maxthreadIdx) {
const int start_elem = mythreadIdx + myblockDim * blockIdx.x;
const int end_elem = max(start_elem, blocklines);
const int aligned_elem = ((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) *
(myblockDim * gridDim.x * UNROLL);
const int end_aligned = start_elem + aligned_elem;
if (mythreadIdx == 0) {
while (CHECK_IDS(*flag, gathercounter)) {
}
gathercounter++;
}
asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim));
for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) {
int4 val[UNROLL];
#pragma unroll
for (int i = 0; i < UNROLL; i++)
val[i] = peerptr[line + i * myblockDim * gridDim.x];
#pragma unroll
for (int i = 0; i < UNROLL; i++)
myptr[line + i * myblockDim * gridDim.x] = val[i];
}
for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x)
myptr[line] = peerptr[line];
}
blocklineoffset += peerblocklines * RANKS;
} // block loop for NVLINK-ALLGATHER
} // RANKS!=1
} // worker warps else block
} // fp16 inplace reduce kernel with SHARP / in blocks
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2_rs(
const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks,
const int commbufoffset, const int flagoffset, const int firstrank, const int myrank,
const int gpustep, const int lineoffset, const int numlines, void **commbuff,
const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag,
const int numblocks) {
const int basecounter = gpuflag[NVTE_GF_STATE + op];
if (threadIdx.x < 32) {
int *flagptr;
volatile int *localflag = (volatile int *)&(
((int *)commbuff[gpustep * myrank + firstrank])[flagoffset]); // NOLINT(*)
// initial intranode barrier - once
if (threadIdx.x < RANKS) {
if (!blockIdx.x) {
flagptr = reinterpret_cast<int *>(commbuff[gpustep * threadIdx.x + firstrank]);
flagptr[flagoffset + gpustep * myrank + firstrank] = basecounter;
}
volatile int *flag = &localflag[gpustep * threadIdx.x + firstrank];
while (CHECK_IDS(*flag, basecounter)) {
}
}
__syncthreads();
for (int nblock = 0; nblock < numblocks + headstart; nblock++) {
if (nblock < numblocks) {
// RS happens here
SMBAR(op * 2 * NVTE_MAX_SMS, nblock);
if (!blockIdx.x && !threadIdx.x)
hostflags[NVTE_HF_NVRSDONE + (op & 1)] = nblock + basecounter + 1;
}
if (nblock >= headstart) {
for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32)
if (ibflag != myibrank)
while (localflag[NVTE_REG0_IBRS + ibflag] < basecounter + nblock - headstart + 1) {
}
asm volatile("bar.sync 13, %0;" ::"r"(blockDim.x));
// REDUCE happens here
SMBAR(op * 2 * NVTE_MAX_SMS + NVTE_MAX_SMS, nblock - headstart);
}
}
} else { // sync warp
// reducethreads
const int warp = blockIdx.x + (threadIdx.x >> 5);
int4 *userptr[RANKS];
int4 *userptrmyrank;
#pragma unroll
for (int i = 0; i < RANKS; i++)
userptr[i] = reinterpret_cast<int4 *>(
commbuff[((i + myrank + warp) & (RANKS - 1)) * gpustep + handleridx + firstrank]);
userptrmyrank = reinterpret_cast<int4 *>(commbuff[gpustep * myrank + handleridx + firstrank]);
int4 *internalbuf = reinterpret_cast<int4 *>(commbuff[myrank * gpustep + firstrank] +
commbufoffset * sizeof(int));
__syncthreads();
int blocklineoffset = 0, rblocklineoffset = 0;
for (int nblock = 0; nblock < numblocks + headstart; nblock++) {
// NVRS part(only first numblocks steps)
if (blocklineoffset < numlines) {
const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
const int blockstart = lineoffset + blocklineoffset + blocklines * myrank;
if (RANKS > 1) {
for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < blocklines;
line += REDUCETHREADS * gridDim.x) {
int4 val[RANKS];
#pragma unroll
for (int i = 0; i < RANKS; i++) {
val[i] = userptr[i][blockstart + line];
}
int4 sum = val[0];
half *s = reinterpret_cast<half *>(&sum);
#pragma unroll
for (int i = 1; i < RANKS; i++) {
half *x = reinterpret_cast<half *>(&val[i]);
#pragma unroll
for (int j = 0; j < sizeof(int4) / sizeof(half); j++)
s[j] += x[j];
}
userptrmyrank[blockstart + line] = sum;
} // single block loop
}
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
blocklineoffset += peerblocklines * RANKS;
}
if (nblock >= headstart) {
#define UNROLLRS 2
const int remainder = min(numlines - rblocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
rblocklineoffset += peerblocklines * RANKS;
const int ibblocklines = blocklines / ibranks;
int4 *tempbufptr = &internalbuf[((nblock - headstart) % maxcredit) * peerblocklines];
const int tempstart = lineoffset + (nblock - headstart) * peerblocklines * RANKS +
myrank * blocklines + ibblocklines * myibrank;
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
for (int line = threadIdx.x - 32 + REDUCETHREADS * blockIdx.x; line < ibblocklines;
line += REDUCETHREADS * gridDim.x) {
int4 val[UNROLLRS];
#pragma unroll
for (int i = 0; i < UNROLLRS; i++)
val[i] = i == myibrank ? userptrmyrank[tempstart + line]
: tempbufptr[i * ibblocklines + line];
int4 sum = val[0];
half *s = reinterpret_cast<half *>(&sum);
for (int i = 0; i < ibranks - UNROLLRS; i++) {
val[i % UNROLLRS] = i == myibrank ? userptrmyrank[tempstart + line]
: tempbufptr[i * ibblocklines + line];
half *x = reinterpret_cast<half *>(&val[(i + 1) % UNROLLRS]);
#pragma unroll
for (int j = 0; j < 16; j++)
s[j] += x[j];
}
#pragma unroll
for (int i = 1; i < UNROLLRS; i++) {
half *x = reinterpret_cast<half *>(&val[i]);
#pragma unroll
for (int j = 0; j < 16; j++)
s[j] += x[j];
}
userptrmyrank[tempstart + line] = sum;
}
asm volatile("bar.sync 13, %0;" ::"r"(REDUCETHREADS + 32));
}
} // nblock loop NVLINK-REDUCESCATTER + IBREDUCE LOCAL COMPUTE
} // worker warps else block
} // fp16 inplace reduce kernel with SHARP / in blocks
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_rr_blocked2_ag(
const int op, const int maxcredit, const int headstart, const int myibrank, const int ibranks,
const int commbufoffset, const int flagoffset, const int firstrank, const int myrank,
const int gpustep, const int lineoffset, const int numlines, void **commbuff,
const int handleridx, const int peerblocklines, int *hostflags, int *gpuflag,
const int numblocks) {
const int basecounter = gpuflag[NVTE_GF_STATE + op];
if (threadIdx.x < 32) {
int *flagptr;
volatile int *localflag = (volatile int *)&(
((int *)commbuff[gpustep * myrank + firstrank])[flagoffset]); // NOLINT(*)
if (threadIdx.x < RANKS) {
if (!blockIdx.x) {
flagptr = reinterpret_cast<int *>(commbuff[gpustep * threadIdx.x + firstrank]);
}
}
__syncthreads();
if (!blockIdx.x && !threadIdx.x)
hostflags[NVTE_HF_NVREDUCEDONE + (op & 1)] = numblocks + basecounter;
// tell CPU proxy all blocks are done and ready for NVAG
// final part doing NVAG based on responses from NIC-RMW:IBAG
if (blockIdx.x == 0) {
for (int nblock = 0; nblock < numblocks; nblock++) {
const int expected = basecounter + nblock + 1;
for (int ibflag = threadIdx.x; ibflag < ibranks; ibflag += 32)
if (ibflag != myibrank)
while (localflag[NVTE_REG0_IBAG + ibflag] < expected) {
}
asm volatile("bar.sync 15, %0;" ::"r"(32));
if (threadIdx.x < RANKS)
flagptr[flagoffset + gpustep * myrank + NVTE_MAX_NVLINK + firstrank] = expected;
}
}
if (blockIdx.x == 0 && threadIdx.x == 0)
gpuflag[NVTE_GF_STATE + op] = basecounter + numblocks;
} else { // sync warp
// reducethreads
const int warp = blockIdx.x + (threadIdx.x >> 5);
int4 *userptr[RANKS];
int4 *userptrmyrank;
#pragma unroll
for (int i = 0; i < RANKS; i++)
userptr[i] = reinterpret_cast<int4 *>(
commbuff[((i + myrank + warp) & (RANKS - 1)) * gpustep + handleridx + firstrank]);
userptrmyrank = reinterpret_cast<int4 *>(commbuff[gpustep * myrank + handleridx + firstrank]);
__syncthreads();
int blocklineoffset = 0, rblocklineoffset = 0;
if (RANKS != 1) {
const int nwarps = (REDUCETHREADS >> 5) / (RANKS - 1);
const int myblockDim = nwarps << 5;
const int mywarp = ((threadIdx.x - 32) >> 5) / (RANKS - 1);
const int maxthreadIdx = myblockDim * (RANKS - 1) + 32;
const int mydest = (myrank + 1 + ((threadIdx.x - 32) >> 5) % (RANKS - 1)) & (RANKS - 1);
const int mythreadIdx = (mywarp << 5) + (threadIdx.x & 31);
volatile int *flag = (volatile int *)&((reinterpret_cast<int *>(
commbuff[gpustep * myrank + firstrank]))[flagoffset + gpustep * mydest + NVTE_MAX_NVLINK +
firstrank]);
int4 *userptrmydest = userptr[((RANKS << 10) + mydest - myrank - warp) & (RANKS - 1)];
blocklineoffset = 0;
int gathercounter = basecounter + 1;
while (blocklineoffset < numlines) {
const int remainder = min(numlines - blocklineoffset, peerblocklines * RANKS);
const int blocklines = remainder / RANKS;
const int blockstart = lineoffset + blocklineoffset;
#define UNROLL 6
int4 *myptr = &userptrmyrank[blockstart + blocklines * mydest];
int4 *peerptr = &userptrmydest[blockstart + blocklines * mydest];
if (threadIdx.x < maxthreadIdx) {
const int start_elem = mythreadIdx + myblockDim * blockIdx.x;
const int end_elem = max(start_elem, blocklines);
const int aligned_elem = ((end_elem - start_elem) / (myblockDim * gridDim.x * UNROLL)) *
(myblockDim * gridDim.x * UNROLL);
const int end_aligned = start_elem + aligned_elem;
if (mythreadIdx == 0) {
while (CHECK_IDS(*flag, gathercounter)) {
}
gathercounter++;
}
asm volatile("bar.sync %0, %1;" ::"r"(1 + mydest), "r"(myblockDim));
for (int line = start_elem; line < end_aligned; line += myblockDim * gridDim.x * UNROLL) {
int4 val[UNROLL];
#pragma unroll
for (int i = 0; i < UNROLL; i++)
val[i] = peerptr[line + i * myblockDim * gridDim.x];
#pragma unroll
for (int i = 0; i < UNROLL; i++)
myptr[line + i * myblockDim * gridDim.x] = val[i];
}
for (int line = end_aligned; line < end_elem; line += myblockDim * gridDim.x)
myptr[line] = peerptr[line];
}
blocklineoffset += peerblocklines * RANKS;
} // block loop for NVLINK-ALLGATHER
} // RANKS!=1
} // worker warps else block
} // fp16 inplace allgather kernel with SHARP / in blocks
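// The allgather copy loop above splits its range into an UNROLL-aligned body
// plus a scalar tail. The same pattern in isolation (a sketch with a
// hypothetical device helper; stride plays the role of myblockDim * gridDim.x):
__device__ void strided_unrolled_copy(int4 *dst, const int4 *src, int start, int end, int stride) {
  constexpr int U = 6;  // matches UNROLL above
  const int aligned = ((end - start) / (stride * U)) * (stride * U);
  const int end_aligned = start + aligned;
  for (int line = start; line < end_aligned; line += stride * U) {
    int4 v[U];
#pragma unroll
    for (int i = 0; i < U; i++) v[i] = src[line + i * stride];  // batched loads
#pragma unroll
    for (int i = 0; i < U; i++) dst[line + i * stride] = v[i];  // then stores
  }
  for (int line = end_aligned; line < end; line += stride)  // scalar tail
    dst[line] = src[line];
}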
__global__ void userbuffers_fp16_sum_inplace_gpu_null(const int op, int *hostflags, int *gpuflag,
int numblocks) {
const int basecounter = gpuflag[NVTE_GF_STATE + op] + numblocks;
hostflags[0] = basecounter;
gpuflag[NVTE_GF_STATE + op] = basecounter;
while (((volatile int *)gpuflag)[NVTE_GF_IBSHARPDONE] < basecounter) {
}
}
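// The null kernel above serves the single-GPU case: it advances the op
// counter through hostflags for the CPU proxy and spins until the proxy
// reports IB/SHARP completion. The device-spins-on-host-written-flag pattern
// in isolation (a sketch; the flag is assumed to be mapped host memory):
__global__ void spin_on_hostflag_sketch(volatile int *flag, int expected) {
  while (*flag < expected) {  // host proxy eventually stores >= expected
  }
}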
#define callranks_block(x) \
if (comm->ar_nvsize == x) \
userbuffers_fp16_sum_inplace_gpu_rr_blocked<x><<<sms, warps * 32, 0, stream>>>( \
userbuffers_allreduceop_sharp, NVTE_REG0_OFFSET(comm), comm->ar_firstgpu, comm->ar_nvrank, \
offset / 8, elements / 8, reinterpret_cast<void **>(comm->gpu_ptrs), \
handler * comm->nvsize, blocksize / sizeof(int4) / comm->ar_nvsize, \
reinterpret_cast<int *>(comm->hostflags), comm->flags, \
(elements * 2 + blocksize - 1) / blocksize);
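// The last macro argument is the block count: a ceiling division of the
// payload size in bytes (fp16 => 2 bytes per element) by blocksize. A sketch
// with a worked example (hypothetical helper):
constexpr int ub_numblocks(int elements, int blocksize) {
  return (elements * 2 + blocksize - 1) / blocksize;  // ceil(2 * elements / blocksize)
}
static_assert(ub_numblocks(1 << 20, 1 << 19) == 4, "2 MiB of fp16 in 512 KiB blocks");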
#define callranks2_block(x) \
if (ar_nvsize == x) { \
int numblocks = (elements * 2 + blocksize - 1) / blocksize; \
int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \
if (headstart > maxcredit) \
headstart = maxcredit; \
if (x == 1) \
headstart = maxcredit; \
if (headstart > numblocks) \
headstart = numblocks; \
if (headstart == 0) \
headstart = 1; \
userbuffers_fp16_sum_inplace_gpu_rr_blocked2<x><<<sms, warps * 32, 0, stream>>>( \
op, maxcredit, headstart, my_node, num_nodes, \
NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \
(op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \
NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \
offset / 8, elements / 8, reinterpret_cast<void **>(comm->gpu_ptrs), \
handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \
reinterpret_cast<int *>(comm->hostflags), comm->flags, numblocks); \
}
#define callranks2_block_rs(x) \
if (ar_nvsize == x) { \
int numblocks = (elements * 2 + blocksize - 1) / blocksize; \
int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \
if (headstart > maxcredit) \
headstart = maxcredit; \
if (x == 1) \
headstart = maxcredit; \
if (headstart > numblocks) \
headstart = numblocks; \
if (headstart == 0) \
headstart = 1; \
userbuffers_fp16_sum_inplace_gpu_rr_blocked2_rs<x><<<sms, warps * 32, 0, stream>>>( \
op, maxcredit, headstart, my_node, num_nodes, \
NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \
(op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \
NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \
offset / 8, elements / 8, reinterpret_cast<void **>(comm->gpu_ptrs), \
handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \
reinterpret_cast<int *>(comm->hostflags), comm->flags, numblocks); \
}
#define callranks2_block_ag(x) \
if (ar_nvsize == x) { \
int numblocks = (elements * 2 + blocksize - 1) / blocksize; \
int headstart = numblocks - 1; /*<3?numblocks-1:3;*/ \
if (headstart > maxcredit) \
headstart = maxcredit; \
if (x == 1) \
headstart = maxcredit; \
if (headstart > numblocks) \
headstart = numblocks; \
if (headstart == 0) \
headstart = 1; \
userbuffers_fp16_sum_inplace_gpu_rr_blocked2_ag<x><<<sms, warps * 32, 0, stream>>>( \
op, maxcredit, headstart, my_node, num_nodes, \
NVTE_REG0_OFFSET(comm) + NVTE_REG0_FLAGS + \
(op == userbuffers_allreduceop_nonsharp ? NVTE_REG0_COMMBUFFER : 0), \
NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * op, ar_firstgpu, ar_nvrank, ar_step, \
offset / 8, elements / 8, reinterpret_cast<void **>(comm->gpu_ptrs), \
handler * comm->nvsize, blocksize / sizeof(int4) / ar_nvsize, \
reinterpret_cast<int *>(comm->hostflags), comm->flags, numblocks); \
}
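// All three callranks2_block* variants clamp the pipeline head start the same
// way: numblocks - 1, capped by maxcredit, forced to maxcredit when the
// NVLink domain has a single GPU, then kept within [1, numblocks]. Equivalent
// logic as a sketch (hypothetical helper, C++14 constexpr):
constexpr int ub_headstart(int numblocks, int maxcredit, int nvsize) {
  int h = numblocks - 1;
  if (h > maxcredit) h = maxcredit;
  if (nvsize == 1) h = maxcredit;
  if (h > numblocks) h = numblocks;
  return h == 0 ? 1 : h;
}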
#define callranks(x) \
if (ar_nvsize == x) { \
int arg1 = op - NVTE_MAX_OPS, \
arg2 = NVTE_REG0_OFFSET(comm) - \
(op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \
NVTE_MAX_OPS, \
arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg6 = offset / 8, \
arg7 = elements / 8; \
void **arg8 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg9 = handler * comm->nvsize; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, \
reinterpret_cast<void *>(comm->use_rr_kernel ? userbuffers_fp16_sum_inplace_gpu_rr<x> \
: userbuffers_fp16_sum_inplace_gpu_rw<x>), \
kernelArgs)); \
}
#define callranksMC(x) \
if (ar_nvsize == x) { \
int arg1 = op - NVTE_MAX_OPS, \
arg2 = NVTE_REG0_OFFSET(comm) - \
(op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \
NVTE_MAX_OPS, \
arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg6 = offset / 8, \
arg7 = elements / 8; \
void **arg8 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg9 = handler * comm->nvsize; \
void *arg10 = comm->mc_ptr[handler]; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_mc<x>), kernelArgs)); \
}
#define SETUP_LAUNCH_CONFIG(sms, threads, stream) \
cudaLaunchConfig_t cfg = {sms, threads, 0, stream, NULL, 0}; \
cudaLaunchAttribute attribute_ub[2]; \
......@@ -2279,60 +1441,6 @@ __global__ void userbuffers_fp16_sum_inplace_gpu_null(const int op, int *hostfla
cfg.attrs = attribute_ub; \
cfg.numAttrs = comm->sm_arch >= 9 ? 2 : 1;
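// Every callranks* macro pairs SETUP_LAUNCH_CONFIG with cudaLaunchKernelExC,
// which takes the kernel parameters as an array of pointers to the argument
// values. The bare launch pattern in isolation (hypothetical kernel and values):
__global__ void launch_pattern_sketch(int a, float b) { (void)a; (void)b; }
static void launch_pattern_demo(cudaStream_t stream) {
  cudaLaunchConfig_t cfg = {1, 32, 0, stream, NULL, 0};  // grid, block, smem, stream, attrs
  int a = 42;
  float b = 1.0f;
  void *args[] = {reinterpret_cast<void *>(&a), reinterpret_cast<void *>(&b)};
  CUDACHECK(cudaLaunchKernelExC(&cfg, reinterpret_cast<void *>(launch_pattern_sketch), args));
}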
int allreduce_userbuff_inplace_gpu(const int handler, const int offset, const int elements,
const int blocksize, communicator *comm, cudaStream_t stream) {
// schedule GPU kernel only
// CPU/SHARP part is responsibility of caller
const int ar_step = comm->ar2_nvsize;
const int op = userbuffers_allreduceop_nonsharp;
const int ar_nvsize = comm->nvsize;
const int ar_firstgpu = comm->ar_firstgpu;
const int ar_nvrank = comm->ar_nvrank;
if (elements < 8)
return 0;
int sms = comm->sms;
int warps = comm->threads / 32;
if (warps < comm->ar_nvsize)
warps = comm->ar_nvsize;
if (comm->launch_mode & NVTE_LAUNCH_GPU) {
if (comm->ar_nvsize == 1)
userbuffers_fp16_sum_inplace_gpu_null<<<1, 1, 0, stream>>>(
userbuffers_allreduceop_sharp, reinterpret_cast<int *>(comm->hostflags), comm->flags,
(elements * 2 + blocksize - 1) / blocksize);
callranks_block(2) callranks_block(4) callranks_block(8)
}
return sms;
}
int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset,
const int elements, const int blocksize, communicator *comm,
cudaStream_t stream, int op) {
// schedule GPU kernel only
// CPU/SHARP part is responsibility of caller
const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes;
const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize;
const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank;
if (elements < 8)
return 0;
int sms = ar_nvsize == 1 ? 2 : comm->sms;
int warps = comm->threads / 32;
if (warps < ar_nvsize)
warps = ar_nvsize;
if (num_nodes > 1) {
callranks2_block(1) callranks2_block(2) callranks2_block(4) callranks2_block(8)
} else {
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
callranks(2) callranks(4) callranks(8)
}
return sms;
}
#define callranks_ag(x) \
if (ar_nvsize == x) { \
int arg1 = op - NVTE_MAX_OPS, \
......@@ -2343,11 +1451,12 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
arg6 = offset / 8 + (comm->use_rr_kernel ? 0 : arg4 * arg7); \
void **arg8 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg9 = handler * comm->nvsize; \
uint64_t arg10 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9)}; \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, \
reinterpret_cast<void *>(comm->use_rr_kernel ? userbuffers_fp16_sum_inplace_gpu_rr_ag<x> \
......@@ -2366,11 +1475,13 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
void **arg8 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg9 = handler * comm->nvsize; \
uint4 *arg10 = reinterpret_cast<uint4 *>(comm->mc_ptr[handler]); \
uint64_t arg11 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10)}; \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_mc_ag<x>), kernelArgs)); \
}
......@@ -2385,11 +1496,12 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
arg6 = offset / 8 + arg4 * arg7; \
void **arg8 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg9 = handler * comm->nvsize; \
uint64_t arg10 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9)}; \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_rr_rs<x>), kernelArgs)); \
}
......@@ -2405,11 +1517,13 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
void **arg8 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg9 = handler * comm->nvsize; \
void *arg10 = comm->mc_ptr[handler]; \
uint64_t arg11 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10)}; \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_mc_rs<x>), kernelArgs)); \
}
......@@ -2425,12 +1539,14 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
void **arg10 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg11 = handler * comm->nvsize; \
void *arg12 = output; \
uint64_t arg13 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12)}; \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12), \
reinterpret_cast<void *>(&arg13)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop<x>), \
kernelArgs)); \
......@@ -2448,13 +1564,14 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
int arg11 = handler * comm->nvsize; \
void *arg12 = output; \
float *arg13 = scale; \
uint64_t arg14 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12), \
reinterpret_cast<void *>(&arg13)}; \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, \
reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_fp8<x, fp8type>), \
......@@ -2473,13 +1590,14 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
int arg11 = handler * comm->nvsize; \
void *arg12 = output; \
void *arg13 = comm->mc_ptr[handler]; \
uint64_t arg14 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12), \
reinterpret_cast<void *>(&arg13)}; \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_mc_rs_oop<x>), \
kernelArgs)); \
......@@ -2500,6 +1618,7 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
float *arg14 = scale; \
void *arg15 = counters; \
int arg16 = numchunks, arg17 = atomicindex; \
uint64_t arg18 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
......@@ -2508,7 +1627,7 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12), \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14), \
reinterpret_cast<void *>(&arg15), reinterpret_cast<void *>(&arg16), \
reinterpret_cast<void *>(&arg17)}; \
reinterpret_cast<void *>(&arg17), reinterpret_cast<void *>(&arg18)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, \
reinterpret_cast<void *>( \
......@@ -2527,46 +1646,18 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
void **arg10 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg11 = handler * comm->nvsize; \
void *arg12 = output; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride<x>), \
kernelArgs)); \
}
#if 0
#define callranks_rs_oop_stride_atomic_fp8(x) \
if (ar_nvsize == x) { \
int arg1 = op - NVTE_MAX_OPS, \
arg2 = NVTE_REG0_OFFSET(comm) - \
(op == userbuffers_allreduceop_nonsharp ? 2 : 1) * NVTE_REG0_SINGLENODE + \
NVTE_MAX_OPS, \
arg3 = ar_firstgpu, arg4 = ar_nvrank, arg5 = ar_step, arg7 = elements / 16 / x, \
arg6 = offset / 16, arg8 = rowelements / 8, arg9 = strideelements / 8, arg10 = numchunks; \
void **arg11 = reinterpret_cast<void **>(comm->gpu_ptrs); \
int arg12 = handler * comm->nvsize; \
void *arg13 = output; \
void *arg14 = counters; \
float *arg15 = scale; \
uint64_t arg13 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12), \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14), \
reinterpret_cast<void *>(&arg15)}; \
reinterpret_cast<void *>(&arg13)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, \
reinterpret_cast<void *>( \
userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic_fp8<x, fp8type>), \
&cfg, reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride<x>), \
kernelArgs)); \
}
#endif
#define callranks_rs_oop_stride_atomic(x) \
if (ar_nvsize == x) { \
......@@ -2580,13 +1671,15 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
int arg12 = handler * comm->nvsize; \
void *arg13 = output; \
void *arg14 = counters; \
uint64_t arg15 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12), \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14)}; \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14), \
reinterpret_cast<void *>(&arg15)}; \
CUDACHECK(cudaLaunchKernelExC( \
&cfg, \
reinterpret_cast<void *>(userbuffers_fp16_sum_inplace_gpu_rr_rs_oop_stride_atomic<x>), \
......@@ -2605,13 +1698,15 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
int arg12 = handler * comm->nvsize; \
void *arg13 = output; \
void *arg14 = counters; \
uint64_t arg15 = comm->ub_timeout; \
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2), \
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4), \
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6), \
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8), \
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10), \
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12), \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14)}; \
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14), \
reinterpret_cast<void *>(&arg15)}; \
CUDACHECK( \
cudaLaunchKernelExC(&cfg, \
reinterpret_cast<void *>( \
......@@ -2619,47 +1714,12 @@ int allreduce2_userbuff_inplace_gpu(const int maxcredit, const int handler, cons
kernelArgs)); \
}
int reducescatter2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset,
const int elements, const int blocksize, communicator *comm,
cudaStream_t stream, int op) {
// schedule GPU kernel only
// CPU/SHARP part is responsibility of caller
const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes;
const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize;
const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank;
if (elements < 8)
return 0;
int sms = ar_nvsize == 1 ? 2 : comm->sms;
int warps = comm->threads / 32;
if (warps < ar_nvsize)
warps = ar_nvsize;
if (num_nodes > 1) {
callranks2_block_rs(1) callranks2_block_rs(2) callranks2_block_rs(4) callranks2_block_rs(8)
} else {
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
if (comm->use_mc && (comm->memflags[handler] & UB_MEM_MC_CREATED)) {
callranks_rsMC(2) callranks_rsMC(4) callranks_rsMC(8)
} else {
callranks_rs(2) callranks_rs(4) callranks_rs(8)
}
}
return sms;
}
void reducescatter2_userbuff_strided(void *output, const int handler, const int offset,
const int rowelements, const int colelements,
const int strideelements, communicator *comm,
cudaStream_t stream) {
const int elements = rowelements * colelements;
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements * 2;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2683,7 +1743,6 @@ void reducescatter2_userbuff_strided_atomic(void *output, const int handler, con
cudaStream_t stream) {
const int elements = rowelements * colelements;
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements * 2;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2702,36 +1761,6 @@ void reducescatter2_userbuff_strided_atomic(void *output, const int handler, con
callranks_rs_oop_stride_atomic(8)
}
#if 0
template<typename fp8type>
void reducescatter2_userbuff_strided_atomic_fp8(
void* output, float *scale, const int handler, const int offset, const int rowelements,
const int colelements, const int strideelements, const int numchunks, void *counters,
communicator* comm, cudaStream_t stream) {
const int elements = rowelements*colelements;
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements;
const int ar_firstgpu = op == userbuffers_allreduceop_nonsharp ?
comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ?
1 : comm->ar2_nvsize;
const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ?
comm->ar_nvsize : comm->ar2_nvsize;
const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ?
comm->ar_nvrank : comm->ar2_nvrank;
assert(comm->sm_arch >= 9);
if (elements < 128) return;
int sms = ar_nvsize == 1 ? 2 : comm->sms;
int warps = comm->threads/32;
if (warps < ar_nvsize) warps = ar_nvsize;
SETUP_LAUNCH_CONFIG(sms, warps*32, stream);
callranks_rs_oop_stride_atomic_fp8(2)
callranks_rs_oop_stride_atomic_fp8(4)
callranks_rs_oop_stride_atomic_fp8(8)
}
#endif
template <typename fp8type>
void reducescatter2_userbuff_strided_universal_fp8(void *output, float *scale, const int handler,
const int offset, const int rowelements,
......@@ -2742,7 +1771,6 @@ void reducescatter2_userbuff_strided_universal_fp8(void *output, float *scale, c
communicator *comm, cudaStream_t stream) {
const int elements = rowelements * colelements;
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2771,6 +1799,7 @@ void reducescatter2_userbuff_strided_atomic_fp8(void *output, float *scale, cons
output, scale, handler, offset, rowelements, colelements, strideelements_out,
strideelements_in, 1, numchunks, counters /*nullptr*/, comm, stream);
}
template <typename fp8type>
void reducescatter2_userbuff_strided_multiatomic_fp8(
void *output, float *scale, const int handler, const int offset, const int rowelements,
......@@ -2788,7 +1817,6 @@ void reducescatter2_userbuff_strided_multiatomic(void *output, const int handler
cudaStream_t stream) {
const int elements = rowelements * colelements;
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements * 2;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2803,56 +1831,13 @@ void reducescatter2_userbuff_strided_multiatomic(void *output, const int handler
warps = ar_nvsize;
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
// if(comm->use_mc && (comm->memflags[handler] & NVTE_UB_MEM_MC_CREATED)) {
// //callranks_rs_oopMC(2)
// //callranks_rs_oopMC(4)
// //callranks_rs_oopMC(8)
// } else {
// if(comm->memflags[handler] & NVTE_UB_MEM_UC_CONTIG) {
// //callranks_rs_oopUCPTR(2)
// //callranks_rs_oopUCPTR(4)
// //callranks_rs_oopUCPTR(8)
// } else {
callranks_rs_oop_stride_multiatomic(2) callranks_rs_oop_stride_multiatomic(4)
callranks_rs_oop_stride_multiatomic(8)
// }
//}
}
int allgather2_userbuff_inplace_gpu(const int maxcredit, const int handler, const int offset,
const int elements, const int blocksize, communicator *comm,
cudaStream_t stream, int op) {
// schedule GPU kernel only
// CPU/SHARP part is responsibility of caller
const int num_nodes = op == userbuffers_allreduceop_nonsharp ? comm->num_nodes : comm->num2_nodes;
const int my_node = op == userbuffers_allreduceop_nonsharp ? comm->my_node : comm->my2_node;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
const int ar_nvsize = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvsize : comm->ar2_nvsize;
const int ar_nvrank = op == userbuffers_allreduceop_nonsharp ? comm->ar_nvrank : comm->ar2_nvrank;
if (elements < 8)
return 0;
int sms = ar_nvsize == 1 ? 2 : comm->sms;
int warps = comm->threads / 32;
if (warps < ar_nvsize)
warps = ar_nvsize;
if (num_nodes > 1) {
callranks2_block_ag(1) callranks2_block_ag(2) callranks2_block_ag(4) callranks2_block_ag(8)
} else {
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
callranks_ag(2) callranks_ag(4) callranks_ag(8)
}
return sms;
}
void allgather2_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream) {
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements * 2;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2892,7 +1877,6 @@ void allgather2_userbuff_inplace_sliced(const int handler, const int offset, con
void reducescatter2_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream) {
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements * 2;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2919,7 +1903,6 @@ void reducescatter2_userbuff_stridedoutput(void *output, const int handler, cons
cudaStream_t stream) {
const int elements = rowelements * colelements;
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements * 2;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2952,7 +1935,6 @@ void reducescatter2_userbuff_stridedoutput_fp8(void *output, float *scale, const
communicator *comm, cudaStream_t stream) {
const int elements = rowelements * colelements;
const int op = userbuffers_allreduceop_nonsharp2;
const int blocksize = elements;
const int ar_firstgpu =
op == userbuffers_allreduceop_nonsharp ? comm->ar_firstgpu : comm->ar2_firstgpu;
const int ar_step = op == userbuffers_allreduceop_nonsharp2 ? 1 : comm->ar2_nvsize;
......@@ -2980,92 +1962,36 @@ void reducescatter2_userbuff_fp8(void *output, float *scale, const int handler,
template void reducescatter2_userbuff_fp8<__nv_fp8_e5m2>(void *output, float *scale,
const int handler, const int offset,
const int elements, communicator *comm,
cudaStream_t stream = 0);
cudaStream_t stream);
template void reducescatter2_userbuff_fp8<__nv_fp8_e4m3>(void *output, float *scale,
const int handler, const int offset,
const int elements, communicator *comm,
cudaStream_t stream = 0);
#if 0
template void reducescatter2_userbuff_strided_atomic_fp8<__nv_fp8_e4m3>(
void* output, float *scale, const int handler, const int offset,
const int rowelements, const int colelements, const int strideelements,
const int numchunks, void *counters, communicator* comm, cudaStream_t stream = 0);
#endif
cudaStream_t stream);
template void reducescatter2_userbuff_strided_atomic_fp8<__nv_fp8_e4m3>(
void *output, float *scale, const int handler, const int offset, const int rowelements,
const int colelements, const int strideelements_out, const int strideelements_in,
const int numchunks, void *counters, communicator *comm, cudaStream_t stream = 0);
const int numchunks, void *counters, communicator *comm, cudaStream_t stream);
template void reducescatter2_userbuff_strided_multiatomic_fp8<__nv_fp8_e4m3>(
void *output, float *scale, const int handler, const int offset, const int rowelements,
const int colelements, const int strideelements_out, const int strideelements_in,
const int numchunks, void *counters, communicator *comm, cudaStream_t stream = 0);
__global__ void __launch_bounds__(MAX_THREADS)
kuserbuffers_pullsendrecv(int myrank, int peer, int *recv_id, int *send_flagptr,
int *recv_flagptr, int4 *srcptr, int4 *dstptr, const int lines) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
atomicAdd_system(send_flagptr, 1);
}
#define UNROLLCOPY 8
const int start_elem = threadIdx.x + blockDim.x * blockIdx.x;
const int end_elem = lines;
const int aligned_elem = (end_elem - start_elem) & (~(blockDim.x * gridDim.x * UNROLLCOPY - 1));
const int end_aligned = start_elem + aligned_elem;
if (threadIdx.x == 0) {
const int signal_id = (*recv_id) + 1;
volatile int *flag = (volatile int *)recv_flagptr;
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d from %d] pullrecv: expected %d, stuck with %d\n", myrank, peer, signal_id,
*flag);
break;
}
}
if (lines == 0) {
*recv_id = signal_id;
return;
} // otherwise need an extra kernel
}
__syncthreads();
if (end_elem <= start_elem)
return;
for (int line = start_elem; line < end_aligned; line += blockDim.x * gridDim.x * UNROLLCOPY) {
int4 val[UNROLLCOPY];
#pragma unroll
for (int i = 0; i < UNROLLCOPY; i++)
val[i] = srcptr[line + i * blockDim.x * gridDim.x];
#pragma unroll
for (int i = 0; i < UNROLLCOPY; i++)
dstptr[line + i * blockDim.x * gridDim.x] = val[i];
}
for (int line = end_aligned; line < end_elem; line += blockDim.x * gridDim.x)
dstptr[line] = srcptr[line];
}
const int numchunks, void *counters, communicator *comm, cudaStream_t stream);
__global__ void kuserbuffers_pullsend(int myrank, int peer, int *send_id, int *flagptr) {
atomicAdd_system(flagptr, 1);
}
__global__ void kuserbuffers_inc(int *id) {
const int signal_id = (*id) + 1;
*id = signal_id;
}
__global__ void kuserbuffers_proxysend(int *id, int *hostflag) {
const int signal_id = (*id) + 1;
*hostflag = signal_id;
*id = signal_id;
atomicAdd(id, 1);
}
__global__ void kuserbuffers_dummy(void) {}
__global__ void __launch_bounds__(MAX_THREADS)
kuserbuffers_pullrecv(int myrank, int peer, int *recv_id, int *flagptr, int4 *srcptr,
int4 *dstptr, const int lines) {
kuserbuffers_pullrecv(int myrank, int peer, int nvrank, int nvpeer, int *recv_id, int *flagptr,
int4 *srcptr, int4 *dstptr, const int lines,
uint64_t ub_timeout) {
#define UNROLLCOPY 8
const int start_elem = threadIdx.x + blockDim.x * blockIdx.x;
const int end_elem = lines;
......@@ -3077,9 +2003,9 @@ __global__ void __launch_bounds__(MAX_THREADS)
volatile int *flag = (volatile int *)flagptr;
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (clock64() - s > TIMEOUT) {
printf("[%d from %d] pullrecv: expected %d, stuck with %d\n", myrank, peer, signal_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pullrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: expected %d,"
" observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
break;
}
}
......@@ -3138,7 +2064,12 @@ __global__ void __launch_bounds__(MAX_THREADS)
}
}
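// CHECK_TIMEOUT and UB_PRINT used above are defined elsewhere in this file;
// shown here only as a hedged sketch of their contracts (elapsed clock64()
// cycles compared against ub_timeout, and a prefixed device-side printf):
//   #define CHECK_TIMEOUT(start, timeout) \
//     ((clock64() - (start)) > static_cast<clock_t>(timeout))
//   #define UB_PRINT(fmt, ...) printf("[UB] " fmt "\n", __VA_ARGS__)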
__global__ void kuserbuffers_pushrecv(int myrank, int peer, int *recv_id, int *flagptr, int adder) {
#define CHECK_CE(ce_start, ce_end) ((ce_start) != nullptr && (ce_end) != nullptr && \
*(ce_start) != *(ce_end))
__global__ void kuserbuffers_pushrecv(int myrank, int peer, int nvrank, int nvpeer, int *recv_id,
int *flagptr, int adder, uint64_t ub_timeout,
int *ce_start_ptr, int *ce_end_ptr) {
const int signal_id = (*recv_id) + adder;
*recv_id = signal_id;
volatile int *flag = (volatile int *)flagptr;
......@@ -3146,8 +2077,12 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int *recv_id, int *f
return;
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (clock64() - s > TIMEOUT) {
printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id, *flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d] : "
"expected %d, observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
*ce_start_ptr, *ce_end_ptr);
return;
}
}
......@@ -3155,8 +2090,9 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int *recv_id, int *f
__global__ void __launch_bounds__(MAX_THREADS)
kuserbuffers_pushsendrecv(int *send_id, int *send_flagptr, int4 *srcptr, int4 *dstptr,
const int lines, int myrank, int peer, int *recv_id,
int *recv_flagptr, int adder) {
const int lines, int send_peer, int recv_peer, int *recv_id,
int *recv_flagptr, int adder, uint64_t ub_timeout,
int nv_send, int nv_recv, int *ce_start_ptr, int *ce_end_ptr) {
if (lines) {
const int start_elem = threadIdx.x + blockDim.x * blockIdx.x;
const int end_elem = lines;
......@@ -3197,9 +2133,13 @@ __global__ void __launch_bounds__(MAX_THREADS)
return;
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (clock64() - s > TIMEOUT) {
printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id,
*flag);
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer: %d"
" receiving peer: %d]: expected %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag);
if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
*ce_start_ptr, *ce_end_ptr);
return;
}
}
......@@ -3208,8 +2148,10 @@ __global__ void __launch_bounds__(MAX_THREADS)
__global__ void __launch_bounds__(MAX_THREADS)
kuserbuffers_pushsendrecv_atomic(int *send_id, int *send_flagptr, int4 *srcptr, int4 *dstptr,
const int lines, int myrank, int peer, int *recv_id,
int *recv_flagptr, int adder, void *counters) {
const int lines, int send_peer, int recv_peer, int *recv_id,
int *recv_flagptr, int adder, void *counters,
uint64_t ub_timeout, int nv_send, int nv_recv,
int *ce_start_ptr, int *ce_end_ptr) {
if (lines) {
const int start_elem = threadIdx.x + blockDim.x * blockIdx.x;
const int end_elem = lines;
......@@ -3246,12 +2188,15 @@ __global__ void __launch_bounds__(MAX_THREADS)
const int signal_id = (*recv_id) + adder;
*recv_id = signal_id;
volatile int *flag = (volatile int *)recv_flagptr;
// if(*flag>=signal_id) return;
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (clock64() - s > TIMEOUT) {
printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id,
*flag); /*return;*/
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv atomic [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer:"
" %d receiving peer: %d]: expected %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushsendrecv atomic: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
*ce_start_ptr, *ce_end_ptr);
}
}
......@@ -3265,13 +2210,14 @@ __global__ void __launch_bounds__(MAX_THREADS)
__global__ void __launch_bounds__(MAX_THREADS)
kuserbuffers_pushsendrecv_multiatomic(int *send_id, int *send_flagptr, int4 *srcptr,
int4 *dstptr, const int lines, int myrank, int peer,
int *recv_id, int *recv_flagptr, int adder,
int4 *dstptr, const int lines, int send_peer,
int recv_peer, int *recv_id, int *recv_flagptr, int adder,
void *counters, int nchunks, int send_stride,
int recv_stride, bool shuffle) {
int recv_stride, bool shuffle,
uint64_t ub_timeout, int nv_send, int nv_recv) {
for (int chunk_i = 0; chunk_i < nchunks - 1; chunk_i++) {
int send_chunk_id = shuffle ? chunk_i : (nchunks + myrank - chunk_i) % nchunks;
int recv_chunk_id = shuffle ? chunk_i + 1 : (nchunks + myrank - chunk_i - 1) % nchunks;
int send_chunk_id = shuffle ? chunk_i : (nchunks + send_peer - chunk_i) % nchunks;
int recv_chunk_id = shuffle ? chunk_i + 1 : (nchunks + send_peer - chunk_i - 1) % nchunks;
int send_offset = (send_chunk_id * send_stride) / 16;
int recv_offset = ((shuffle ? recv_chunk_id : send_chunk_id) * recv_stride) / 16;
......@@ -3313,12 +2259,14 @@ __global__ void __launch_bounds__(MAX_THREADS)
const int signal_id = (*recv_id) + adder;
*recv_id = signal_id;
volatile int *flag = (volatile int *)recv_flagptr;
// if(*flag>=signal_id) return;
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (clock64() - s > TIMEOUT) {
printf("%d from %d] pushrecv: expected %d, stuck with %d\n", myrank, peer, signal_id,
*flag); /*return;*/
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv multiatomic [sending peer:%d receiving peer:%d][nvrank(GPU)"
" sending peer: %d receiving peer: %d]: expected %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
// CE mode is not supported for multi-atomic, so there is no need to check for a deadlock
return;
}
}
}
......@@ -3334,9 +2282,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
// sync all CTAs before moving to next chunk.
if (threadIdx.x == 0) {
int old_val2;
atomicInc(((unsigned int *)counters) + nchunks + chunk_i, gridDim.x - 1);
while (0 != (old_val2 = atomicCAS(((unsigned int *)counters) + nchunks + chunk_i, 0, 0))) {
while (0 != (atomicCAS(((unsigned int *)counters) + nchunks + chunk_i, 0, 0))) {
}
}
__syncthreads();
......@@ -3352,50 +2299,56 @@ __global__ void __launch_bounds__(MAX_THREADS)
} \
} while (0)
// Return TRUE if two ranks share the same NV domain
#define INTRANODE(peer) ((peer / comm->nvsize) == (comm->myrank / comm->nvsize))
// Index corresponds to the type of flag:
// 0 - Send index counter
// 1 - CE start index counter
// 2 - CE end index counter
#define GET_SEND_PTR_BY_INDEX(peerlocal, comm, dsth, index) \
((reinterpret_cast<char *>((comm)->peer_ptr[0][(peerlocal)])) + \
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + \
(comm)->myrank * NVTE_MAX_REGIONS + (dsth) + \
(index) * NVTE_MAX_NVLINK * NVTE_MAX_REGIONS) * \
sizeof(int)))
// Index corresponds to the type of flag:
// 0 - Receive index counter
// 1 - CE start index counter
// 2 - CE end index counter
#define GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsth, index) \
((reinterpret_cast<char *>((comm)->mem_ptr[0])) + \
((NVTE_REG0_OFFSET(comm) + \
NVTE_REG0_RECV + (recv_peer) * NVTE_MAX_REGIONS + \
(dsth) + (index) * NVTE_MAX_NVLINK * NVTE_MAX_REGIONS) * \
sizeof(int)))
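// Worked example of the indexed flag addressing above, assuming illustrative
// values NVTE_MAX_REGIONS == 16 and NVTE_MAX_NVLINK == 32: for myrank == 0,
// dsth == 2 and index == 1 (CE start counter), the send pointer lands
// (0 * 16 + 2 + 1 * 32 * 16) ints past NVTE_REG0_OFFSET + NVTE_REG0_RECV.
// Each counter type occupies its own NVTE_MAX_NVLINK * NVTE_MAX_REGIONS
// stride of ints, which is why NVTE_REG0_FLAGS in userbuffers.h now reserves
// three times the per-peer flag space.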
void userbuffers_send(const int srchandler, const size_t srcoffset, const int dsthandler,
const size_t dstoffset, const size_t bytes, communicator *comm,
const int peer, cudaStream_t stream) {
int peerlocal = peer % comm->nvsize;
void *flagptr =
(comm->peer_ptr[0][peerlocal]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0);
bool intranode = INTRANODE(peer);
if (!intranode && (comm->launch_mode & NVTE_LAUNCH_CPU)) {
comm->fifo[comm->head].optype = userbuffers_sendop;
comm->fifo[comm->head].basecounter = comm->basecounter[userbuffers_sendop];
comm->fifo[comm->head].handler = srchandler;
comm->fifo[comm->head].offset = srcoffset;
comm->fifo[comm->head].handler2 = dsthandler;
comm->fifo[comm->head].offset2 = dstoffset;
comm->fifo[comm->head].elements = bytes;
comm->fifo[comm->head].peer = peer;
int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1);
while (newhead == comm->tail) {
}
comm->head = newhead;
comm->basecounter[userbuffers_sendop] += 1;
}
if (!intranode && (comm->launch_mode & NVTE_LAUNCH_GPU)) {
kuserbuffers_proxysend<<<1, 1, 0, stream>>>(&(comm->flags[NVTE_GF_STATE + userbuffers_sendop]),
comm->hostflags + userbuffers_sendop);
return;
}
int peerlocal = peer % comm->nvsize;
void *flagptr = GET_SEND_PTR_BY_INDEX(peerlocal, comm, dsthandler, 0);
void *ce_send_start_ptr = GET_SEND_PTR_BY_INDEX(peerlocal, comm, dsthandler, 1);
void *ce_send_end_ptr = GET_SEND_PTR_BY_INDEX(peerlocal, comm, dsthandler, 2);
bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0);
assert(INTRANODE(peer));
if (!(comm->launch_mode & NVTE_LAUNCH_GPU))
return;
if (comm->push == 0) {
kuserbuffers_pullsend<<<1, 1, 0, stream>>>(comm->myrank, peer, &(comm->send_id[peer]),
reinterpret_cast<int *>(flagptr));
} else {
void *srcptr = (comm->mem_ptr[srchandler]) + srcoffset;
void *dstptr = (comm->peer_ptr[dsthandler][peerlocal]) + dstoffset;
void *srcptr = reinterpret_cast<char *>(comm->mem_ptr[srchandler]) + srcoffset;
void *dstptr = reinterpret_cast<char *>(comm->peer_ptr[dsthandler][peerlocal]) + dstoffset;
if (comm->use_ce)
if (comm->use_ce) {
kuserbuffers_inc<<<1, 1, 0, stream>>>(reinterpret_cast<int *>(ce_send_start_ptr));
CUDACHECK(cudaMemcpyAsync(dstptr, srcptr, bytes, cudaMemcpyDeviceToDevice, stream));
kuserbuffers_inc<<<1, 1, 0, stream>>>(reinterpret_cast<int *>(ce_send_end_ptr));
}
SETUP_LAUNCH_CONFIG(signalonly ? 1 : comm->sms, signalonly ? 1 : 1024, stream);
int *arg1 = &comm->send_id[peer], *arg2 = reinterpret_cast<int *>(flagptr);
int4 *arg3 = reinterpret_cast<int4 *>(srcptr), *arg4 = reinterpret_cast<int4 *>(dstptr);
......@@ -3414,19 +2367,20 @@ void userbuffers_sendrecv(const int srchandler, const int dsthandler, const size
bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0);
int send_peerlocal = send_peer % comm->nvsize;
int recv_peerlocal = recv_peer % comm->nvsize;
void *flagptr_send =
(comm->peer_ptr[0][send_peerlocal]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
void *flagptr_recv =
(comm->mem_ptr[0]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + recv_peer * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
void *send_srcptr = (comm->mem_ptr[srchandler]) + send_offset;
void *send_dstptr = (comm->peer_ptr[dsthandler][send_peerlocal]) + send_offset;
if (comm->use_ce)
void *flagptr_send = GET_SEND_PTR_BY_INDEX(send_peerlocal, comm, dsthandler, 0);
void *ce_send_start_ptr = GET_SEND_PTR_BY_INDEX(send_peerlocal, comm, dsthandler, 1);
void *ce_send_end_ptr = GET_SEND_PTR_BY_INDEX(send_peerlocal, comm, dsthandler, 2);
void *flagptr_recv = GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsthandler, 0);
void *send_srcptr = reinterpret_cast<char *>(comm->mem_ptr[srchandler]) + send_offset;
void *send_dstptr = reinterpret_cast<char *>(comm->peer_ptr[dsthandler][send_peerlocal])
+ send_offset;
if (comm->use_ce) {
kuserbuffers_inc<<<1, 1, 0, stream>>>(reinterpret_cast<int *>(ce_send_start_ptr));
CUDACHECK(cudaMemcpyAsync(send_dstptr, send_srcptr, bytes, cudaMemcpyDeviceToDevice, stream));
kuserbuffers_inc<<<1, 1, 0, stream>>>(reinterpret_cast<int *>(ce_send_end_ptr));
}
SETUP_LAUNCH_CONFIG(signalonly ? 1 : comm->sms, signalonly ? 1 : 1024, stream);
int *arg1 = &comm->send_id[send_peer];
......@@ -3434,19 +2388,30 @@ void userbuffers_sendrecv(const int srchandler, const int dsthandler, const size
int4 *arg3 = reinterpret_cast<int4 *>(send_srcptr);
int4 *arg4 = reinterpret_cast<int4 *>(send_dstptr);
int arg5 = signalonly ? 0 : bytes / 16;
int arg6 = comm->myrank;
int arg6 = send_peer;
int arg7 = recv_peer;
int *arg8 = &comm->recv_id[recv_peer * NVTE_MAX_REGIONS + dsthandler];
int *arg9 = reinterpret_cast<int *>(flagptr_recv);
int arg10 = signalonly ? 1 : comm->sms;
uint64_t arg11 = comm->ub_timeout;
int arg12 = send_peerlocal;
int arg13 = recv_peerlocal;
int *arg14 = reinterpret_cast<int *>(comm->use_ce ?
GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsthandler, 1):
nullptr);
int *arg15 = reinterpret_cast<int *>(comm->use_ce ?
GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsthandler, 2):
nullptr);
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2),
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4),
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6),
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8),
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10)};
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10),
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12),
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14),
reinterpret_cast<void *>(&arg15)};
CUDACHECK(
cudaLaunchKernelExC(&cfg, reinterpret_cast<void *>(kuserbuffers_pushsendrecv), kernelArgs));
//}
}
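// The CE hang detector brackets each copy-engine transfer with two stream-
// ordered counters: ce_start is bumped before cudaMemcpyAsync and ce_end
// after it retires, so a receiver that times out with *ce_start != *ce_end
// knows the copy engine is stuck mid-transfer. The bracketing in isolation
// (a sketch with hypothetical buffers):
static void ce_bracketed_copy_sketch(void *dst, const void *src, size_t bytes,
                                     int *ce_start, int *ce_end, cudaStream_t s) {
  kuserbuffers_inc<<<1, 1, 0, s>>>(ce_start);  // mark: copy issued
  CUDACHECK(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, s));
  kuserbuffers_inc<<<1, 1, 0, s>>>(ce_end);    // mark: copy retired
}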
void userbuffers_sendrecv_atomic(const int srchandler, const int dsthandler,
......@@ -3458,19 +2423,18 @@ void userbuffers_sendrecv_atomic(const int srchandler, const int dsthandler,
int send_peerlocal = send_peer % comm->nvsize;
int recv_peerlocal = recv_peer % comm->nvsize;
void *flagptr_send =
(comm->peer_ptr[0][send_peerlocal]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
void *flagptr_recv =
(comm->mem_ptr[0]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + recv_peer * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
void *send_srcptr = (comm->mem_ptr[srchandler]) + send_offset;
void *send_dstptr = (comm->peer_ptr[dsthandler][send_peerlocal]) + send_offset;
void *flagptr_send = GET_SEND_PTR_BY_INDEX(send_peerlocal, comm, dsthandler, 0);
void *ce_send_start_ptr = GET_SEND_PTR_BY_INDEX(send_peerlocal, comm, dsthandler, 1);
void *ce_send_end_ptr = GET_SEND_PTR_BY_INDEX(send_peerlocal, comm, dsthandler, 2);
void *flagptr_recv = GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsthandler, 0);
void *send_srcptr = reinterpret_cast<char *>(comm->mem_ptr[srchandler]) + send_offset;
void *send_dstptr = reinterpret_cast<char *>(comm->peer_ptr[dsthandler][send_peerlocal])
+ send_offset;
if (comm->use_ce) {
kuserbuffers_inc<<<1, 1, 0, stream>>>(reinterpret_cast<int *>(ce_send_start_ptr));
CUDACHECK(cudaMemcpyAsync(send_dstptr, send_srcptr, bytes, cudaMemcpyDeviceToDevice, stream));
kuserbuffers_inc<<<1, 1, 0, stream>>>(reinterpret_cast<int *>(ce_send_end_ptr));
}
SETUP_LAUNCH_CONFIG(signalonly ? 1 : comm->sms, signalonly ? 1 : 1024, stream);
......@@ -3479,18 +2443,29 @@ void userbuffers_sendrecv_atomic(const int srchandler, const int dsthandler,
int4 *arg3 = reinterpret_cast<int4 *>(send_srcptr);
int4 *arg4 = reinterpret_cast<int4 *>(send_dstptr);
int arg5 = signalonly ? 0 : bytes / 16;
int arg6 = comm->myrank;
int arg6 = send_peer;
int arg7 = recv_peer;
int *arg8 = &comm->recv_id[recv_peer * NVTE_MAX_REGIONS + dsthandler];
int *arg9 = reinterpret_cast<int *>(flagptr_recv);
int arg10 = signalonly ? 1 : comm->sms;
void *arg11 = counters;
uint64_t arg12 = comm->ub_timeout;
int arg13 = send_peerlocal;
int arg14 = recv_peerlocal;
int *arg15 = reinterpret_cast<int *>(comm->use_ce ?
GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsthandler, 1) :
nullptr);
int *arg16 = reinterpret_cast<int *>(comm->use_ce ?
GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsthandler, 2) :
nullptr);
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2),
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4),
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6),
reinterpret_cast<void *>(&arg7), reinterpret_cast<void *>(&arg8),
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10),
reinterpret_cast<void *>(&arg11)};
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12),
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14),
reinterpret_cast<void *>(&arg15), reinterpret_cast<void *>(&arg16)};
CUDACHECK(cudaLaunchKernelExC(&cfg, reinterpret_cast<void *>(kuserbuffers_pushsendrecv_atomic),
kernelArgs));
}
......@@ -3501,17 +2476,12 @@ void userbuffers_sendrecv_multiatomic(const int srchandler, const int dsthandler
const int recv_peer, const int nchunks, void *counters,
bool shuffle, cudaStream_t stream) {
assert(comm->push && comm->use_ce == 0);
// CE is not supported
int send_peerlocal = send_peer % comm->nvsize;
int recv_peerlocal = recv_peer % comm->nvsize;
void *flagptr_send =
(comm->peer_ptr[0][send_peerlocal]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + comm->myrank * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
void *flagptr_recv =
(comm->mem_ptr[0]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + recv_peer * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
void *flagptr_send = GET_SEND_PTR_BY_INDEX(send_peerlocal, comm, dsthandler, 0);
void *flagptr_recv = GET_RECV_PTR_BY_INDEX(recv_peer, comm, dsthandler, 0);
SETUP_LAUNCH_CONFIG(comm->sms, 1024, stream);
......@@ -3530,6 +2500,9 @@ void userbuffers_sendrecv_multiatomic(const int srchandler, const int dsthandler
int arg13 = send_stride;
int arg14 = recv_stride;
bool arg15 = shuffle;
uint64_t arg16 = comm->ub_timeout;
int arg17 = send_peerlocal;
int arg18 = recv_peerlocal;
void *kernelArgs[] = {reinterpret_cast<void *>(&arg1), reinterpret_cast<void *>(&arg2),
reinterpret_cast<void *>(&arg3), reinterpret_cast<void *>(&arg4),
reinterpret_cast<void *>(&arg5), reinterpret_cast<void *>(&arg6),
......@@ -3537,95 +2510,33 @@ void userbuffers_sendrecv_multiatomic(const int srchandler, const int dsthandler
reinterpret_cast<void *>(&arg9), reinterpret_cast<void *>(&arg10),
reinterpret_cast<void *>(&arg11), reinterpret_cast<void *>(&arg12),
reinterpret_cast<void *>(&arg13), reinterpret_cast<void *>(&arg14),
reinterpret_cast<void *>(&arg15)};
reinterpret_cast<void *>(&arg15), reinterpret_cast<void *>(&arg16),
reinterpret_cast<void *>(&arg17), reinterpret_cast<void *>(&arg18)};
CUDACHECK(cudaLaunchKernelExC(
&cfg, reinterpret_cast<void *>(kuserbuffers_pushsendrecv_multiatomic), kernelArgs));
}
__global__ void __launch_bounds__(MAX_THREADS)
kuserbuffers_alltoall(void **baseflagptrs, int flagoffset, int4 *basesrcptr, void **dstptrs,
size_t dstoffset, const int lines, const int myrank) {
if (blockIdx.x == myrank)
return;
int4 *dstptr = reinterpret_cast<int4 *>(dstptrs[blockIdx.x] + dstoffset);
int *flagptr = reinterpret_cast<int *>(baseflagptrs[blockIdx.x] + flagoffset);
const size_t myblockoffset = blockIdx.x * lines;
int4 *srcptr = basesrcptr + myblockoffset;
dstptr += myblockoffset;
if (lines) {
const int start_elem = threadIdx.x;
const int end_elem = lines;
const int aligned_elem = ((end_elem - start_elem) & (~(blockDim.x * UNROLLCOPY - 1)));
const int end_aligned = start_elem + aligned_elem;
if (end_elem > start_elem) {
for (int line = start_elem; line < end_aligned; line += blockDim.x * UNROLLCOPY) {
int4 val[UNROLLCOPY];
#pragma unroll
for (int i = 0; i < UNROLLCOPY; i++)
val[i] = srcptr[line + i * blockDim.x];
#pragma unroll
for (int i = 0; i < UNROLLCOPY; i++)
dstptr[line + i * blockDim.x] = val[i];
}
for (int line = end_aligned; line < end_elem; line += blockDim.x)
dstptr[line] = srcptr[line];
}
__syncthreads();
if (threadIdx.x)
return;
__threadfence_system();
atomicAdd(flagptr, 1);
} else {
atomicAdd(flagptr, 1);
}
}
void userbuffers_alltoall_send(const int srchandler, const size_t srcoffset, const int dsthandler,
const size_t dstoffset, const size_t bytes, communicator *comm,
cudaStream_t stream) {
if (comm->launch_mode & NVTE_LAUNCH_CPU) {
comm->fifo[comm->head].optype = userbuffers_alltoall;
comm->fifo[comm->head].basecounter = comm->basecounter[userbuffers_alltoall];
comm->fifo[comm->head].handler = srchandler;
comm->fifo[comm->head].offset = srcoffset;
comm->fifo[comm->head].handler2 = dsthandler;
comm->fifo[comm->head].offset2 = dstoffset;
comm->fifo[comm->head].elements = bytes;
int newhead = (comm->head + 1) & (NVTE_MAX_REQUESTS - 1);
while (newhead == comm->tail) {
}
comm->head = newhead;
comm->basecounter[userbuffers_alltoall] += 1;
}
if (comm->launch_mode & NVTE_LAUNCH_GPU)
kuserbuffers_proxysend<<<1, 1, 0, stream>>>(
&(comm->flags[NVTE_GF_STATE + userbuffers_alltoall]),
comm->hostflags + userbuffers_alltoall);
&cfg, reinterpret_cast<void *>(kuserbuffers_pushsendrecv_multiatomic), kernelArgs));
}
void userbuffers_recv(const int srchandler, const size_t srcoffset, const int dsthandler,
const size_t dstoffset, const size_t bytes, communicator *comm,
const int peer, cudaStream_t stream) {
int peerlocal = peer % comm->nvsize;
void *flagptr =
(comm->mem_ptr[0]) +
((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV + peer * NVTE_MAX_REGIONS + dsthandler) *
sizeof(int));
int peerlocal = peer % comm->nvsize;
void *flagptr = GET_RECV_PTR_BY_INDEX(peer, comm, dsthandler, 0);
bool signalonly = (bytes / 16 == 0) || (comm->use_ce != 0);
bool intranode = INTRANODE(peer);
assert(INTRANODE(peer));
if (!(comm->launch_mode & NVTE_LAUNCH_GPU))
return;
if (comm->push == 0 && intranode) {
void *dstptr = (comm->mem_ptr[dsthandler]) + dstoffset;
void *srcptr = (comm->peer_ptr[srchandler][peerlocal]) + srcoffset;
if (comm->push == 0) {
void *dstptr = reinterpret_cast<char *>(comm->mem_ptr[dsthandler]) + dstoffset;
void *srcptr = reinterpret_cast<char *>(comm->peer_ptr[srchandler][peerlocal]) + srcoffset;
kuserbuffers_pullrecv<<<signalonly ? 1 : comm->sms, signalonly ? 1 : 1024, 0, stream>>>(
comm->myrank, peer, &(comm->recv_id[peer * NVTE_MAX_REGIONS + dsthandler]),
comm->myrank, peer, comm->nvrank,
peerlocal, &(comm->recv_id[peer * NVTE_MAX_REGIONS + dsthandler]),
reinterpret_cast<int *>(flagptr), reinterpret_cast<int4 *>(srcptr),
reinterpret_cast<int4 *>(dstptr), signalonly ? 0 : bytes / 16);
reinterpret_cast<int4 *>(dstptr), signalonly ? 0 : bytes / 16,
comm->ub_timeout);
if (!signalonly)
kuserbuffers_inc<<<1, 1, 0, stream>>>(&(comm->recv_id[peer * NVTE_MAX_REGIONS + dsthandler]));
if (comm->use_ce) {
......@@ -3633,22 +2544,17 @@ void userbuffers_recv(const int srchandler, const size_t srcoffset, const int ds
}
} else {
kuserbuffers_pushrecv<<<1, 1, 0, stream>>>(
        comm->myrank, peer, comm->nvrank, peerlocal,
        &comm->recv_id[peer * NVTE_MAX_REGIONS + dsthandler],
        reinterpret_cast<int *>(flagptr), signalonly ? 1 : comm->sms,
        comm->ub_timeout,
        reinterpret_cast<int *>(comm->use_ce ?
            GET_RECV_PTR_BY_INDEX(peer, comm, dsthandler, 1) : nullptr),
        reinterpret_cast<int *>(comm->use_ce ?
            GET_RECV_PTR_BY_INDEX(peer, comm, dsthandler, 2) : nullptr));
}
}
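Both launches above now receive comm->ub_timeout, a budget expressed in GPU clock cycles (seconds times the device clock rate). Inside a kernel, such a budget is typically consumed by checking clock64() while polling a flag, so a wedged peer produces a diagnostic instead of an unbounded hang. A minimal sketch under those assumptions; wait_with_timeout is an illustrative name, not the patch's actual kernel code:

#include <cstdint>

// Poll *flag until it reaches `expected`, giving up after `timeout` GPU cycles.
__device__ static inline bool wait_with_timeout(volatile int *flag, const int expected,
                                                const uint64_t timeout) {
  const long long start = clock64();
  while (*flag < expected) {
    if (static_cast<uint64_t>(clock64() - start) > timeout) {
      printf("UB: flag wait timed out (flag=%d, expected=%d)\n", *flag, expected);
      return false;  // report the hang instead of spinning forever
    }
  }
  return true;
}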
void userbuffers_alltoall_recv(communicator *comm, cudaStream_t stream) {
  void *flagptr =
      reinterpret_cast<char *>(comm->mem_ptr[0]) +
      ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_OPFLAGS * userbuffers_alltoall) * sizeof(int));
if (!(comm->launch_mode & NVTE_LAUNCH_GPU))
return;
  kuserbuffers_pushrecv<<<1, 1, 0, stream>>>(
      comm->myrank, -1, reinterpret_cast<int *>(reinterpret_cast<char *>(flagptr) + 4),
      reinterpret_cast<int *>(flagptr), comm->nranks - 1);
}
// producer
static __global__ void producer_kernel(void *atomic_ptr, int chunk_i) {
  // Decrement the atomic value to signal that the current output tile is finished
......@@ -3666,8 +2572,7 @@ static __global__ void producer_kernel(void *atomic_ptr, int chunk_i) {
static __global__ void consumer_kernel(void *atomic_ptr, int chunk_i) {
  // Wait for the producer to set the value to 0, signaling that the producer is ready
if (blockIdx.x == 0 && threadIdx.x == 0) {
    while (0 != (atomicCAS((unsigned int *)atomic_ptr + chunk_i, 0, 0))) {
}
((unsigned int *)atomic_ptr)[chunk_i] = 1;
asm volatile("fence.sc.gpu;\n");
......@@ -3678,9 +2583,8 @@ static __global__ void consumer_kernel(void *atomic_ptr, int chunk_i) {
static __global__ void consumer_batch_kernel(void *atomic_ptr, int first_chunk_i, int num_chunks) {
  // Wait for the producer to set the value to 0, signaling that the producer is ready
if (blockIdx.x == 0 && threadIdx.x == 0) {
    for (int i = first_chunk_i; i < num_chunks; i++) {
      while (0 != (atomicCAS((unsigned int *)atomic_ptr + i, 0, 0))) {
}
((unsigned int *)atomic_ptr)[i] = 1;
asm volatile("fence.sc.gpu;\n");
......
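The atomicCAS(ptr, 0, 0) pattern in the consumer loops above is the usual device idiom for an atomic read: the compare-and-swap writes only when the current value equals 0, and in that case it writes the same 0 back, so the word is never modified yet its current value is returned with atomic semantics. A small sketch of the idiom, with illustrative helper names:

__device__ static inline unsigned int atomic_read(unsigned int *p) {
  return atomicCAS(p, 0u, 0u);  // never changes *p; returns its current value
}

__device__ static inline void spin_until_zero(unsigned int *p) {
  while (atomic_read(p) != 0) {}      // coherent polling across SMs/GPUs
  asm volatile("fence.sc.gpu;\n");    // order later accesses after the observed flag
}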
......@@ -12,7 +12,6 @@
#include "cuda_runtime.h"
#include <pthread.h>
#include <chrono>
#include "gdrapi.h"
#include <stdexcept>
#define NVTE_MAX_REGIONS 16
......@@ -32,10 +31,6 @@
#define NVTE_UB_MEM_MC_CREATED 2
#define NVTE_UB_MEM_ALLOCATED 4
#ifdef UCP
#include <ucp/api/ucp.h>
#endif
// region 0 flag offsets
#define NVTE_REG0_OPFLAGS 1024
#define NVTE_REG0_RECV (NVTE_REG0_OPFLAGS * userbuffers_op_types)
......@@ -43,7 +38,8 @@
#define NVTE_REG0_OFFSET(comm) ((2 * NVTE_MAX_REGIONS) * NVTE_MAX_NVLINK \
+ NVTE_REG0_SINGLENODE * 2 + NVTE_MAX_PEERS)
#define NVTE_REG0_COMMBUFFER 0
// x3 for [flagptr, ce_start_ptr, ce_end_ptr]
#define NVTE_REG0_FLAGS (NVTE_REG0_RECV + NVTE_MAX_PEERS * NVTE_MAX_REGIONS * 3)
#define NVTE_REG0_IBRS 32
#define NVTE_REG0_IBAG 512
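Per the new comment, the receive-flag region now holds three words per (peer, region) pair: index 0 for the main flag and indices 1 and 2 for the copy-engine start/end flags, consumed in userbuffers_recv above via GET_RECV_PTR_BY_INDEX(..., 0|1|2). The macro body itself lies outside this hunk; one plausible plane-major reconstruction, hypothetical and shown only to make the indexing concrete:

// Hypothetical reconstruction; the real macro is not part of this diff hunk.
// Plane-major layout so that plane 0 matches the old single-flag addressing.
#define GET_RECV_PTR_BY_INDEX(peer, comm, handler, flagidx)                    \
  (reinterpret_cast<char *>((comm)->mem_ptr[0]) +                              \
   ((NVTE_REG0_OFFSET(comm) + NVTE_REG0_RECV +                                 \
     (flagidx) * NVTE_MAX_PEERS * NVTE_MAX_REGIONS +                           \
     (peer) * NVTE_MAX_REGIONS + (handler)) * sizeof(int)))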
......@@ -122,16 +118,11 @@ struct communicator {
// max value for running block counters in hostflags
int basecounter[userbuffers_op_types]; // NOLINT(*)
int *hostflags;
int *flags, *map_flags;
gdr_t g;
struct sharp_coll_context *sharp_coll_context;
struct sharp_coll_comm *sharp_coll_comm;
void *mem_mr[NVTE_MAX_REGIONS];
ub_request *fifo;
volatile int activeproxy;
int nblocks, alignblock, minblock, asyncblocks, active_nreqs;
ub_request active_req[userbuffers_op_types]; // NOLINT(*)
int padding[7];
......@@ -142,10 +133,9 @@ struct communicator {
MPI_Request mpihndl[NVTE_MAX_SHARP];
MPI_Comm comm_inter, // reduction group communicator (subset of the nodes) along GPU rail
comm_intra; // full intranode (all ndev GPUS)
  int ibnvsize;  // can be used to fake a smaller or larger nvlink domain, to use ib instead
                 // of nvlink or to force MNNVL
int *send_id, *recv_id;
int mydev;
uint64_t ub_timeout;
};
typedef struct communicator communicator;
......@@ -185,23 +175,9 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
SHARP and NSO/MNNVL)
*/
void allreduce_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream = 0);
// for the DP distributed optimizer; only non-SHARP multinode is implemented, and calls must
// come in ordered pairs
void allgather_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream = 0);
void reducescatter_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream = 0);
void allreduce2_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream = 0);
// for TP-parallelism, only single node is implemented
void allgather2_userbuff_inplace(const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream = 0);
void allgather2_userbuff_inplace_sliced(const int handler, const int offset, const int elements,
communicator *comm, const int slice_id, const int nslices,
cudaStream_t stream = 0);
/*
each rank's input is
allgather2_userbuff_inplace: offset+myrank*elements
......@@ -231,14 +207,6 @@ void reducescatter2_userbuff_stridedoutput_fp8(void* output, float* scale, const
template<typename fp8type>
void reducescatter2_userbuff_fp8(void* output, float* scale, const int handler, const int offset,
const int elements, communicator* comm, cudaStream_t stream = 0);
#if 0
template<typename fp8type>
void reducescatter2_userbuff_strided_atomic_fp8(void* output, float *scale, const int handler,
const int offset, const int rowelements,
const int colelements, const int strideelements,
const int numchunks, void *counters,
communicator* comm, cudaStream_t stream = 0);
#endif
template<typename fp8type>
void reducescatter2_userbuff_strided_atomic_fp8(void* output, float *scale, const int handler,
const int offset, const int rowelements,
......
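For context, a hypothetical call site for the fp8 reduce-scatter declared earlier in this hunk (reducescatter2_userbuff_fp8); the instantiation type __nv_fp8_e4m3 from <cuda_fp8.h> and the variable names are assumptions, not taken from the patch:

// Reduce-scatter `elements` fp8 values from a registered user buffer into `out`.
reducescatter2_userbuff_fp8<__nv_fp8_e4m3>(out, &scale, handler, offset,
                                           elements, comm, stream);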