Commit d9d23f34 authored by lishen

Initial Code for SCCL_v1

parent 57df3737
#pragma once
#include <string.h>
#include "base.h"
#include "ipcsocket.h"
#include "proxy.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
typedef net::host::scclSocketAddress scclSocketAddress_t;
typedef net::host::scclSocket scclSocket_t;
// scclBootstrapHandle stores the handle used to bootstrap a communicator.
struct scclBootstrapHandle {
uint64_t magic; // magic number used to validate the structure
scclSocketAddress_t addr; // network address used for bootstrap communication
};
struct scclProxyState {
int refCount; // reference count
int tpRank; // rank in the top-parent communicator
int tpnRanks; // total number of ranks in the top-parent communicator
int tpLocalnRanks; // number of local ranks in the top-parent communicator
int cudaDev; // CUDA device number
int p2pnChannels; // number of peer-to-peer channels
int p2pChunkSize; // chunk size for peer-to-peer transfers
int nChannels; // total number of channels
int buffSizes[SCCL_NUM_PROTOCOLS]; // buffer sizes for each protocol
// Service thread
pthread_t thread; // thread ID
scclSocket_t* listenSock; // listening socket
int stop; // stop flag
// Used by the main thread
scclSocketAddress_t* peerAddresses; // peer addresses
scclSocket_t* peerSocks; // peer sockets
struct scclIpcSocket peerIpcSock; // cuMem API support (UDS)
// Progress thread
struct scclProxyProgressState progressState; // progress state
// Queue of responses expected from the proxy
struct scclExpectedProxyResponse* expectedResponses; // expected proxy responses
};
// scclBootstrapComm stores the information needed for bootstrap communication.
struct scclBootstrapComm {
struct scclUniqueInfo unique_info; // base information for each communicating rank
void* bootstrap; // bootstrap state
uint64_t magic; // magic number used to validate the structure
volatile uint32_t* abortFlag; // abort flag
int splitShare; // whether resources are shared when the communicator is split
int* topParentRanks; // ranks within the top-parent communicator
/* shared resources related to the proxy */
struct scclProxyState* proxyState;
};
// extInfo stores the socket information exchanged during bootstrap.
struct extInfo {
int rank; // process rank
int nranks; // total number of processes
scclSocketAddress_t extAddressListenRoot; // listening address used by the root
scclSocketAddress_t extAddressListen; // listening address
};
struct unexConn {
int peer; // identifier of the peer rank
int tag; // tag used to distinguish connections
scclSocket_t sock; // socket used for communication
struct unexConn* next; // next unexpected connection, forming a linked list
};
// bootstrapState stores the bootstrap state of a communicator.
struct bootstrapState {
scclSocket_t listenSock; // listening socket
scclSocket_t ringRecvSocket; // ring receive socket
scclSocket_t ringSendSocket; // ring send socket
scclSocketAddress_t* peerCommAddresses; // peer communication addresses
scclSocketAddress_t* peerProxyAddresses; // peer proxy addresses
struct unexConn* unexpectedConnections; // unexpected connections
int cudaDev; // CUDA device number
int rank; // process rank
int nranks; // total number of processes
uint64_t magic; // magic number used to validate the structure
volatile uint32_t* abortFlag; // abort flag
};
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "ipcsocket.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET
#define SCCL_IPC_SOCKNAME_STR "/tmp/sccl-socket-%d-%lx"
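// With USE_ABSTRACT_SOCKET defined, the leading byte of sun_path is cleared before
// bind()/sendmsg(), placing the name in the Linux abstract socket namespace: no
// filesystem entry is created, and the name disappears automatically once the last
// descriptor bound to it is closed, so no unlink() is needed.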
/**
* @brief Initialize an IPC socket.
*
* Creates a UNIX domain datagram socket and binds it to a unique path. Both Linux
* abstract sockets and regular filesystem sockets are supported.
*
* @param handle Pointer to the scclIpcSocket structure that stores the socket state
* @param rank Process rank, used to build a unique socket name
* @param hash Hash value, combined with rank to build a unique socket name
* @param abortFlag Pointer to the abort flag; if non-NULL, the socket is set to non-blocking mode
* @return scclResult_t scclSuccess on success, or an error code on failure
*/
scclResult_t scclIpcSocketInit(scclIpcSocket* handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
int fd = -1;
struct sockaddr_un cliaddr;
char temp[SCCL_IPC_SOCKNAME_LEN] = "";
if(handle == NULL) {
return scclInternalError;
}
handle->fd = -1;
handle->socketName[0] = '\0';
if((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
WARN("UDS: Socket creation error : %d", errno);
return scclSystemError;
}
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
// Create unique name for the socket.
int len = snprintf(temp, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, rank, hash);
if(len > (int)(sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
close(fd); // avoid leaking the descriptor on the early-return path
return scclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
unlink(temp);
#endif
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Creating socket %s", temp);
strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
if(bind(fd, (struct sockaddr*)&cliaddr, sizeof(cliaddr)) < 0) {
WARN("UDS: Binding to socket %s failed : %d", temp, errno);
close(fd);
return scclSystemError;
}
handle->fd = fd;
strcpy(handle->socketName, temp);
handle->abortFlag = abortFlag;
// Mark socket as non-blocking
if(handle->abortFlag) {
int flags;
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
return scclSuccess;
}
/**
* Close an IPC socket and release its resources.
*
* @param handle Pointer to the scclIpcSocket structure holding the socket to close
* @return scclResult_t Operation result:
* - scclSuccess: operation completed successfully
* - scclInternalError: invalid handle (handle is NULL)
*
* @note If the USE_ABSTRACT_SOCKET macro is defined, the socket file is not unlinked.
* If the socket file descriptor is invalid (fd <= 0), the function returns success immediately.
*/
scclResult_t scclIpcSocketClose(scclIpcSocket* handle) {
if(handle == NULL) {
return scclInternalError;
}
if(handle->fd <= 0) {
return scclSuccess;
}
#ifndef USE_ABSTRACT_SOCKET
if(handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
#endif
close(handle->fd);
return scclSuccess;
}
/**
* Receive a file descriptor over the IPC socket.
*
* @param handle Pointer to the scclIpcSocket structure holding the socket state
* @param recvFd Pointer that receives the incoming file descriptor
* @return scclResult_t Operation result:
* - scclSuccess: file descriptor received successfully
* - scclSystemError: a system call failed
* - scclInternalError: the operation was aborted
*
* @note This function blocks until data arrives or an error occurs.
* @warning The caller must ensure recvFd points to valid memory.
*/
*/
scclResult_t scclIpcSocketRecvFd(scclIpcSocket* handle, int* recvFd) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
// Union to guarantee alignment requirements for control array
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr* cmptr;
char dummy_buffer[1];
int ret;
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
iov[0].iov_base = (void*)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
msg.msg_iov = iov;
msg.msg_iovlen = 1;
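// In non-blocking mode (abortFlag set), recvmsg() keeps returning -1 with EAGAIN or
// EWOULDBLOCK until a datagram arrives; retry on those (and EINTR) while honoring an
// abort request from the caller.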
while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Receiving data over socket failed : %d", errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
}
if(((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
return scclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return scclSystemError;
}
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
return scclSuccess;
}
/**
* Send a file descriptor over a UNIX domain socket.
*
* @param handle IPC socket handle
* @param sendFd File descriptor to send
* @param rank Rank of the destination, used to build the destination socket name
* @param hash Hash value used to build the destination socket name
*
* @return scclSuccess on success, otherwise an error code:
* - scclInternalError: internal error (e.g. name too long, or the operation was aborted)
* - scclSystemError: a system call failed
*
* @note The file descriptor is sent in a control message using the SCM_RIGHTS mechanism.
* On Linux, the abstract socket namespace is supported (when USE_ABSTRACT_SOCKET is defined).
*/
*/
scclResult_t scclIpcSocketSendFd(scclIpcSocket* handle, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg;
struct iovec iov[1];
char temp[SCCL_IPC_SOCKNAME_LEN];
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr* cmptr;
struct sockaddr_un cliaddr;
// Construct client address to send this shareable handle to
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
int len = snprintf(temp, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, rank, hash);
if(len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot connect to provided name for socket. Name too large");
return scclInternalError;
}
(void)strncpy(cliaddr.sun_path, temp, len);
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
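// Attach the descriptor as ancillary data: when the kernel sees an SCM_RIGHTS control
// message, it duplicates sendFd into the receiving process's descriptor table.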
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void*)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
iov[0].iov_base = (void*)"";
iov[0].iov_len = 1;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
ssize_t sendResult;
while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
}
return scclSuccess;
}
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <memory.h>
#include <sys/un.h>
#include <inttypes.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
#define SCCL_IPC_SOCKNAME_LEN 64
// IPC socket structure
struct scclIpcSocket {
int fd; // file descriptor
char socketName[SCCL_IPC_SOCKNAME_LEN]; // socket name
volatile uint32_t* abortFlag; // flag used to abort pending operations
};
// Initialize an IPC socket
scclResult_t scclIpcSocketInit(struct scclIpcSocket* handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
// Close an IPC socket
scclResult_t scclIpcSocketClose(struct scclIpcSocket* handle);
// Receive a file descriptor
scclResult_t scclIpcSocketRecvFd(struct scclIpcSocket* handle, int* fd);
// Send a file descriptor
scclResult_t scclIpcSocketSendFd(struct scclIpcSocket* handle, const int fd, int rank, uint64_t hash);
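// Usage sketch (illustrative only; fdToShare and the rank/hash values are placeholders):
// the receiver binds a socket under a (rank, hash) pair agreed on out of band, and the
// sender addresses that same pair, so the descriptor travels via SCM_RIGHTS.
//
// // Receiver side (rank 0):
// struct scclIpcSocket in = {0};
// scclIpcSocketInit(&in, /*rank=*/0, /*hash=*/0x1234, /*abortFlag=*/NULL);
// int fd = -1;
// scclIpcSocketRecvFd(&in, &fd); // blocks until the peer sends a descriptor
// scclIpcSocketClose(&in);
//
// // Sender side: bind a socket of its own, then target the receiver's (rank, hash).
// struct scclIpcSocket out = {0};
// scclIpcSocketInit(&out, /*rank=*/1, /*hash=*/0x5678, NULL);
// scclIpcSocketSendFd(&out, fdToShare, /*rank=*/0, /*hash=*/0x1234);
// scclIpcSocketClose(&out);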
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <sys/syscall.h>
#include <assert.h>
#include "proxy.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {}
} // namespace topology
} // namespace hardware
} // namespace sccl
// static bool NeedProxy(int type, int pattern, int root, struct scclRing* ring, int nranks) {
// if(pattern == scclPatternRing || pattern == scclPatternRingTwice)
// return true;
// /* In chains, one rank does not need a proxy. Let's figure out which one it is */
// /* Which index in the reorganized rings should we compare root against */
// const int myrank = 0, nextrank = 1, prevrank = nranks - 1;
// int index = pattern == scclPatternPipelineFrom ?
// /* no recv / no send if root = */
// /* bcast */ (type == proxyRecv ? myrank : nextrank)
// :
// /* reduce */ (type == proxyRecv ? prevrank : myrank);
// int rank = ring->userRanks[index];
// return (root != rank);
// }
// #define PROXYARGS_ALLOCATE_SIZE SCCL_MAX_OPS
// struct scclProxyPool {
// struct scclProxyPool* next;
// struct scclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
// };
// static void expectedProxyResponseFree(struct scclProxyState* state) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// while(elem) {
// prev = elem;
// elem = elem->next;
// free(prev->respBuff);
// free(prev);
// }
// }
// static scclResult_t expectedProxyResponseStore(struct scclProxyState* state, void* opId, void* respBuff, int respSize) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// while(elem) {
// if(elem->opId == opId) {
// if(respSize != elem->respSize) {
// WARN("Mismatched response size for opId=%p", opId);
// return scclInternalError;
// }
// if(elem->done) {
// WARN("Storing response for already completed opId=%p", opId);
// return scclInternalError;
// }
// memcpy(elem->respBuff, respBuff, respSize);
// free(respBuff);
// elem->done = true;
// return scclSuccess;
// }
// elem = elem->next;
// }
// WARN("Proxy response for opId=%p doesn't match any expected response", opId);
// return scclInternalError;
// }
// static scclResult_t expectedProxyResponseEnqueue(struct scclProxyState* state, void* opId, int respSize) {
// struct scclExpectedProxyResponse* ex;
// scclCHECK(scclCalloc(&ex, 1));
// ex->opId = opId;
// // Pre-alloc response buffer
// ex->respBuff = malloc(respSize);
// ex->respSize = respSize;
// ex->done = false;
// // Enqueue
// struct scclExpectedProxyResponse* list = state->expectedResponses;
// if(list == NULL) {
// state->expectedResponses = ex;
// return scclSuccess;
// }
// while(list->next)
// list = list->next;
// list->next = ex;
// return scclSuccess;
// }
// static scclResult_t expectedProxyResponseDequeue(struct scclProxyState* state, void* opId, void* respBuff, int* found) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// *found = 0;
// while(elem) {
// if((elem->opId == opId) && elem->done) {
// if(prev == NULL) {
// state->expectedResponses = elem->next;
// } else {
// prev->next = elem->next;
// }
// memcpy(respBuff, elem->respBuff, elem->respSize);
// free(elem->respBuff);
// free(elem);
// *found = 1;
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// return scclSuccess;
// }
// static scclResult_t expectedProxyResponseRemove(struct scclProxyState* state, void* opId) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// while(elem) {
// if(elem->opId == opId) {
// if(prev == NULL) {
// state->expectedResponses = elem->next;
// } else {
// prev->next = elem->next;
// }
// free(elem->respBuff);
// free(elem);
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// WARN("Couldn't find opId=%p", opId);
// return scclInternalError;
// }
// static scclResult_t asyncProxyOpEnqueue(struct scclProxyLocalPeer* peer, scclProxyAsyncOp* op) {
// scclProxyAsyncOp* list = peer->asyncOps;
// if(list == NULL) {
// peer->asyncOps = op;
// return scclSuccess;
// }
// while(list->next)
// list = list->next;
// list->next = op;
// return scclSuccess;
// }
// static scclResult_t asyncProxyOpDequeue(struct scclProxyLocalPeer* peer, scclProxyAsyncOp* op) {
// struct scclProxyAsyncOp* elem = peer->asyncOps;
// struct scclProxyAsyncOp* prev = NULL;
// while(elem) {
// if(elem->opId == op->opId) {
// if(prev == NULL) {
// peer->asyncOps = elem->next;
// } else {
// prev->next = elem->next;
// }
// if(elem->reqBuff) {
// free(elem->reqBuff);
// }
// if(elem->respBuff) {
// free(elem->respBuff);
// }
// free(elem);
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// if(op) {
// WARN("Attempting to dequeue nonexistent async opId=%p", op->opId);
// } else {
// WARN("Attempting to dequeue null operation");
// }
// return scclInternalError;
// }
// static scclResult_t allocateArgs(struct scclProxyProgressState* state, struct scclProxyArgs** argsptr) {
// struct scclProxyArgs* elem;
// if(state->pool == NULL) {
// // Allocate a new pool of elements. Make sure we allocate the memory close
// // to the network thread
// struct scclProxyPool* newPool;
// scclCHECK(scclCalloc(&newPool, 1));
// struct scclProxyArgs* newElems = newPool->elems;
// // Chain newly allocated elements
// for(int i = 0; i < PROXYARGS_ALLOCATE_SIZE; i++) {
// if(i + 1 < PROXYARGS_ALLOCATE_SIZE)
// newElems[i].next = newElems + i + 1;
// }
// // Add them all to the pool list
// state->pool = newElems;
// // Save the pool memory block for later resource release
// newPool->next = state->pools;
// state->pools = newPool;
// }
// elem = state->pool;
// state->pool = state->pool->next;
// elem->next = elem->nextPeer = NULL;
// *argsptr = elem;
// return scclSuccess;
// }
// // #define DEBUG_PROXY 1
// #ifdef DEBUG_PROXY
// #define DEBUG_PROXY_PRINT printf
// #else
// #define DEBUG_PROXY_PRINT(...)
// #endif
// #define OP_INDEX(op) ((op) ? (op) - state->pools->elems : -1)
// #define OP_SEEN 0x100000
// scclResult_t getOpIndex(struct scclProxyArgs* op, struct scclProxyProgressState* state, int* poolIndex, int* opIndex) {
// struct scclProxyPool* pool = state->pools;
// int p = 0;
// while(pool) {
// uint64_t o = op - pool->elems;
// if(o < PROXYARGS_ALLOCATE_SIZE) {
// *opIndex = o;
// *poolIndex = p;
// return scclSuccess;
// }
// pool = pool->next;
// p++;
// }
// WARN("Could not find pool of op %p", op);
// return scclInternalError;
// }
// scclResult_t printProxyOp(struct scclProxyArgs* op, int poolIndex, int opIndex) {
// printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == scclPatternSend ? "Send" : op->pattern == scclPatternRecv ? "Recv" : "Coll");
// for(int s = 0; s < op->nsubs; s++) {
// struct scclProxySubArgs* sub = op->subs + s;
// if(op->state == scclProxyOpProgress) {
// char status = ' ';
// if(op->pattern == scclPatternRecv) {
// if(sub->posted < sub->nsteps && sub->posted < sub->done + SCCL_STEPS)
// status = 'I'; // Init
// else if(sub->received < sub->posted)
// status = 'R'; // Receiving
// else if(sub->received < sub->transmitted)
// status = 'R'; // Receiving
// else if(sub->transmitted < sub->received)
// status = 'F'; // Flushing
// else if(sub->done < sub->transmitted)
// status = 'G'; // Waiting on GPU
// else
// status = 'D'; // Done
// } else if(op->pattern == scclPatternSend) {
// if(sub->posted < sub->nsteps && sub->posted < sub->done + SCCL_STEPS)
// status = 'I'; // Init
// else if(sub->transmitted < sub->posted)
// status = 'G'; // Waiting on GPU
// else if(sub->done < sub->transmitted)
// status = 'S'; // Sending
// else
// status = 'D'; // Done
// }
// printf(" %d%c/%d", sub->peer, status, sub->channelId);
// } else {
// printf(" %d/%d", sub->peer, sub->channelId);
// }
// }
// printf("]");
// return scclSuccess;
// }
// scclResult_t dumpProxyState(struct scclProxyProgressState* state) {
// struct scclProxyArgs* op = state->active;
// int poolIndex, opIndex;
// printf("ACTIVE OPS\n");
// while(op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if(op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// scclCHECK(printProxyOp(op, poolIndex, opIndex));
// op->state |= OP_SEEN;
// printf("\n");
// struct scclProxyArgs* nextOp = op->nextPeer;
// while(nextOp) {
// scclCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
// if(nextOp->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// printf("| `-> ");
// scclCHECK(printProxyOp(nextOp, poolIndex, opIndex));
// nextOp->state |= OP_SEEN;
// printf("\n");
// if(nextOp->next) {
// WARN("Inactive op has next set!");
// }
// nextOp = nextOp->nextPeer;
// }
// if(op->nextPeer == NULL)
// printf("|\n");
// op = op->next;
// printf("v\n");
// }
// printf("[X]\n");
// #if 0
// printf("FREE OPS\n");
// op = state->pool;
// while (op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if (op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// scclCHECK(printProxyOp(op, poolIndex, opIndex));
// op->state |= OP_SEEN;
// printf("->");
// op = op->next;
// }
// printf("[X]\n");
// #else
// op = state->pool;
// while(op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if(op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// op->state |= OP_SEEN;
// op = op->next;
// }
// #endif
// struct scclProxyPool* pool = state->pools;
// poolIndex = 0;
// while(pool) {
// struct scclProxyArgs* elem = pool->elems;
// for(int e = 0; e < PROXYARGS_ALLOCATE_SIZE; e++, elem++) {
// if((elem->state & OP_SEEN) == 0) {
// printf("Elem %d-%d is not in any list:\n", poolIndex, e);
// scclCHECK(printProxyOp(elem, poolIndex, e));
// printf("\n");
// } else {
// elem->state -= OP_SEEN;
// }
// }
// pool = pool->next;
// poolIndex++;
// }
// return scclSuccess;
// }
// static scclResult_t scclProxyOpToArgs(struct scclProxyOp* op, struct scclProxyArgs* args, int subIndex) {
// struct scclProxySubArgs* sub = args->subs + subIndex;
// if(subIndex >= SCCL_PROXY_MAX_SUBS) {
// WARN("Proxy append out of bounds");
// return scclInternalError;
// }
// // memset(sub, 0, sizeof(struct scclProxySubArgs));
// sub->connection = op->connection;
// sub->channelId = op->channelId;
// sub->nsteps = op->nsteps;
// sub->nbytes = op->nbytes;
// sub->peer = op->root;
// args->nsubs = subIndex + 1;
// if(subIndex) {
// if((args->sliceSteps != op->sliceSteps) || (args->chunkSteps != op->chunkSteps) || (args->protocol != op->protocol) || (args->dtype != op->dtype) ||
// (args->redOp != op->redOp)) {
// WARN("Proxy append mismatch");
// return scclInternalError;
// }
// if(args->state != scclProxyOpReady) {
// WARN("Proxy append on running operation");
// return scclInternalError;
// }
// return scclSuccess;
// }
// // memset(&args->progress, 0, sizeof(struct scclProxyArgs)-offsetof(struct scclProxyArgs, progress));
// args->done = 0;
// args->opCount = op->opCount;
// args->sliceSteps = op->sliceSteps;
// args->chunkSteps = op->chunkSteps;
// args->chunkSize = op->chunkSize;
// args->dtype = op->dtype;
// args->redOp = op->redOp;
// args->pattern = op->pattern;
// args->protocol = op->protocol;
// args->state = scclProxyOpReady;
// args->progress = op->connection->tcomm->proxyProgress;
// args->proxyAppendPtr = op->connection->proxyAppendPtr;
// return scclSuccess;
// }
// static scclResult_t ProxyAppend(struct scclProxyProgressState* state, struct scclProxyOp* op) {
// struct scclProxyConnection* connection = op->connection;
// int shared = connection->shared;
// struct scclProxyArgs* args = *connection->proxyAppendPtr;
// if(args) {
// if(shared && args->opCount == op->opCount) {
// scclCHECK(scclProxyOpToArgs(op, args, args->nsubs));
// DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args));
// } else {
// struct scclProxyArgs* prevArgs = args;
// scclCHECK(allocateArgs(state, &args));
// scclCHECK(scclProxyOpToArgs(op, args, 0));
// prevArgs->nextPeer = args;
// DEBUG_PROXY_PRINT(
// "Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs));
// *(args->proxyAppendPtr) = args;
// }
// } else {
// // Nothing running for that peer. Add to the list
// scclCHECK(allocateArgs(state, &args));
// scclCHECK(scclProxyOpToArgs(op, args, 0));
// if(state->active == NULL) {
// // Create the list
// DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount);
// state->active = args;
// } else {
// // Append element at the end of the list
// struct scclProxyArgs* last = state->active;
// while(last->next)
// last = last->next;
// last->next = args;
// DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount);
// }
// *(args->proxyAppendPtr) = args;
// }
// return scclSuccess;
// }
// scclResult_t scclProxyPost(struct scclProxyOpsPool* pool, int nextOps, int nextOpsEnd) {
// pthread_mutex_lock(&pool->mutex);
// if(pool->nextOps == -1) {
// pool->nextOps = nextOps;
// pthread_cond_signal(&pool->cond);
// } else {
// pool->ops[pool->nextOpsEnd].next = nextOps;
// }
// pool->nextOpsEnd = nextOpsEnd;
// pthread_mutex_unlock(&pool->mutex);
// return scclSuccess;
// }
// static scclResult_t scclLocalOpAppend(struct scclComm* comm, struct scclProxyConnector* proxyConn, struct scclProxyOp* proxyOp) {
// int tpLocalRank = comm->topParentLocalRanks[comm->localRank];
// struct scclProxyOps* proxyOps = comm->proxyState->proxyOps;
// if(proxyOps == NULL)
// return scclInternalError;
// proxyOps += proxyConn->tpLocalRank;
// struct scclProxyOpsPool* pool = proxyOps->pool;
// TIME_START(0);
// int opIndex = proxyOps->freeOp;
// struct scclProxyOp* op;
// if(opIndex != -1) {
// op = pool->ops + opIndex;
// proxyOps->freeOp = op->next;
// } else {
// int freeOp;
// while((freeOp = pool->freeOps[tpLocalRank]) == -1)
// sched_yield();
// int freeOpNew;
// while((freeOpNew = __sync_val_compare_and_swap(pool->freeOps + tpLocalRank, freeOp, -1)) != freeOp)
// freeOp = freeOpNew;
// opIndex = freeOp;
// op = pool->ops + opIndex;
// proxyOps->freeOp = op->next;
// }
// if(op->next != -1)
// __builtin_prefetch(pool->ops + op->next); // Prefetch next free op
// memcpy(op, proxyOp, sizeof(struct scclProxyOp));
// op->next = -1;
// op->connection = proxyConn->connection;
// if(proxyOps->nextOps == -1) {
// proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex;
// } else {
// pool->ops[proxyOps->nextOpsEnd].next = opIndex;
// proxyOps->nextOpsEnd = opIndex;
// }
// if(++proxyOps->count == MAX_OPS_PER_PEER) {
// // Post what we have so far to free some ops in the pool
// // Do not post last operations as we could have more coming with the same opCount, and posting
// // them in different batches would break proxyArgs aggregation with subs.
// uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount;
// int lastOp = -1;
// int toSend = 0;
// int ops = 0;
// for(int op = proxyOps->nextOps; op != proxyOps->nextOpsEnd; op = pool->ops[op].next) {
// ops++;
// if(pool->ops[op].opCount != lastOpCount) {
// lastOp = op;
// toSend = ops;
// }
// }
// if(lastOp == -1) {
// WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
// return scclInternalError;
// }
// // Cut chain at lastOp
// int nextOps = proxyOps->nextOps;
// proxyOps->nextOps = pool->ops[lastOp].next;
// pool->ops[lastOp].next = -1;
// scclCHECK(scclProxyPost(proxyOps->pool, nextOps, lastOp));
// proxyOps->count -= toSend;
// }
// TIME_STOP(0);
// return scclSuccess;
// }
// static scclResult_t
// SaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex, bool* justInquire) {
// if(peer < 0)
// return scclSuccess;
// struct scclChannelPeer* peerComm = channel->peers[peer];
// struct scclConnector* connector = type == proxyRecv ? peerComm->recv + connIndex : peerComm->send + connIndex;
// if(connector->transportComm == NULL) {
// WARN("Rank %d has no transport for %s peer %d on channel %d/%d", comm->rank, type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
// return scclInternalError;
// }
// if(connector->transportComm->proxyProgress == NULL)
// return scclSuccess;
// if(justInquire)
// *justInquire = true;
// else {
// scclCHECK(scclLocalOpAppend(comm, &connector->proxyConn, op));
// }
// return scclSuccess;
// }
// scclResult_t mscclSaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex) {
// scclCHECK(SaveProxy(comm, channel, type, peer, op, connIndex, nullptr));
// return scclSuccess;
// }
// // justInquire != nullptr means don't actually do anything, just ascertain the need of
// // scclProxySaveOp for this op.
// scclResult_t scclProxySaveOp(struct scclComm* comm, struct scclProxyOp* op, bool* justInquire) {
// struct scclChannel* channel = &comm->channels[op->channelId];
// if(justInquire)
// *justInquire = false;
// switch(op->pattern) {
// case scclPatternRing:
// case scclPatternRingTwice:
// case scclPatternPipelineFrom:
// case scclPatternPipelineTo: {
// struct scclRing* ring = &channel->ring;
// if(NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
// }
// if(NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
// scclCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, op->connIndex, justInquire));
// }
// } break;
// case scclPatternTreeUp:
// case scclPatternTreeDown:
// case scclPatternTreeUpDown: {
// if(op->pattern != scclPatternTreeDown) { // Tree up
// struct scclTree* tree = &channel->tree;
// for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++) {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, tree->down[i], op, 0, justInquire));
// }
// scclCHECK(SaveProxy(comm, channel, proxySend, tree->up, op, 0, justInquire));
// }
// if(op->pattern != scclPatternTreeUp) { // Tree down
// struct scclTree* tree = &channel->tree;
// for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++) {
// scclCHECK(SaveProxy(comm, channel, proxySend, tree->down[i], op, 0, justInquire));
// }
// scclCHECK(SaveProxy(comm, channel, proxyRecv, tree->up, op, 0, justInquire));
// }
// } break;
// case scclPatternCollnetChain: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->collnetChain.up, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire));
// } break;
// case scclPatternCollnetDirect: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->collnetDirect.out, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire));
// } break;
// case scclPatternNvls: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.out, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.out, op, 0, justInquire));
// } break;
// case scclPatternNvlsTree: {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[1], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[2], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeUp, op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[1], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
// } break;
// case scclPatternSend:
// case scclPatternRecv: {
// if(op->root == comm->rank)
// return scclSuccess;
// scclCHECK(SaveProxy(comm, channel, op->pattern == scclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
// } break;
// }
// return scclSuccess;
// }
// SCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
// scclResult_t scclProxyComputeP2p(struct scclInfo* info, struct scclProxyOp* op) {
// memset(op, 0, sizeof(struct scclProxyOp));
// int channelId = info->channelId;
// struct scclChannel* channel = info->comm->channels + channelId;
// op->channelId = channelId;
// op->sliceSteps = P2P_SLICESTEPS;
// op->chunkSteps = P2P_CHUNKSTEPS;
// op->dtype = info->datatype;
// op->protocol = info->protocol;
// int stepSize = info->comm->buffSizes[op->protocol] / SCCL_STEPS;
// if(op->protocol == SCCL_PROTO_SIMPLE)
// stepSize = info->comm->p2pChunkSize;
// #ifdef HCU_SDMA_FEATURE
// info->chunkSize = info->comm->p2pRealChunkSize;
// #else
// info->chunkSize = stepSize;
// #endif
// op->root = info->root;
// struct scclChannelPeer* peer = channel->peers[op->root];
// if(info->coll == scclFuncSend) {
// op->pattern = scclPatternSend;
// if(op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
// // Tune chunk size for the network
// if(info->count < stepSize)
// info->chunkSize /= 4;
// else if(info->count < 8 * stepSize)
// info->chunkSize /= 2;
// }
// } else if(info->coll == scclFuncRecv) {
// op->pattern = scclPatternRecv;
// if(op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
// // Tune chunk size for the network
// if(info->count < stepSize)
// info->chunkSize /= 4;
// else if(info->count < 8 * stepSize)
// info->chunkSize /= 2;
// }
// } else {
// WARN("P2p operation is neither send or recv");
// return scclInternalError;
// }
// if(scclParamChunkSize() != 0) {
// info->chunkSize = scclParamChunkSize();
// }
// op->chunkSize = info->chunkSize;
// // Compute nSteps for proxies
// int chunkEffectiveSize = op->chunkSize;
// if(op->protocol == SCCL_PROTO_LL) {
// chunkEffectiveSize /= 2;
// }
// op->nbytes = stepSize;
// op->nsteps = DIVUP(info->count, chunkEffectiveSize);
// if(op->nsteps == 0)
// op->nsteps = 1;
// return scclSuccess;
// }
// static scclResult_t removeOp(struct scclProxyProgressState* state, struct scclProxyArgs** opPtr, struct scclProxyArgs** prevOpPtr) {
// struct scclProxyArgs* freeOp = *opPtr;
// struct scclProxyArgs* next = freeOp->next;
// DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next));
// *opPtr = next;
// if(freeOp->nextPeer) {
// // replace op by nextPeer
// struct scclProxyArgs* nextPeer = freeOp->nextPeer;
// if(*prevOpPtr) {
// (*prevOpPtr)->next = nextPeer;
// } else {
// state->active = nextPeer;
// }
// nextPeer->next = next;
// *(prevOpPtr) = nextPeer;
// } else {
// *(freeOp->proxyAppendPtr) = NULL;
// if(*prevOpPtr) {
// (*prevOpPtr)->next = next;
// } else {
// state->active = next;
// }
// }
// freeOp->next = state->pool;
// state->pool = freeOp;
// DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
// #ifdef DEBUG_PROXY
// scclCHECK(dumpProxyState(state));
// #endif
// return scclSuccess;
// }
// static scclResult_t progressOps(struct scclProxyState* proxyState, struct scclProxyProgressState* state, struct scclProxyArgs* opStart, int* idle) {
// struct scclProxyArgs* prevOp = NULL;
// struct scclProxyArgs* op = opStart;
// while(op) {
// if(op->state == scclProxyOpNone)
// return scclInternalError;
// TIME_START(0);
// TIME_START(1);
// scclCHECK(op->progress(proxyState, op));
// if(op->idle) {
// TIME_STOP(1);
// TIME_CANCEL(0);
// } else {
// TIME_CANCEL(1);
// TIME_STOP(0);
// }
// *idle &= op->idle;
// if(op->state == scclProxyOpNone) {
// TIME_START(2);
// scclCHECK(removeOp(state, &op, &prevOp));
// TIME_STOP(2);
// } else {
// prevOp = op;
// op = op->next;
// }
// }
// return scclSuccess;
// }
// SCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
// static scclResult_t scclProxyGetPostedOps(struct scclProxyState* proxyState, int* added) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(state->opsPool == NULL)
// return scclInternalError;
// struct scclProxyOpsPool* pool = state->opsPool;
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// if(state->nextOps != -1)
// goto process_nextops;
// // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
// // to be available. Exit, continue progress, and come back later.
// if(state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0))
// return scclSuccess;
// if(state->active == NULL) {
// pthread_mutex_lock(&pool->mutex);
// while(pool->nextOps == -1 && !state->stop) {
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileSleep);
// pthread_cond_wait(&pool->cond, &pool->mutex);
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileWakeup);
// }
// if(state->stop) { // We might have been woken up to stop.
// pthread_mutex_unlock(&pool->mutex);
// return scclSuccess;
// }
// }
// state->nextOps = pool->nextOps;
// pool->nextOps = pool->nextOpsEnd = -1;
// pthread_mutex_unlock(&pool->mutex);
// if(state->nextOps == -1)
// return scclInternalError;
// process_nextops:
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileAppend);
// TIME_START(2);
// int freeOp[SCCL_MAX_LOCAL_RANKS];
// int freeOpEnd[SCCL_MAX_LOCAL_RANKS];
// for(int i = 0; i < proxyState->tpLocalnRanks; i++)
// freeOp[i] = -1;
// uint64_t lastOpCount = 0;
// int lastPeer = -1;
// int count = 0;
// for(int opIndex = state->nextOps; opIndex != -1;) {
// struct scclProxyOp* peerOp = pool->ops + opIndex;
// int peer = opIndex / MAX_OPS_PER_PEER;
// if((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer))
// count++;
// if(count == scclParamProxyAppendBatchSize() + 1)
// break;
// lastOpCount = peerOp->opCount;
// lastPeer = peer;
// if(peerOp->connection == NULL)
// return scclInternalError;
// if(peerOp->next != -1)
// __builtin_prefetch(pool->ops + peerOp->next);
// scclCHECK(ProxyAppend(state, peerOp));
// (*added)++;
// int lastOpIndex = opIndex;
// opIndex = peerOp->next;
// // Return op to peer pool
// if(freeOp[peer] == -1) {
// freeOpEnd[peer] = lastOpIndex;
// } else {
// peerOp->next = freeOp[peer];
// }
// freeOp[peer] = lastOpIndex;
// state->nextOps = opIndex;
// }
// for(int i = 0; i < proxyState->tpLocalnRanks; i++) {
// if(freeOp[i] == -1)
// continue;
// int newFree = freeOp[i];
// int oldFree = pool->freeOps[i];
// pool->ops[freeOpEnd[i]].next = oldFree;
// if(oldFree == -1) {
// // Nothing for the main thread to consume, we can set it.
// pool->freeOps[i] = newFree;
// } else {
// // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked.
// int swap = __sync_val_compare_and_swap(pool->freeOps + i, oldFree, newFree);
// if(swap != oldFree) {
// if(swap != -1)
// return scclInternalError;
// // Ops were recycled while we were trying to swap, just set the value directly now.
// pool->ops[freeOpEnd[i]].next = -1;
// pool->freeOps[i] = newFree;
// }
// }
// }
// profArgs.opCount = *added;
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileAppendEnd);
// TIME_STOP(2);
// return scclSuccess;
// }
// #include <signal.h>
// static scclProxyProgressState* scclLastProxyState;
// void scclDumpProxyState(int signal) { dumpProxyState(scclLastProxyState); }
// SCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);
// static int setProxyThreadContext(struct scclProxyState* proxyState) {
// #if CUDART_VERSION >= 11030
// static int createThreadContext = -1;
// if(createThreadContext == -1) {
// createThreadContext = scclParamCreateThreadContext();
// if(createThreadContext) {
// if(CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
// WARN("Unable to create thread context due to old driver, disabling.");
// createThreadContext = 0;
// }
// }
// }
// if(createThreadContext) {
// if(proxyState->cudaCtx == NULL) {
// if(CUPFN(cuCtxCreate(&proxyState->cudaCtx, CU_CTX_SCHED_SPIN | CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
// WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
// createThreadContext = 0;
// }
// } else {
// if(CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
// WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
// return 0;
// }
// return 1;
// }
// }
// #endif
// return 0;
// }
// // Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs
// SCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1);
// SCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
// void* scclProxyProgress(void* proxyState_) {
// struct scclProxyState* proxyState = (struct scclProxyState*)proxyState_;
// if(setProxyThreadContext(proxyState)) {
// INFO(SCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
// } else if(cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
// WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
// }
// // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// struct scclProxyProgressState* state = &proxyState->progressState;
// state->nextOps = -1;
// const int sig = scclParamProxyDumpSignal();
// if(sig != -1)
// signal(sig, scclDumpProxyState);
// scclLastProxyState = state;
// char threadName[SCCL_THREAD_NAMELEN];
// snprintf(threadName, SCCL_THREAD_NAMELEN, "sccl Progress%2d", proxyState->cudaDev);
// nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
// int lastIdle = 0;
// /* Calling scclProxyGetPostedOps() too frequently causes a perf regression for small-message
// * communication. proxyOpAppendCounter helps decide when to append proxy ops: after each
// * progress pass it is incremented and compared against the environment variable
// * scclParamProgressAppendOpFreq(); when they are equal, proxy ops are appended. This lowers
// * the call frequency of scclProxyGetPostedOps() and reduces the perf impact. */
// int proxyOpAppendCounter = 0;
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// while((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) {
// int idle = 1;
// scclResult_t ret = progressOps(proxyState, state, state->active, &idle);
// if(ret != scclSuccess) {
// INFO(SCCL_ALL, "%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
// return NULL;
// }
// if(lastIdle == 0 && idle == 1)
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileIdle);
// if(lastIdle == 1 && idle == 0)
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileActive);
// if(idle || (++proxyOpAppendCounter == scclParamProgressAppendOpFreq())) {
// int added = 0;
// proxyOpAppendCounter = 0;
// TIME_START(3);
// if(state->stop == false)
// ret = scclProxyGetPostedOps(proxyState, &added);
// if(added) {
// TIME_STOP(3);
// } else {
// TIME_CANCEL(3);
// }
// if(ret != scclSuccess) {
// INFO(SCCL_ALL, "%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
// }
// if(added == 0) {
// sched_yield(); // No request progressed. Let others run.
// }
// }
// lastIdle = idle;
// }
// return NULL;
// }
// scclResult_t scclProxyStart(struct scclComm* comm) {
// struct scclProxyOps* proxyOps = comm->proxyState->proxyOps;
// if(proxyOps == NULL)
// return scclSuccess;
// TIME_START(1);
// for(int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) {
// struct scclProxyOps* ops = proxyOps + r;
// if(ops->pool == NULL || ops->nextOps == -1)
// continue;
// scclCHECK(scclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd));
// ops->nextOps = ops->nextOpsEnd = -1;
// ops->count = 0;
// }
// comm->opCount++;
// TIME_STOP(1);
// return scclSuccess;
// }
// static scclResult_t scclProxyProgressCreate(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(!state->thread) {
// pthread_create(&state->thread, NULL, scclProxyProgress, proxyState);
// scclSetThreadName(state->thread, "sccl Progress%2d", proxyState->tpLocalnRanks);
// }
// return scclSuccess;
// }
// scclResult_t scclProxyProgressDestroy(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// // Request the proxy to stop and then wake it
// if(state->opsPool) {
// pthread_mutex_lock(&state->opsPool->mutex);
// state->stop = true;
// pthread_cond_signal(&state->opsPool->cond);
// pthread_mutex_unlock(&state->opsPool->mutex);
// pthread_join(state->thread, NULL);
// }
// // Free off any memory allocated for the proxy arg pools
// while(state->pools != NULL) {
// struct scclProxyPool* next = state->pools->next;
// free(state->pools);
// state->pools = next;
// }
// scclProfilingDump();
// TIME_PRINT("Proxy");
// return scclSuccess;
// }
// #define SCCL_PROXY_CONN_POOL_SIZE_POW2 7
// #define SCCL_PROXY_CONN_POOL_SIZE (1 << (SCCL_PROXY_CONN_POOL_SIZE_POW2))
// #define SCCL_PROXY_CONN_POOL_MASK ((SCCL_PROXY_CONN_POOL_SIZE) - 1)
// struct scclProxyConnectionPool {
// struct scclProxyConnection** pools;
// int banks;
// int offset;
// };
// static scclResult_t scclProxyNewConnection(struct scclProxyConnectionPool* pool, int* id) {
// if(pool->offset == SCCL_PROXY_CONN_POOL_SIZE) {
// scclCHECK(scclRealloc(&pool->pools, pool->banks, pool->banks + 1));
// scclCHECK(scclCalloc(pool->pools + pool->banks, SCCL_PROXY_CONN_POOL_SIZE));
// pool->banks++;
// pool->offset = 0;
// }
// *id = ((pool->banks - 1) << SCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset;
// pool->offset++;
// return scclSuccess;
// }
// static scclResult_t scclProxyGetConnection(struct scclProxyConnectionPool* pool, int id, struct scclProxyConnection** conn) {
// int bank = id >> SCCL_PROXY_CONN_POOL_SIZE_POW2;
// int offset = id & SCCL_PROXY_CONN_POOL_MASK;
// if((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL))
// return scclInternalError;
// *conn = pool->pools[bank] + offset;
// return scclSuccess;
// }
// static scclResult_t proxyFree(struct scclProxyConnection* connection, struct scclProxyState* proxyState) {
// if(connection->send) {
// if(scclTransports[connection->transport]->send.proxyFree) {
// scclCHECK(scclTransports[connection->transport]->send.proxyFree(connection, proxyState));
// }
// } else {
// if(scclTransports[connection->transport]->recv.proxyFree) {
// scclCHECK(scclTransports[connection->transport]->recv.proxyFree(connection, proxyState));
// }
// }
// return scclSuccess;
// }
// static scclResult_t scclProxyFreeConnections(struct scclProxyConnectionPool* pool, struct scclProxyState* proxyState) {
// for(int b = 0; b < pool->banks; b++) {
// int max = b == pool->banks - 1 ? pool->offset : SCCL_PROXY_CONN_POOL_SIZE;
// for(int i = 0; i < max; i++) {
// scclProxyConnection* connection = pool->pools[b] + i;
// if(connection->state != connUninitialized) {
// scclCHECK(proxyFree(connection, proxyState));
// }
// }
// free(pool->pools[b]);
// }
// free(pool->pools);
// return scclSuccess;
// }
// #include "transport.h"
// struct scclProxyInitReq {
// int transport;
// int send;
// int tpLocalRank;
// int tpRank;
// int sameProcess;
// };
// struct scclProxyInitResp {
// scclProxyConnection* connection;
// char devShmPath[6]; // "XXXXXX" - May or may not be set
// };
// scclResult_t scclProxyConnect(struct scclComm* comm, int transport, int send, int tpProxyRank, struct scclProxyConnector* proxyConn) {
// struct scclSocket* sock;
// int ready, proxyRank = -1;
// struct scclProxyState* sharedProxyState = comm->proxyState;
// // Keep one connection per local rank
// for(int i = 0; i < comm->localRanks; ++i) {
// /* find the proxy rank in comm. */
// if(comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
// proxyRank = comm->localRankToRank[i];
// break;
// }
// }
// proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
// // Keep one connection per local rank
// proxyConn->connection = NULL;
// proxyConn->tpRank = tpProxyRank;
// if(sharedProxyState->peerSocks == NULL) {
// scclCHECK(scclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
// scclCHECK(scclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
// scclCHECK(scclCalloc(&sharedProxyState->sharedDevMems, comm->sharedRes->tpNLocalRanks));
// for(int i = 0; i < comm->sharedRes->tpNLocalRanks; ++i) {
// scclCHECK(scclSocketSetFd(-1, &sharedProxyState->peerSocks[i]));
// }
// }
// proxyConn->tpLocalRank = comm->sharedRes->tpRankToLocalRank[proxyConn->tpRank];
// sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// scclCHECK(scclSocketReady(sock, &ready));
// if(!ready) {
// scclCHECK(scclSocketInit(sock, sharedProxyState->peerAddresses + proxyConn->tpRank, comm->sharedRes->magic, scclSocketTypeProxy, comm->abortFlag));
// scclCHECK(scclSocketConnect(sock));
// }
// struct scclProxyInitReq req = {0};
// req.transport = transport;
// req.send = send;
// req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
// req.tpRank = comm->topParentRanks[comm->rank];
// req.sameProcess = proxyConn->sameProcess;
// struct scclProxyInitResp resp = {0};
// // This usually sends proxyConn->connection to identify which connection this is.
// // However, this is part of the response and therefore is ignored
// scclCHECK(scclProxyCallBlocking(comm, proxyConn, scclProxyMsgInit, &req, sizeof(req), &resp, sizeof(resp)));
// proxyConn->connection = resp.connection;
// // If we need proxy progress, map progress ops
// struct scclTransportComm* tcomm = send ? &scclTransports[transport]->send : &scclTransports[transport]->recv;
// if(tcomm->proxyProgress) {
// char poolPath[] = "/dev/shm/sccl-XXXXXX";
// strncpy(poolPath + sizeof("/dev/shm/sccl-") - 1, resp.devShmPath, sizeof("XXXXXX") - 1);
// struct scclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank;
// if(proxyOps->pool == NULL) {
// scclCHECK(scclShmOpen(poolPath, sizeof(struct scclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle));
// proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
// }
// }
// INFO(SCCL_NET | SCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
// return scclSuccess;
// }
// // cuMem API support
// // The response is sent out-of-band using scclIpcSocket for this specific command
// /**
// * Convert a file descriptor into one usable across processes via the proxy connection.
// *
// * @param comm sccl communicator
// * @param proxyConn proxy connector
// * @param fd file descriptor to convert
// * @param convertedFd output parameter receiving the converted file descriptor
// * @return operation result (scclSuccess on success)
// *
// * This call blocks until the conversion completes or fails. It first creates a UDS socket
// * to receive the converted fd, then requests the conversion through the proxy, and finally
// * polls the proxy response until the operation completes. On error, the socket is closed
// * and an error is reported.
// */
// scclResult_t scclProxyClientConvertFdBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int fd, int* convertedFd) {
// scclResult_t ret = scclSuccess;
// scclResult_t res = scclInProgress;
// struct scclIpcSocket ipcSock = {0};
// void* opId = malloc(1);
// // Create a UDS socket to receive the converted fd
// scclCHECK(scclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
// // Request the conversion of the fd over sockets
// scclCHECKGOTO(scclProxyCallAsync(comm, proxyConn, scclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error);
// // Receive converted fd over UDS
// scclCHECK(scclIpcSocketRecvFd(&ipcSock, convertedFd));
// TRACE(SCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd);
// scclCHECK(scclIpcSocketClose(&ipcSock));
// while(res == scclInProgress) {
// res = scclPollProxyResponse(comm, proxyConn, NULL, opId);
// }
// free(opId);
// return res;
// error:
// scclCHECK(scclIpcSocketClose(&ipcSock));
// WARN("scclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank);
// return ret;
// }
// const char* scclProxyMsgTypeStr[] = {"Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd"};
// scclResult_t scclProxyCallAsync(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId)
// {
// struct scclSocket* sock;
// scclResult_t ret = scclSuccess;
// struct scclProxyState* sharedProxyState = comm->proxyState;
// if(sharedProxyState->peerSocks == NULL)
// return scclInternalError;
// sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// if(sock == NULL)
// return scclInternalError;
// scclCHECKGOTO(scclSocketSend(sock, &type, sizeof(int)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &respSize, sizeof(int)), ret, error);
// if(reqSize)
// scclCHECKGOTO(scclSocketSend(sock, reqBuff, reqSize), ret, error);
// // Send opId to proxy
// scclCHECKGOTO(scclSocketSend(sock, &opId, sizeof(opId)), ret, error);
// // Add proxyOp to expected response queue
// scclCHECK(expectedProxyResponseEnqueue(sharedProxyState, opId, respSize));
// return scclSuccess;
// error:
// return ret;
// }
// scclResult_t scclPollProxyResponse(struct scclComm* comm, struct scclProxyConnector* proxyConn, void* respBuff, void* opId) {
// struct scclProxyState* sharedProxyState = comm->proxyState;
// // Receive the connection pointer from the Proxy
// if(*comm->abortFlag) {
// WARN("Comm %p is in abort state", comm);
// return scclInternalError;
// }
// if(sharedProxyState->peerSocks == NULL)
// return scclInternalError;
// // Check response queue
// int found = 0;
// scclCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found));
// if(found == 0) {
// // Attempt to read in a new response header from the proxy thread
// struct scclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// void* recvOpId;
// int offset = 0;
// if(scclSuccess != scclSocketProgress(SCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
// WARN("Socket recv failed while polling for opId=%p", opId);
// return scclInternalError;
// }
// if(offset == 0) {
// return scclInProgress;
// // If we've returned a partial response, block to receive the rest of it
// } else if(offset < sizeof(recvOpId)) {
// while(offset < sizeof(recvOpId))
// scclCHECK(scclSocketProgress(SCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
// }
// INFO(SCCL_PROXY, "scclPollProxyResponse Received new opId=%p", recvOpId);
// // Now do a blocking recv of the response size
// int respSize = 0;
// scclCHECK(scclSocketRecv(sock, &respSize, sizeof(respSize)));
// // If there's a respSize to recv
// if(respSize > 0) {
// if(recvOpId != opId) {
// // Unexpected response, need to buffer the socket data
// respBuff = malloc(respSize);
// }
// assert(respBuff != NULL);
// scclCHECK(scclSocketRecv(sock, respBuff, respSize));
// }
// if(recvOpId == opId) {
// INFO(SCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
// scclCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId));
// return scclSuccess;
// } else {
// INFO(SCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize);
// // Store the result and mark response as completed
// scclCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize));
// return scclInProgress;
// }
// } else {
// INFO(SCCL_PROXY, "scclPollProxyResponse Dequeued cached opId=%p", opId);
// }
// return scclSuccess;
// }
// scclResult_t
// scclProxyCallBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
// // Alloc some memory to act as a handle
// scclResult_t res = scclSuccess;
// void* opId = malloc(1);
// scclCHECKGOTO(scclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail);
// do {
// res = scclPollProxyResponse(comm, proxyConn, respBuff, opId);
// } while(res == scclInProgress);
// exit:
// free(opId);
// return res;
// fail:
// goto exit;
// }
// static scclResult_t proxyProgressInit(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(state->opsPool == NULL) {
// int size = sizeof(struct scclProxyOpsPool);
// struct scclProxyOpsPool* pool = NULL;
// char shmPath[sizeof("/dev/shm/sccl-XXXXXX")];
// shmPath[0] = '\0';
// scclCHECK(scclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle));
// // Init pool
// pool->nextOps = -1;
// for(int r = 0; r < proxyState->tpLocalnRanks; r++) {
// pool->freeOps[r] = r * MAX_OPS_PER_PEER;
// for(int i = 0; i < MAX_OPS_PER_PEER - 1; i++)
// pool->ops[r * MAX_OPS_PER_PEER + i].next = r * MAX_OPS_PER_PEER + i + 1;
// pool->ops[(r + 1) * MAX_OPS_PER_PEER - 1].next = -1;
// }
// // Setup mutex/cond to work inter-process
// pthread_mutexattr_t mutexAttr;
// pthread_mutexattr_init(&mutexAttr);
// pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
// pthread_mutex_init(&pool->mutex, &mutexAttr);
// pthread_condattr_t condAttr;
// pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
// pthread_cond_init(&pool->cond, &condAttr);
// state->opsPool = pool;
// memcpy(state->opsPoolShmSuffix, shmPath + sizeof("/dev/shm/sccl-") - 1, sizeof("XXXXXX") - 1);
// // All ops structures are created, we can start the progress thread
// scclCHECK(scclProxyProgressCreate(proxyState));
// }
// return scclSuccess;
// }
// static void proxyOpsFree(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(scclShmClose(state->handle) != scclSuccess) {
// WARN("[Service thread] shm close failed");
// }
// }
// scclResult_t scclProxyShmUnlink(struct scclComm* comm) {
// struct scclProxyProgressState* state = &comm->proxyState->progressState;
// if(state->opsPool == NULL)
// return scclSuccess;
// if(scclShmUnlink(state->handle) != scclSuccess) {
// WARN("[Service thread] proxy ops shm unlink failed");
// }
// return scclSuccess;
// }
// static scclResult_t proxyConnInit(struct scclProxyLocalPeer* peer,
// struct scclProxyConnectionPool* connectionPool,
// struct scclProxyState* proxyState,
// scclProxyInitReq* req,
// scclProxyInitResp* resp,
// struct scclProxyConnection** connection) {
// int id;
// scclCHECK(scclProxyNewConnection(connectionPool, &id));
// scclCHECK(scclProxyGetConnection(connectionPool, id, connection));
// (*connection)->sock = &peer->sock;
// (*connection)->transport = req->transport;
// (*connection)->send = req->send;
// (*connection)->tpLocalRank = req->tpLocalRank;
// (*connection)->sameProcess = req->sameProcess;
// peer->tpLocalRank = req->tpLocalRank;
// peer->tpRank = req->tpRank;
// resp->connection = *connection;
// (*connection)->tcomm = (*connection)->send ? &scclTransports[(*connection)->transport]->send : &scclTransports[(*connection)->transport]->recv;
// // If we need proxy progress, let's allocate ops and start the thread
// if((*connection)->tcomm->proxyProgress) {
// scclCHECK(proxyProgressInit(proxyState));
// struct scclProxyProgressState* state = &proxyState->progressState;
// strncpy(resp->devShmPath, state->opsPoolShmSuffix, sizeof(resp->devShmPath));
// }
// INFO(SCCL_NET | SCCL_PROXY,
// "New proxy %s connection %d from local rank %d, transport %d",
// (*connection)->send ? "send" : "recv",
// id,
// (*connection)->tpLocalRank,
// (*connection)->transport);
// __atomic_store_n(&(*connection)->state, connInitialized, __ATOMIC_RELEASE);
// return scclSuccess;
// }
// // cuMem API support
// static scclResult_t proxyConvertFd(struct scclProxyLocalPeer* peer, void* opId, struct scclProxyState* proxyState, int fd) {
// struct scclIpcSocket ipcSock = {0};
// uint64_t hash = (uint64_t)opId;
// INFO(SCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash);
// // Send back the converted fd using UDS
// scclCHECK(scclIpcSocketInit(&ipcSock, proxyState->tpRank, hash ^ 1, proxyState->abortFlag));
// scclCHECK(scclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash));
// scclCHECK(scclIpcSocketClose(&ipcSock));
// return scclSuccess;
// }
// static scclResult_t proxyProgressAsync(struct scclProxyAsyncOp* op,
// struct scclProxyState* proxyState,
// int* asyncOpCount,
// struct scclProxyLocalPeer* peer,
// struct scclProxyConnectionPool* connectionPool) {
// int done = 1;
// if(op->type == scclProxyMsgSetup) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
// scclCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
// } else if(op->type == scclProxyMsgConnect) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
// scclCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
// } else if(op->type == scclProxyMsgSharedInit) {
// int nChannels = (int)*op->reqBuff;
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
// if(op->connection->tcomm->proxySharedInit)
// scclCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels));
// __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
// } else if(op->type == scclProxyMsgConvertFd) {
// int fd = *(int*)op->reqBuff;
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd);
// scclCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support
// } else if(op->type == scclProxyMsgInit) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
// scclCHECK(proxyConnInit(peer, connectionPool, proxyState, (scclProxyInitReq*)op->reqBuff, (scclProxyInitResp*)op->respBuff, &op->connection));
// } else
// return scclInternalError;
// if(done) {
// INFO(SCCL_PROXY, "proxyProgressAsync opId=%p op.type=%d op.reqBuff=%p op.respSize=%d done", op->opId, op->type, op->reqBuff, op->respSize);
// if(op->type == scclProxyMsgSetup)
// __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
// else if(op->type == scclProxyMsgConnect)
// __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
// /* if setup or connect is done, we should not return any error at this point since
// * scclSocketSend might already send the respBuff to the requester. If we still choose
// * to abort and close the connection, it can cause segfault if the requester is using
// * the respBuff. */
// // Send the opId for referencing async operation
// scclCHECK(scclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
// // Send the response size
// scclCHECK(scclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
// if(op->respSize) {
// // Send the response
// scclCHECK(scclSocketSend(op->connection->sock, op->respBuff, op->respSize));
// }
// asyncProxyOpDequeue(peer, op);
// (*asyncOpCount)--;
// return scclSuccess;
// } else if(*proxyState->abortFlag != 0) {
// return scclInternalError;
// }
// return scclInProgress;
// }
// static scclResult_t proxyServiceInitOp(
// int type, struct scclProxyLocalPeer* peer, struct scclProxyConnectionPool* connectionPool, struct scclProxyState* proxyState, int* asyncOpCount) {
// struct scclSocket* sock = &peer->sock;
// struct scclProxyAsyncOp* asyncOp;
// scclCHECK(scclCalloc(&asyncOp, 1));
// asyncOp->type = type;
// scclCHECK(scclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
// scclCHECK(scclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
// scclCHECK(scclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
// if(asyncOp->reqSize) {
// scclCHECK(scclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
// scclCHECK(scclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
// }
// // Store opId for completion response
// scclCHECK(scclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
// if(asyncOp->respSize)
// scclCHECK(scclCalloc(&asyncOp->respBuff, asyncOp->respSize));
// asyncProxyOpEnqueue(peer, asyncOp);
// (*asyncOpCount)++;
// scclCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
// return scclSuccess;
// }
// #include <poll.h>
// static bool proxyMatchOpType(int type) {
// switch(type) {
// case scclProxyMsgInit:
// case scclProxyMsgSharedInit:
// case scclProxyMsgSetup:
// case scclProxyMsgConnect:
// case scclProxyMsgConvertFd: return true;
// default: return false;
// }
// }
// void* scclProxyService(void* _args) {
// struct scclProxyState* proxyState = (struct scclProxyState*)_args;
// // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// if(setProxyThreadContext(proxyState)) {
// INFO(SCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev);
// } else if(cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
// WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev);
// }
// // Prepare poll descriptor
// struct scclProxyConnectionPool connectionPool;
// connectionPool.pools = NULL;
// connectionPool.banks = 0;
// connectionPool.offset = SCCL_PROXY_CONN_POOL_SIZE;
// struct pollfd pollfds[SCCL_MAX_LOCAL_RANKS + 1];
// struct scclProxyLocalPeer peers[SCCL_MAX_LOCAL_RANKS];
// memset(&peers, 0, sizeof(struct scclProxyLocalPeer) * SCCL_MAX_LOCAL_RANKS);
// for(int s = 0; s < SCCL_MAX_LOCAL_RANKS; s++) {
// pollfds[s].fd = -1;
// pollfds[s].events = POLLHUP | POLLIN;
// }
// if(scclSocketGetFd(proxyState->listenSock, &pollfds[SCCL_MAX_LOCAL_RANKS].fd) != scclSuccess) {
// WARN("[Proxy Service] Get listenSock fd fails");
// return NULL;
// };
// pollfds[SCCL_MAX_LOCAL_RANKS].events = POLLIN;
// int maxnpeers = 0;
// int npeers = 0;
// int stop = 0;
// int asyncOpCount = 0;
// while(stop == 0 || (stop == 1 && npeers > 0)) {
// /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
// * connections. Need to wait until all other related comms call abort and safely exit
// * together, or we could face segmentation fault. */
// if(*proxyState->abortFlag != 0)
// stop = 1;
// /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
// int ret;
// do {
// ret = poll(pollfds, SCCL_MAX_LOCAL_RANKS + 1, asyncOpCount ? 0 : 500);
// } while(ret < 0 && errno == EINTR);
// if(ret < 0) {
// WARN("[Proxy Service] Poll failed: %s", strerror(errno));
// return NULL;
// }
// if(pollfds[SCCL_MAX_LOCAL_RANKS].revents) {
// int s = 0;
// while(s < SCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0)
// s++;
// if(s == SCCL_MAX_LOCAL_RANKS) {
// WARN("[Proxy service] Too many connections (%d max)", SCCL_MAX_LOCAL_RANKS);
// return NULL;
// }
// if(maxnpeers < s + 1)
// maxnpeers = s + 1;
// if(scclSocketInit(&peers[s].sock) != scclSuccess) {
// WARN("[Service thread] Initialize peers[%d].sock fails", s);
// return NULL;
// }
// if(scclSocketAccept(&peers[s].sock, proxyState->listenSock) != scclSuccess) {
// WARN("[Service thread] Accept failed %s", strerror(errno));
// } else {
// if(scclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != scclSuccess) {
// WARN("[Service thread] Get peers[%d].sock fd fails", s);
// return NULL;
// }
// npeers++;
// peers[s].tpLocalRank = -1;
// }
// }
// for(int s = 0; s < maxnpeers; s++) {
// struct scclProxyLocalPeer* peer = peers + s;
// struct scclSocket* sock = &peer->sock;
// int closeConn = 0;
// int type = 0;
// scclResult_t res = scclSuccess;
// if(pollfds[s].fd == -1)
// continue;
// // Progress all ops for this scclProxyLocalPeer
// scclProxyAsyncOp* op = peer->asyncOps;
// while(op != nullptr) {
// scclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
// type = op->type;
// res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
// if(res == scclSuccess || res == scclInProgress) {
// op = opnext;
// } else {
// // Res is a bad result
// closeConn = 1;
// WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", scclProxyMsgTypeStr[type], res);
// break;
// }
// }
// // Check for additional ops coming in
// if(pollfds[s].revents & POLLIN) {
// int closed;
// res = scclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
// if(res != scclSuccess && res != scclInProgress) {
// WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
// closeConn = 1;
// } else if(closed) {
// INFO(SCCL_INIT | SCCL_NET | SCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
// closeConn = 1;
// } else if(res == scclSuccess) { // We received something from the sock
// if(type == scclProxyMsgStop) {
// stop = 1;
// closeConn = 1;
// } else if(type == scclProxyMsgClose) {
// closeConn = 1;
// } else if(proxyMatchOpType(type)) {
// res = proxyServiceInitOp(type, peers + s, &connectionPool, proxyState, &asyncOpCount);
// } else {
// WARN("[Service thread] Unknown command %d from localRank %d", type, peer->tpLocalRank);
// closeConn = 1;
// }
// INFO(SCCL_PROXY, "Received and initiated operation=%s res=%d", scclProxyMsgTypeStr[type], res);
// }
// } else if(pollfds[s].revents & POLLHUP) {
// closeConn = 1;
// }
// if(res != scclSuccess && res != scclInProgress) {
// WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d",
// proxyState->tpRank,
// scclProxyMsgTypeStr[type],
// peer->tpRank,
// res);
// closeConn = 1;
// }
// if(closeConn) {
// scclSocketClose(sock);
// if(op != nullptr) {
// asyncProxyOpDequeue(peer, op);
// asyncOpCount--;
// }
// pollfds[s].fd = -1;
// npeers--;
// }
// }
// }
// // Wait for all operations to complete and stop progress thread before freeing any resource
// if(scclProxyProgressDestroy(proxyState) != scclSuccess) {
// WARN("[Proxy Service] proxyDestroy failed");
// }
// for(int s = 0; s < maxnpeers; s++) {
// scclSocketClose(&peers[s].sock);
// }
// scclProxyFreeConnections(&connectionPool, proxyState);
// scclSocketClose(proxyState->listenSock);
// free(proxyState->listenSock);
// proxyOpsFree(proxyState);
// return NULL;
// }
// scclResult_t scclProxyInit(struct scclComm* comm, struct scclSocket* sock, union scclSocketAddress* peerAddresses) {
// assert(comm->sharedRes->proxyState == NULL);
// scclCHECK(scclCalloc(&comm->sharedRes->proxyState, 1));
// comm->proxyState = comm->sharedRes->proxyState;
// comm->proxyState->refCount = 1;
// comm->proxyState->listenSock = sock;
// comm->proxyState->peerAddresses = peerAddresses;
// return scclSuccess;
// }
// scclResult_t scclProxyCreate(struct scclComm* comm) {
// /* proxyState is shared among parent comm and split comms. comm->proxyState->thread is
// * pthread_join()'d by commFree() in init.cc when the refCount reduces down to 0. */
// struct scclProxyState* proxyState = comm->proxyState;
// if(proxyState->refCount == 1) {
// /* we have to make sure all following fields in comm have been initialized. */
// proxyState->tpRank = comm->rank;
// proxyState->tpnRanks = comm->nRanks;
// proxyState->tpLocalnRanks = comm->localRanks;
// proxyState->cudaDev = comm->cudaDev;
// proxyState->abortFlag = comm->abortFlag;
// proxyState->p2pnChannels = comm->p2pnChannels;
// proxyState->p2pChunkSize = comm->p2pChunkSize;
// proxyState->nChannels = comm->nChannels;
// proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers;
// proxyState->dmaBufSupport = comm->dmaBufSupport;
// proxyState->scclNet = comm->scclNet;
// proxyState->scclCollNet = comm->scclCollNet;
// memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
// pthread_create(&comm->proxyState->thread, NULL, scclProxyService, comm->proxyState);
// scclSetThreadName(comm->proxyState->thread, "sccl Service %2d", comm->cudaDev);
// }
// return scclSuccess;
// }
// scclResult_t scclProxyStop(struct scclComm* comm) {
// if(comm->sharedRes && comm->sharedRes->proxyState) {
// struct scclProxyState* sharedProxyState = comm->sharedRes->proxyState;
// if((comm->proxyRefCountOld = scclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
// if(sharedProxyState->peerAddresses) {
// if(*comm->abortFlag == 0) {
// struct scclSocket sock;
// int type = scclProxyMsgStop;
// scclCHECK(scclSocketInit(&sock,
// sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank],
// comm->sharedRes->magic,
// scclSocketTypeProxy,
// comm->abortFlag));
// scclCHECK(scclSocketConnect(&sock));
// scclCHECK(scclSocketSend(&sock, &type, sizeof(int)));
// scclCHECK(scclSocketClose(&sock));
// }
// }
// if(sharedProxyState->peerSocks) {
// int tplocalRanks = comm->sharedRes->tpNLocalRanks;
// for(int i = 0; i < tplocalRanks; i++) {
// int fd;
// scclCHECK(scclSocketGetFd(sharedProxyState->peerSocks + i, &fd));
// if(fd >= 0) {
// if(sharedProxyState->proxyOps[i].pool) {
// scclCHECK(scclShmClose(sharedProxyState->proxyOps[i].handle));
// }
// if(sharedProxyState->sharedDevMems[i]) {
// if(!scclCuMemEnable()) {
// CUDACHECK(cudaIpcCloseMemHandle(sharedProxyState->sharedDevMems[i]));
// }
// }
// int type = scclProxyMsgClose;
// if(*comm->abortFlag == 0)
// scclCHECK(scclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
// scclCHECK(scclSocketClose(sharedProxyState->peerSocks + i));
// }
// }
// }
// }
// }
// return scclSuccess;
// }
// scclResult_t scclProxyDestroy(struct scclComm* comm) {
// struct scclProxyState* sharedProxyState = comm->sharedRes->proxyState;
// assert(sharedProxyState->refCount == 0);
// free(sharedProxyState->peerAddresses);
// free(sharedProxyState->peerSocks);
// free(sharedProxyState->proxyOps);
// free(sharedProxyState->sharedDevMems);
// expectedProxyResponseFree(sharedProxyState);
// free(sharedProxyState);
// return scclSuccess;
// }
#pragma once
#include <pthread.h>
#include "socket.h"
#include "ipcsocket.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
typedef net::host::scclSocketAddress scclSocketAddress_t;
typedef net::host::scclSocket scclSocket_t;
#define SCCL_PROXY_MAX_SUBS MAXCHANNELS
#define PROXYARGS_ALLOCATE_SIZE SCCL_MAX_OPS
enum proxyConnectState : uint8_t {
connUninitialized = 0,
connInitialized = 1,
connSharedInitialized = 2,
connSetupDone = 3,
connConnected = 4,
numConnStates = 5
};
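// Typical connection lifecycle, as driven by the (currently commented-out)
// service code above: proxyConnInit() publishes connInitialized, a
// scclProxyMsgSharedInit op moves the connection to connSharedInitialized,
// and completed Setup/Connect ops publish connSetupDone/connConnected with
// __atomic_store_n(..., __ATOMIC_RELEASE).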
// FIFO of expected proxy responses
struct scclExpectedProxyResponse {
void* opId; // Operation ID identifying a specific operation
int respSize; // Size of the response data in bytes
bool done; // Whether this response has been fully processed
void* respBuff; // Buffer holding the received response data
struct scclExpectedProxyResponse* next; // Next expected response, forming a linked list
};
// Per-peer sub-arguments of a proxy operation
struct scclProxySubArgs {
int channelId; // Channel ID
int nsteps; // Number of steps in the operation
ssize_t nbytes; // Number of data bytes
int peer; // Peer ID
int groupSize; // Number of sub-args progressed as one group
uint64_t base; // Base counter
uint64_t posted; // Count of posted steps
uint64_t received; // Count of received steps
uint64_t flushed; // Count of flushed steps
uint64_t transmitted; // Count of transmitted steps
uint64_t done; // Count of completed steps
uint64_t end; // End counter
void* requests[SCCL_STEPS]; // Request pointer for each step
};
// Proxy arguments structure
struct scclProxyArgs {
struct scclProxySubArgs subs[SCCL_PROXY_MAX_SUBS]; // Array of sub-arguments
int nsubs; // Number of sub-arguments
int done; // Completion flag
uint64_t opCount; // Operation count
int sliceSteps; // Steps per slice
int chunkSteps; // Steps per chunk
int chunkSize; // Chunk size
scclDataType_t dtype; // Data type
scclProtocolType_t protocol; // Protocol type
int state; // Current state
char* sharedBuff[SCCL_STEPS]; // Shared buffer pointer per step
int sharedSize[SCCL_STEPS]; // Shared buffer size per step
int idle; // Idle flag
// Element links
struct scclProxyArgs* next; // Next proxy args
struct scclProxyArgs* nextPeer; // Next proxy args for the same peer
struct scclProxyArgs** proxyAppendPtr; // Pointer to the slot where new args are appended
};
struct scclProxyPool {
struct scclProxyPool* next; // Next proxy pool
struct scclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; // Array of proxy argument elements
};
struct scclProxyProgressState {
// Used by the main thread to hand work to the progress thread
// struct scclProxyOpsPool* opsPool;
// scclShmHandle_t handle;
char opsPoolShmSuffix[6]; // Shared-memory name suffix of the ops pool
pthread_t thread; // Progress thread ID
bool stop; // Stop flag used to shut the thread down
// struct scclProxyPeer** localPeers;
// struct scclSharedNetComms* netComms[SCCL_MAX_NETDEVS];
struct scclProxyArgs* active; // Currently active proxy args
struct scclProxyArgs* pool; // Pool of free proxy args
struct scclProxyPool* pools; // List of allocated proxy pools
int nextOps; // Index of the next operation
};
// struct scclProxyOp {
// struct scclProxyConnection* connection;
// int channelId;
// int nsteps;
// ssize_t nbytes;
// struct {
// int root : 30;
// uint32_t connIndex : 2;
// };
// int next;
// uint64_t opCount;
// int sliceSteps;
// int chunkSteps;
// int chunkSize;
// uint8_t /*scclDataType_t*/ dtype;
// uint8_t /*scclDevRedOp_t*/ redOp;
// uint8_t /*scclPattern_t*/ pattern;
// uint8_t protocol;
// union {
// uint64_t unused;
// // For use by enqueue.cc
// struct scclProxyOp* enqNext;
// };
// };
// struct scclProxyOpsPool {
// struct scclProxyOp ops[MAX_OPS_PER_PEER * SCCL_MAX_LOCAL_RANKS];
// volatile int nextOps;
// volatile int nextOpsEnd;
// volatile int freeOps[SCCL_MAX_LOCAL_RANKS];
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
////////////////////////////////////////////////////////////////////////////////////////////////
// scclResult_t scclProxyInit(struct scclComm* comm, scclSocket_t* sock, union scclSocketAddress* peerAddresses);
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
// enum scclProxyOpState {
// scclProxyOpNone,
// scclProxyOpReady,
// scclProxyOpProgress
// };
// enum {
// proxyRecv = 0,
// proxySend = 1
// };
// struct scclProxyArgs;
// typedef scclResult_t (*proxyProgressFunc_t)(struct scclProxyState*, struct scclProxyArgs*);
// static_assert(SCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
// struct scclProxyOp {
// struct scclProxyConnection* connection;
// int channelId;
// int nsteps;
// ssize_t nbytes;
// struct {
// int root : 30;
// uint32_t connIndex : 2;
// };
// int next;
// uint64_t opCount;
// int sliceSteps;
// int chunkSteps;
// int chunkSize;
// uint8_t /*scclDataType_t*/ dtype;
// uint8_t /*scclDevRedOp_t*/ redOp;
// uint8_t /*scclPattern_t*/ pattern;
// uint8_t protocol;
// union {
// uint64_t unused;
// // For use by enqueue.cc
// struct scclProxyOp* enqNext;
// };
// };
// static_assert(sizeof(struct scclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
// #define SCCL_MAX_NETDEVS 128
// // ProxyOps are used to communicate between main thread and service thread
// // Make sure we have enough to store two full rounds of operations on all channels.
// // Otherwise we'd be unable to post half of them to free new elements.
// #define MAX_OPS_PER_PEER (2 * MAXCHANNELS * SCCL_MAX_WORK_ELEMENTS_P2P)
// #define SCCL_MAX_LOCAL_RANKS 64
// struct scclProxyOpsPool {
// struct scclProxyOp ops[MAX_OPS_PER_PEER * SCCL_MAX_LOCAL_RANKS];
// volatile int nextOps;
// volatile int nextOpsEnd;
// volatile int freeOps[SCCL_MAX_LOCAL_RANKS];
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
// struct scclProxyOps {
// scclProxyOpsPool* pool;
// scclShmHandle_t handle;
// int count;
// int freeOp;
// int nextOps;
// int nextOpsEnd;
// };
// struct scclProxySharedP2p {
// int refcount;
// size_t size;
// char* cudaBuff;
// char* hostBuff;
// // CUDA IPC
// scclIpcDesc ipcDesc;
// struct scclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
// };
// struct scclProxyPeer {
// struct scclProxySharedP2p send;
// struct scclProxySharedP2p recv;
// };
// struct scclSharedNetComms {
// void* sendComm[MAXCHANNELS];
// void* recvComm[MAXCHANNELS];
// int sendRefCount[MAXCHANNELS];
// int recvRefCount[MAXCHANNELS];
// };
// struct scclProxyPool;
// struct scclProxyProgressState {
// // Used by main threads to send work to progress thread
// struct scclProxyOpsPool* opsPool;
// scclShmHandle_t handle;
// char opsPoolShmSuffix[6];
// pthread_t thread;
// bool stop;
// struct scclProxyPeer** localPeers;
// struct scclSharedNetComms* netComms[SCCL_MAX_NETDEVS];
// struct scclProxyArgs* active;
// struct scclProxyArgs* pool;
// struct scclProxyPool* pools;
// int nextOps;
// };
// struct scclProxyAsyncOp {
// int type;
// struct scclProxyConnection* connection;
// int reqSize, respSize;
// char *reqBuff, *respBuff;
// void* opId;
// scclProxyAsyncOp* next;
// };
// struct scclProxyLocalPeer {
// struct scclSocket sock;
// int tpRank;
// int tpLocalRank;
// scclProxyAsyncOp* asyncOps;
// int asyncOpCounter;
// };
// struct scclProxyState {
// int refCount;
// int tpRank;
// int tpnRanks;
// int tpLocalnRanks;
// int cudaDev;
// int p2pnChannels;
// int p2pChunkSize;
// int nChannels;
// int buffSizes[SCCL_NUM_PROTOCOLS];
// bool allocP2pNetLLBuffers;
// bool dmaBufSupport;
// scclNet_t* scclNet;
// scclCollNet_t* scclCollNet;
// volatile uint32_t* abortFlag;
// // Service thread
// pthread_t thread;
// struct scclSocket* listenSock;
// int stop;
// CUcontext cudaCtx;
// // Used by main thread
// union scclSocketAddress* peerAddresses;
// struct scclSocket* peerSocks;
// struct scclProxyOps* proxyOps;
// void** sharedDevMems;
// struct scclIpcSocket peerIpcSock; // cuMEM API support (UDS)
// // Progress thread
// struct scclProxyProgressState progressState;
// // Queue of expected responses from the proxy
// struct scclExpectedProxyResponse* expectedResponses;
// };
// enum proxyConnectState {
// connUninitialized = 0,
// connInitialized = 1,
// connSharedInitialized = 2,
// connSetupDone = 3,
// connConnected = 4,
// numConnStates = 5
// };
// struct scclProxyConnection {
// int send, transport, shared;
// int tpLocalRank, sameProcess;
// struct scclSocket* sock;
// struct scclTransportComm* tcomm;
// struct scclProxyArgs* proxyAppend;
// struct scclProxyArgs** proxyAppendPtr;
// void* transportResources;
// proxyConnectState state;
// struct scclCollNetSharedRes* collNet;
// };
// typedef scclResult_t (*threadFunc_t)(struct scclProxyArgs*);
// enum proxyMode {
// proxyRing = 0,
// proxyFrom = 1,
// proxyTo = 2
// };
// scclResult_t scclProxySaveOp(struct scclComm* comm, struct scclProxyOp* proxyOp, bool* justInquire);
// scclResult_t scclProxyComputeP2p(struct scclInfo* info, struct scclProxyOp* proxyOp);
// scclResult_t scclProxyStart(struct scclComm* comm);
// scclResult_t scclProxyCreate(struct scclComm* comm);
// scclResult_t scclProxyConnect(struct scclComm* comm, int transport, int send, int proxyRank, struct scclProxyConnector* proxyConn);
// enum scclProxyMsgType {
// scclProxyMsgInit = 1,
// scclProxyMsgSharedInit = 2,
// scclProxyMsgSetup = 3,
// scclProxyMsgConnect = 4,
// scclProxyMsgStart = 5,
// scclProxyMsgClose = 6,
// scclProxyMsgAbort = 7,
// scclProxyMsgStop = 8,
// scclProxyMsgConvertFd = 9, // cuMem API support (UDS)
// };
// // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// // Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// // scclPollProxyResponse(), supplying the same opId to confirm the operation has completed
// scclResult_t scclProxyCallAsync(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// // This function will internally call scclProxyCallAsync() and spin until scclPollProxyResponse() confirms the result is received
// scclResult_t
// scclProxyCallBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
// scclResult_t scclPollProxyResponse(struct scclComm* comm, struct scclProxyConnector* proxyConn, void* respBuff, void* opId);
// scclResult_t scclProxyClientConvertFdBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int fd, int* convertedFd);
// scclResult_t scclProxyStop(struct scclComm* comm);
// scclResult_t scclProxyShmUnlink(struct scclComm* comm);
// scclResult_t scclProxyDestroy(struct scclComm* comm);
// scclResult_t mscclSaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex);
#pragma once
#include <string.h>
#include "base.h"
#include "archinfo.h"
namespace sccl {
namespace hardware {
namespace topology {
static pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; // static: this header is included from multiple translation units
static bool initialized = false;
// scclResult_t scclTopoInit() {}
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <string.h>
#include "topo_utils.h"
// #include "net.h"
// #include "xml.h"
// #include "net.h"
namespace sccl {
namespace hardware {
namespace topology {
scclResult_t int64ToBusId(int64_t id, char* busId) {
sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
return scclSuccess;
}
scclResult_t busIdToInt64(const char* busId, int64_t* id) {
char hexStr[17]; // Longest possible int64 hex string + null terminator.
int hexOffset = 0;
for(int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
char c = busId[i];
if(c == '.' || c == ':')
continue;
if((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
hexStr[hexOffset++] = busId[i];
} else
break;
}
hexStr[hexOffset] = '\0';
*id = strtoll(hexStr, NULL, 16); // strtoll: the target is 64-bit even on 32-bit platforms
return scclSuccess;
}
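// Round-trip sketch (hypothetical helper, not part of this file): the two
// conversions act as inverses for well-formed bus IDs.
// static void busIdRoundTripExample() {
//   char busId[32];
//   int64_t id = 0;
//   (void)busIdToInt64("0000:03:00.0", &id); // id == 0x3000
//   (void)int64ToBusId(id, busId);           // busId == "0000:03:00.0"
// }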
scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
char* str = path + offset;
// Remove trailing "/"
if(*str == '/')
str--;
// Find next /
while(*str != '/')
str--;
str++;
int64_t numid;
SCCLCHECK(busIdToInt64(str, &numid));
// Ignore subdevice because those should use the same PCI link so we want to merge nodes.
numid -= numid & 0xf;
*id = numid;
return scclSuccess;
}
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <string.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
// Hardware topology node types
typedef enum topoNodeType {
GPU = 0, // Graphics processing unit
PCI = 1, // PCI device
NVS = 2, // NVSwitch
CPU = 3, // CPU, effectively a NUMA domain
NIC = 4, // Network interface controller
NET = 5 // Network
} topoNodeType_t;
// Path types between two topology nodes
enum topoPathType {
PATH_LOC = 0, // Local (same node)
PATH_NVL = 1, // Connected through NVLink
PATH_NVB = 2, // Connected through NVLink via an intermediate GPU
PATH_PIX = 3, // Connected through at most one PCIe bridge
PATH_PXB = 4, // Connected through multiple PCIe bridges (without crossing the PCIe host bridge)
PATH_PXN = 5, // GPU to NIC, going through an intermediate GPU
PATH_PHB = 6, // Connected through PCIe and a PCIe host bridge
PATH_SYS = 7, // Connected through PCIe and the SMP interconnect between NUMA nodes
PATH_NET = 8, // Connected through the network
PATH_DIS = 9 // Disconnected
};
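// Example classifications implied by the definitions above: two GPUs under
// the same PCIe switch are PATH_PIX, GPUs on different NUMA nodes are
// PATH_SYS, and a direct NVLink hop is PATH_NVL.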
////////////////////////////////////////////////////////////////////////////////////////////////
// Convert a 64-bit integer to a bus-ID string
scclResult_t int64ToBusId(int64_t id, char* busId);
// Convert a bus-ID string to a 64-bit integer
scclResult_t busIdToInt64(const char* busId, int64_t* id);
// Convert a PCI path to a 64-bit integer, given a path offset and a minimum offset
scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id);
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <stdint.h>
#include <string.h>
#include "debug.h"
#include "check.h"
#include "param.h"
#include "alloc.h"
#include "utils.h"
/*
External environment variables consumed here:
src/debug.h:
SCCL_DEBUG_LEVEL, SCCL_DEBUG_POS
*/
namespace sccl {
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8
typedef enum : uint8_t {
scclInt8 = 0,
scclChar = 0,
scclUint8 = 1,
scclInt32 = 2,
scclInt = 2,
scclUint32 = 3,
scclInt64 = 4,
scclUint64 = 5,
scclFloat16 = 6,
scclHalf = 6,
scclFloat32 = 7,
scclFloat = 7,
scclFloat64 = 8,
scclDouble = 8,
scclBfloat16 = 9,
} scclDataType_t;
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
typedef enum : uint8_t {
SCCL_PROTO_LL = 0,
SCCL_PROTO_LL128 = 1,
SCCL_PROTO_SIMPLE = 2
} scclProtocolType_t;
// Unique ID for each process
struct scclUniqueId {
int rank; // Global rank of this node
int nRanks; // Total number of nodes
};
} // namespace sccl
#pragma once
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <pwd.h>
#include <errno.h>
#include "debug.h"
#define SCCL_MAJOR 1
#define SCCL_MINOR 0
#define SCCL_PATCH 0
#define SCCL_SUFFIX ""
#define SCCL_VERSION(X, Y, Z) ((X) * 1000 + (Y) * 100 + (Z))
namespace sccl {
/**
* @brief Result codes returned by SCCL operations
*/
typedef enum {
scclSuccess = 0, /*!< No error */
scclUnhandledHipError = 1, /*!< Unhandled HIP error */
scclSystemError = 2, /*!< Unhandled system error */
scclInternalError = 3, /*!< Internal error - please report to SCCL developers */
scclInvalidArgument = 4, /*!< Invalid argument */
scclInvalidUsage = 5, /*!< Invalid usage */
scclRemoteError = 6, /*!< Remote process exited or there was a network error */
scclInProgress = 7, /*!< SCCL operation in progress */
scclNumResults = 8 /*!< Number of result types */
} scclResult_t;
typedef enum {
testSuccess = 0,
testInternalError = 1,
testHipError = 2,
testScclError = 3,
testTimeout = 4,
testNumResults = 5
} testResult_t;
static const char* scclGetErrorString(scclResult_t code) {
switch(code) {
case scclSuccess: return "success";
case scclUnhandledHipError: return "unhandled hip error (run with SCCL_DEBUG=INFO for details)";
case scclSystemError: return "unhandled system error (run with SCCL_DEBUG=INFO for details)";
case scclInternalError: return "internal error - please report this issue to the SCCL developers";
case scclInvalidArgument: return "invalid argument (run with SCCL_DEBUG=WARN for details)";
case scclInvalidUsage: return "invalid usage (run with SCCL_DEBUG=WARN for details)";
case scclRemoteError: return "remote process exited or there was a network error";
case scclInProgress: return "SCCL operation in progress";
default: return "unknown result code";
}
}
////////////////////////////// SCCL and HIP //////////////////////////////
// Propagate errors up
#define SCCLCHECK(call) \
do { \
scclResult_t RES = call; \
if(RES != scclSuccess && RES != scclInProgress) { \
/* Print the back trace*/ \
INFO(SCCL_LOG_CODEALL, "check fail: %s", scclGetErrorString(RES)); \
return RES; \
} \
} while(0)
#define SCCLCHECKGOTO(call, RES, label) \
do { \
RES = call; \
if(RES != scclSuccess && RES != scclInProgress) { \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while(0)
#define HIPCHECK(cmd) \
do { \
hipError_t err = cmd; \
if(err != hipSuccess) { \
char hostname[1024]; \
gethostname(hostname, 1024); \
printf("%s: Test HIP failure %s:%d '%s'\n", hostname, __FILE__, __LINE__, hipGetErrorString(err)); \
return scclUnhandledHipError; \
} \
} while(0)
#define HIPCHECKGOTO(cmd, RES, label) \
do { \
hipError_t err = cmd; \
if(err != hipSuccess) { \
WARN("HIP failure '%s'", hipGetErrorString(err)); \
RES = scclUnhandledHipError; \
goto label; \
} \
} while(false)
////////////////////////////// Value checks //////////////////////////////
#define EQCHECK(statement, value) \
do { \
if((statement) == value) { \
/* Print the back trace*/ \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, scclSystemError, strerror(errno)); \
return scclSystemError; \
} \
} while(0)
#define EQCHECKGOTO(statement, value, RES, label) \
do { \
if((statement) == value) { \
/* Print the back trace*/ \
RES = scclSystemError; \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while(0)
////////////////////////////// SYS //////////////////////////////
// Check system calls
#define SYSCHECK(call, name) \
do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while(false)
#define SYSCHECKVAL(call, name, retval) \
do { \
SYSCHECKSYNC(call, name, retval); \
if(retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return scclSystemError; \
} \
} while(false)
#define SYSCHECKSYNC(call, name, retval) \
do { \
retval = call; \
if(retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(SCCL_LOG_CODEALL, "Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
#define SYSCHECKGOTO(statement, RES, label) \
do { \
if((statement) == -1) { \
/* Print the back trace*/ \
RES = scclSystemError; \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while(0)
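// Usage sketch (hypothetical function, for illustration only): SCCLCHECK
// propagates errors up the call chain, while the *GOTO variants jump to a
// local cleanup label so resources acquired earlier can be released.
// static scclResult_t exampleReadConfig(const char* path) {
//   scclResult_t res = scclSuccess;
//   int fd = -1;
//   SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
//   SYSCHECKGOTO(fcntl(fd, F_SETFL, O_NONBLOCK), res, cleanup);
// cleanup:
//   close(fd);
//   return res;
// }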
} // namespace sccl
#pragma once
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <stdarg.h>
#include <sys/syscall.h>
namespace sccl {
#define SCCL_NET_MAX_REQUESTS 8
typedef enum : uint8_t {
SCCL_LOG_NONE = 0,
SCCL_LOG_VERSION = 1,
SCCL_LOG_WARN = 2,
SCCL_LOG_INFO = 3,
SCCL_LOG_ABORT = 4
} scclDebugLogLevel_t;
typedef enum : uint8_t {
SCCL_LOG_CODEALL = 0,
SCCL_LOG_NET = 1,
SCCL_LOG_TOPO = 2,
SCCL_LOG_BOOTSTRAP = 3,
SCCL_LOG_TRANSPORT = 4,
SCCL_LOG_GRAPH = 5,
SCCL_LOG_CONNECT = 6,
SCCL_LOG_P2P = 7,
SCCL_LOG_COLLECTIVE = 8,
SCCL_LOG_ALLOC = 9
} scclDebugLogPos_t;
namespace debug {
static char scclLastError[1024] = ""; // Global string holding a human-readable copy of the last error
static char hostname[1024]; // Global string holding the host name
static pthread_mutex_t scclDebugLock = PTHREAD_MUTEX_INITIALIZER; // Mutex guarding debug state in multi-threaded use
static __thread int tid = -1; // Thread-local storage for the current thread ID, -1 when unset
static int pid = -1; // Current process ID, -1 when unset
static FILE* scclDebugFile = stdout; // Debug output stream, defaults to stdout
static int scclDebugLevel = -1; // -1 means not yet configured
// Debug position filter, configured together with the level
static int scclDebugPos = -1; // -1 means not yet configured
/**
* @brief Get the host name, truncated at a delimiter
*
* Fetches the current host name and truncates it at the first occurrence of
* the given delimiter. Falls back to "unknown" if the lookup fails.
*
* @param hostname Buffer receiving the host name
* @param maxlen Maximum buffer length
* @param delim Delimiter at which to truncate the host name
*/
static void getHostName(char* hostname, int maxlen, const char delim) {
if(gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
return;
}
int i = 0;
while((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
i++;
hostname[i] = '\0';
}
////////////////////////////// Debug initialization //////////////////////////////
/**
* @brief Initialize the SCCL debug system
*
* Responsibilities:
* 1. Read and set the debug level from the SCCL_DEBUG_LEVEL environment variable
* 2. Read and set the debug position filter from SCCL_DEBUG_POS
* 3. Cache the current PID and host name
* 4. Create the debug log file according to SCCL_DEBUG_FILE
*
* A mutex keeps initialization thread-safe, and the final level and position are
* published with atomic stores. The defaults are SCCL_LOG_INFO and SCCL_LOG_CODEALL.
*
* @note Thread-safe, but call it early in the program to avoid races
*/
static void scclDebugInit() {
pthread_mutex_lock(&scclDebugLock);
if(scclDebugLevel != -1) {
pthread_mutex_unlock(&scclDebugLock);
return;
}
//// Partition by debug level
int tempScclDebugLevel = -1;
{
const char* sccl_debug = getenv("SCCL_DEBUG_LEVEL");
if(sccl_debug == NULL) {
tempScclDebugLevel = SCCL_LOG_INFO;
} else if(strcasecmp(sccl_debug, "VERSION") == 0) {
tempScclDebugLevel = SCCL_LOG_VERSION;
} else if(strcasecmp(sccl_debug, "WARN") == 0) {
tempScclDebugLevel = SCCL_LOG_WARN;
} else if(strcasecmp(sccl_debug, "INFO") == 0) {
tempScclDebugLevel = SCCL_LOG_INFO;
} else if(strcasecmp(sccl_debug, "ABORT") == 0) {
tempScclDebugLevel = SCCL_LOG_ABORT;
}
}
//// Partition by code position
int tempScclDebugPos = -1;
{
const char* sccl_debug = getenv("SCCL_DEBUG_POS");
if(sccl_debug == NULL) {
tempScclDebugPos = SCCL_LOG_CODEALL;
} else if(strcasecmp(sccl_debug, "NET") == 0) {
tempScclDebugPos = SCCL_LOG_NET;
} else if(strcasecmp(sccl_debug, "TOPO") == 0) {
tempScclDebugPos = SCCL_LOG_TOPO;
} else if(strcasecmp(sccl_debug, "BOOTSTRAP") == 0) {
tempScclDebugPos = SCCL_LOG_BOOTSTRAP;
} else if(strcasecmp(sccl_debug, "TRANSPORT") == 0) {
tempScclDebugPos = SCCL_LOG_TRANSPORT;
} else if(strcasecmp(sccl_debug, "GRAPH") == 0) {
tempScclDebugPos = SCCL_LOG_GRAPH;
} else if(strcasecmp(sccl_debug, "CONNECT") == 0) {
tempScclDebugPos = SCCL_LOG_CONNECT;
} else if(strcasecmp(sccl_debug, "P2P") == 0) {
tempScclDebugPos = SCCL_LOG_P2P;
} else if(strcasecmp(sccl_debug, "COLLECTIVE") == 0) {
tempScclDebugPos = SCCL_LOG_COLLECTIVE;
} else if(strcasecmp(sccl_debug, "ALLOC") == 0) {
tempScclDebugPos = SCCL_LOG_ALLOC;
}
}
// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();
/* Parse and expand the SCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* SCCL_DEBUG level is > VERSION
*/
const char* scclDebugFileEnv = getenv("SCCL_DEBUG_FILE");
if(tempScclDebugLevel > SCCL_LOG_VERSION && scclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX + 1] = "";
char* dfn = debugFn;
while(scclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
if(scclDebugFileEnv[c++] != '%') {
*dfn++ = scclDebugFileEnv[c - 1];
continue;
}
switch(scclDebugFileEnv[c++]) {
case '%': // Double %
*dfn++ = '%';
break;
case 'h': // %h = hostname
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = scclDebugFileEnv[c - 1];
break;
}
}
*dfn = '\0';
if(debugFn[0] != '\0') {
FILE* file = fopen(debugFn, "w");
if(file != nullptr) {
setbuf(file, nullptr); // disable buffering
scclDebugFile = file;
}
}
}
__atomic_store_n(&scclDebugLevel, tempScclDebugLevel, __ATOMIC_RELEASE);
__atomic_store_n(&scclDebugPos, tempScclDebugPos, __ATOMIC_RELEASE);
pthread_mutex_unlock(&scclDebugLock);
}
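// Example invocation (a sketch using the variables parsed above):
//   SCCL_DEBUG_LEVEL=INFO SCCL_DEBUG_POS=NET SCCL_DEBUG_FILE=sccl.%h.%p.log ./app
// writes network-related INFO lines (and all warnings) to
// sccl.<hostname>.<pid>.log, with %h/%p expanded as implemented above.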
////////////////////////////// Printing debug messages //////////////////////////////
template <scclDebugLogLevel_t level>
void scclDebugLog(scclDebugLogPos_t pos_flags, const char* filepath, int line, const char* fmt, ...) {
if(__atomic_load_n(&scclDebugLevel, __ATOMIC_ACQUIRE) == -1)
scclDebugInit();
if constexpr(level == SCCL_LOG_WARN)
scclDebugPos = SCCL_LOG_CODEALL;
// Check the debug level and the position filter
bool isDebugLevelSufficient = (scclDebugLevel >= level);
bool isDebugPositionMatch = (scclDebugPos == SCCL_LOG_CODEALL || scclDebugPos == pos_flags);
// Skip the rest when the level is insufficient or the position does not match
if(!isDebugLevelSufficient || !isDebugPositionMatch) {
return;
}
// Save the last error (WARN) as a human readable string
if constexpr(level == SCCL_LOG_WARN) {
pthread_mutex_lock(&scclDebugLock);
va_list vargs;
va_start(vargs, fmt);
(void)vsnprintf(scclLastError, sizeof(scclLastError), fmt, vargs);
va_end(vargs);
pthread_mutex_unlock(&scclDebugLock);
}
if(tid == -1) {
tid = syscall(SYS_gettid);
}
char buffer[1024];
size_t len = 0;
if constexpr(level == SCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%d SCCL WARN ", hostname, pid, tid, filepath, line);
} else if constexpr(level == SCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%d SCCL INFO ", hostname, pid, tid, filepath, line);
}
if(len) {
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer + len, sizeof(buffer) - len, fmt, vargs);
va_end(vargs);
// vsnprintf returns the would-be length on truncation, so clamp before
// appending the newline to avoid writing past the buffer
if(len > sizeof(buffer) - 1)
len = sizeof(buffer) - 1;
buffer[len++] = '\n';
fwrite(buffer, 1, len, scclDebugFile);
}
}
} // namespace debug
#define WARN(...) debug::scclDebugLog<SCCL_LOG_WARN>(SCCL_LOG_CODEALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) debug::scclDebugLog<SCCL_LOG_INFO>((FLAGS), __FILE__, __LINE__, __VA_ARGS__)
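// Usage sketch: WARN ignores the position filter (it forces
// SCCL_LOG_CODEALL), while INFO is filtered by the flag passed first, e.g.
//   INFO(SCCL_LOG_NET, "listening socket ready, fd %d", fd);
//   WARN("socket creation failed : %d", errno);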
} // namespace sccl
#pragma once
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#define ROUNDUP(x, y) (DIVUP((x), (y)) * (y))
#define ALIGN_POWER(x, y) ((x) > (y) ? ROUNDUP(x, y) : ((y) / ((y) / (x))))
#define ALIGN_SIZE(size, align) size = ((size + (align) - 1) / (align)) * (align);
#if !__CUDA_ARCH__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif
template <typename X, typename Y, typename Z = decltype(X() + Y())>
/**
* @brief Integer division rounded up
* @tparam X Dividend type
* @tparam Y Divisor type
* @tparam Z Result type
* @param x Dividend
* @param y Divisor
* @return (x + y - 1) / y
* @note constexpr, so it can be evaluated at compile time
* @note Callable from both host and device code
*/
__host__ __device__ constexpr Z divUp(X x, Y y) {
return (x + y - 1) / y;
}
template <typename X, typename Y, typename Z = decltype(X() + Y())>
/**
* @brief Round x up to a multiple of y
*
* @tparam X Input value type
* @tparam Y Alignment base type
* @tparam Z Result type
* @param x Value to round
* @param y Alignment base
* @return constexpr Z The value rounded up
*
* @note Callable from both host (__host__) and device (__device__) code
* @note Implemented as (x + y - 1) - (x + y - 1) % y
*/
__host__ __device__ constexpr Z roundUp(X x, Y y) {
return (x + y - 1) - (x + y - 1) % y;
}
// assumes second argument is a power of 2
template <typename X, typename Z = decltype(X() + int())>
/**
* @brief Align a value up to the given boundary
*
* @tparam X Input value type
* @tparam Z Result type
* @param x Value to align
* @param a Alignment boundary (must be a power of two)
* @return constexpr Z The aligned value
*
* @note Callable from both host and device code
*/
__host__ __device__ constexpr Z alignUp(X x, int a) {
return (x + a - 1) & Z(-a);
}
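// Spot checks added for illustration; they compile away since the helpers
// are constexpr:
static_assert(divUp(10, 4) == 3, "divUp rounds the quotient up");
static_assert(roundUp(10, 4) == 12, "roundUp advances to the next multiple");
static_assert(alignUp(10, 8) == 16, "alignUp assumes a power-of-two boundary");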
#pragma once
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include "align.h"
#include "check.h"
namespace sccl {
namespace alloc {
template <typename T>
/**
* @brief Allocate and zero host memory through HIP (debug version)
*
* Allocates host memory with the HIP API and zero-initializes it. Handles the
* HIP stream-capture mode switch and records the call site (file/line) in the
* allocation log.
*
* @tparam T Element type to allocate
* @param[out] ptr Receives the allocated pointer
* @param[in] nelem Number of elements to allocate
* @param[in] filefunc File/function of the call site (for debugging)
* @param[in] line Line number of the call site (for debugging)
* @return scclResult_t Operation result (scclSuccess on success)
*
* @note Logs a warning on failure and an allocation record on success
*/
scclResult_t scclHipHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
HIPCHECKGOTO(hipHostMalloc(ptr, nelem * sizeof(T), hipHostMallocMapped), result, finish);
memset(*ptr, 0, nelem * sizeof(T));
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP host alloc %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
inline scclResult_t scclHipHostFree(void* ptr) {
HIPCHECK(hipHostFree(ptr));
return scclSuccess;
}
template <typename T>
/**
* @brief Allocate and zero memory for a number of elements (debug version)
*
* @tparam T Element type
* @param[out] ptr Receives the allocated pointer
* @param nelem Number of elements to allocate
* @param filefunc File/function of the call site (for debugging)
* @param line Line number of the call site (for debugging)
* @return scclResult_t Operation result, scclSuccess on success
*
* @note Logs the allocation and returns an error on failure
*/
scclResult_t scclCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
void* p = malloc(nelem * sizeof(T));
if(p == NULL) {
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
return scclSystemError;
}
INFO(SCCL_LOG_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p);
memset(p, 0, nelem * sizeof(T));
*ptr = (T*)p;
return scclSuccess;
}
template <typename T>
scclResult_t scclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
if(nelem < oldNelem)
return scclInternalError;
if(nelem == oldNelem)
return scclSuccess;
T* oldp = *ptr;
T* p = (T*)malloc(nelem * sizeof(T));
if(p == NULL) {
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
return scclSystemError;
}
memcpy(p, oldp, oldNelem * sizeof(T));
free(oldp);
memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T));
*ptr = (T*)p;
INFO(SCCL_LOG_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T), *ptr);
return scclSuccess;
}
struct __attribute__((aligned(64))) allocationTracker {
union {
struct {
uint64_t totalAlloc;
uint64_t totalAllocSize;
};
char align[64];
};
};
static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes");
static constexpr int MAX_ALLOC_TRACK_NGPU = 32;
extern struct allocationTracker allocTracker[];
static int scclCuMemEnable() { return 0; }
static inline scclResult_t scclCuMemAlloc(void** ptr, void* handlep, size_t size) {
WARN("CUMEM not supported prior to HIP 11.3");
return scclInternalError;
}
static inline scclResult_t scclCuMemFree(void* ptr) {
WARN("CUMEM not supported prior to HIP 11.3");
return scclInternalError;
}
template <typename T>
/**
* @brief Allocate device memory through HIP (debug version)
*
* @tparam T Element type
* @param filefunc File/function of the call site
* @param line Line number of the call site
* @param[out] ptr Receives the allocated pointer
* @param nelem Number of elements
* @param isFineGrain Whether to use fine-grained memory (defaults to false)
* @return scclResult_t Operation result code
*
* @note Logs the allocation size and pointer address.
* Chooses the fine-grained allocation flag based on the HIP_UNCACHED_MEMORY
* macro and handles the stream-capture mode switch automatically.
*/
scclResult_t scclHipMallocDebug(const char* filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocUncached), result, finish);
#else
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
HIPCHECKGOTO(hipMalloc(ptr, nelem * sizeof(T)), result, finish);
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP malloc %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
template <typename T>
/**
* @brief Allocate and zero device memory through HIP (debug version)
*
* @tparam T Element type
* @param filefunc File/function of the call site (for debugging)
* @param line Line number of the call site (for debugging)
* @param[out] ptr Receives the allocated device pointer
* @param nelem Number of elements
* @param sideStream Optional side stream (avoids disturbing graph capture)
* @param isFineGrain Whether to use fine-grained memory
* @return scclResult_t Operation result code
*
* @note 1. Updates the allocation tracker
* 2. Supports fine-grained allocation (requires HSA support)
* 3. Zeroes the memory asynchronously
* 4. Temporarily switches the stream-capture mode
*/
scclResult_t scclHipCallocDebug(const char* filefunc, int line, T** ptr, size_t nelem, hipStream_t sideStream = nullptr, bool isFineGrain = false) {
scclResult_t result = scclSuccess;
extern bool hsaFineGrainFlag;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
// Need a side stream so as not to interfere with graph capture.
hipStream_t stream = sideStream;
if(stream == nullptr)
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
if(isFineGrain && hsaFineGrainFlag) {
#if defined(HIP_UNCACHED_MEMORY)
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocUncached), result, finish);
#else
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
HIPCHECKGOTO(hipMalloc(ptr, nelem * sizeof(T)), result, finish);
HIPCHECKGOTO(hipMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
HIPCHECKGOTO(hipStreamSynchronize(stream), result, finish);
if(sideStream == nullptr)
HIPCHECKGOTO(hipStreamDestroy(stream), result, finish);
int dev;
HIPCHECK(hipGetDevice(&dev));
if(dev < MAX_ALLOC_TRACK_NGPU) {
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem * sizeof(T), __ATOMIC_RELAXED);
}
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP calloc %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
template <typename T>
/**
* @brief Asynchronously allocate and zero HIP device memory (debug version)
*
* Allocates device memory and zeroes it asynchronously on the given stream,
* with an optional fine-grained allocation mode. Also updates the allocation
* tracker and records the call site.
*
* @tparam T Element type
* @param filefunc File/function of the call site (for debugging)
* @param line Line number of the call site (for debugging)
* @param[out] ptr Receives the allocated pointer
* @param nelem Number of elements to allocate
* @param stream HIP stream for the asynchronous memset
* @param isFineGrain Whether to use fine-grained memory (defaults to false)
* @return scclResult_t Operation result (scclSuccess on success)
*
* @note Updates the global allocation tracker and logs the allocation
* @warning Logs a warning if the allocation fails
*/
scclResult_t scclHipCallocAsyncDebug(const char* filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocUncached), result, finish);
#else
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
HIPCHECKGOTO(hipMalloc(ptr, nelem * sizeof(T)), result, finish);
HIPCHECKGOTO(hipMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
int dev;
HIPCHECK(hipGetDevice(&dev));
if(dev < MAX_ALLOC_TRACK_NGPU) {
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem * sizeof(T), __ATOMIC_RELAXED);
}
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP calloc async %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
template <typename T>
/**
* Asynchronous HIP memory copy
*
* @tparam T Element type
* @param dst Destination address
* @param src Source address
* @param nelem Number of elements to copy
* @param stream HIP stream
* @return scclResult_t Operation result, scclSuccess on success
*
* @note Temporarily switches the stream-capture mode to
* hipStreamCaptureModeRelaxed and restores the original mode afterwards
*/
scclResult_t scclHipMemcpyAsync(T* dst, T* src, size_t nelem, hipStream_t stream) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
HIPCHECKGOTO(hipMemcpyAsync(dst, src, nelem * sizeof(T), hipMemcpyDefault, stream), result, finish);
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
return result;
}
template <typename T>
/**
* @brief Synchronous copy between host and device through HIP
*
* Creates a non-blocking side stream, issues the asynchronous copy on it and
* waits for completion. Uses hipStreamCaptureModeRelaxed to avoid disturbing
* graph capture.
*
* @tparam T Element type
* @param dst Destination address
* @param src Source address
* @param nelem Number of elements to copy
* @return scclResult_t Operation result code
*/
scclResult_t scclHipMemcpy(T* dst, T* src, size_t nelem) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
// Need a side stream so as not to interfere with graph capture.
hipStream_t stream;
HIPCHECKGOTO(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking), result, finish);
SCCLCHECKGOTO(scclHipMemcpyAsync(dst, src, nelem, stream), result, finish);
HIPCHECKGOTO(hipStreamSynchronize(stream), result, finish);
HIPCHECKGOTO(hipStreamDestroy(stream), result, finish);
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
return result;
}
template <typename T>
/**
* @brief Free HIP device memory
*
* Releases a device pointer allocated through HIP. Two paths:
* 1. When cuMem management is enabled, frees through scclCuMemFree
* 2. Otherwise frees directly through hipFree
*
* @tparam T Pointer type
* @param ptr Device pointer to free
* @return scclResult_t Operation result, scclSuccess on success
*
* @note Handles the HIP stream-capture mode switch around the call
*/
scclResult_t scclHipFree(T* ptr) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
INFO(SCCL_LOG_ALLOC, "Hip Free pointer %p", ptr);
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(scclCuMemEnable()) {
SCCLCHECKGOTO(scclCuMemFree((void*)ptr), result, finish);
} else {
HIPCHECKGOTO(hipFree(ptr), result, finish);
}
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
return result;
}
/**
* @brief Allocate page-aligned, zeroed memory (debug version)
*
* Allocates page-aligned memory through posix_memalign, zeroes it and records
* the allocation in the log.
*
* @param[out] ptr Receives the allocated pointer
* @param[in] size Requested size in bytes
* @param[in] filefunc File/function of the call site (for debugging)
* @param[in] line Line number of the call site (for debugging)
* @return scclResult_t Operation status (scclSuccess or scclSystemError)
*/
inline scclResult_t scclIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) {
size_t page_size = sysconf(_SC_PAGESIZE);
void* p;
int size_aligned = ROUNDUP(size, page_size);
int ret = posix_memalign(&p, page_size, size_aligned);
if(ret != 0)
return scclSystemError;
memset(p, 0, size);
*ptr = p;
INFO(SCCL_LOG_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
return scclSuccess;
}
} // namespace alloc
// scclHipHostCalloc: debug host allocation, appending file/line automatically
#define scclHipHostCalloc(...) alloc::scclHipHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// scclCalloc: debug heap allocation, appending file/line automatically
#define scclCalloc(...) alloc::scclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// scclHipMalloc: debug HIP (Heterogeneous-Compute Interface for Portability) device allocation, prepending file/line automatically
#define scclHipMalloc(...) alloc::scclHipMallocDebug(__FILE__, __LINE__, __VA_ARGS__)
// scclHipCalloc: debug zeroed HIP device allocation, prepending file/line automatically
#define scclHipCalloc(...) alloc::scclHipCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
// scclHipCallocAsync: debug asynchronous zeroed HIP allocation, prepending file/line automatically
#define scclHipCallocAsync(...) alloc::scclHipCallocAsyncDebug(__FILE__, __LINE__, __VA_ARGS__)
// scclIbMalloc: debug InfiniBand buffer allocation, appending file/line automatically
#define scclIbMalloc(...) alloc::scclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
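// Usage sketch: call sites pass only their own arguments; the wrappers
// splice in __FILE__/__LINE__ for the allocation log, e.g.
//   int* counters = nullptr;
//   SCCLCHECK(scclCalloc(&counters, 128)); // 512 zeroed bytes, logged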
///////////////////////////////////////// Allocation and free wrappers /////////////////////////////////////////
inline scclResult_t scclHipHostFree(void* ptr) { return alloc::scclHipHostFree(ptr); } // host memory must go back through hipHostFree, not hipFree
template <typename T>
scclResult_t scclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
return alloc::scclRealloc(ptr, oldNelem, nelem);
}
template <typename T>
scclResult_t scclHipMemcpyAsync(T* dst, T* src, size_t nelem, hipStream_t stream) {
return alloc::scclHipMemcpyAsync(dst, src, nelem, stream);
}
template <typename T>
scclResult_t scclHipMemcpy(T* dst, T* src, size_t nelem) {
return alloc::scclHipMemcpy(dst, src, nelem);
}
template <typename T>
scclResult_t scclHipFree(T* ptr) {
return alloc::scclHipFree(ptr);
}
} // namespace sccl
#include "archinfo.h"
#include "check.h"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
namespace sccl {
void GcnArchNameFormat(char* gcnArchName, char* out) {
// this function parses the char array from the device properties into something easier to handle.
// as the gcnArchName attribute looks something like: "gfx900:xnack+:blah-:etc-"
char* gcnArchNameToken = strtok(gcnArchName, ":");
strcpy(out, gcnArchNameToken);
return;
}
void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName) {
if(strcmp(gcnArch, "906") == 0)
*gcnArchName = "gfx906";
else if(strcmp(gcnArch, "908") == 0)
*gcnArchName = "gfx908";
else if(strcmp(gcnArch, "910") == 0)
*gcnArchName = "gfx90a";
else if(strcmp(gcnArch, "940") == 0)
*gcnArchName = "gfx940";
else if(strcmp(gcnArch, "941") == 0)
*gcnArchName = "gfx941";
else if(strcmp(gcnArch, "942") == 0)
*gcnArchName = "gfx942";
else
*gcnArchName = gcnArch;
return;
}
int GetGcnArchName(int deviceId, char* out) {
hipDeviceProp_t devProp;
HIPCHECK(hipGetDeviceProperties(&devProp, deviceId));
GcnArchNameFormat(devProp.gcnArchName, out);
return 0;
}
double GetDeviceWallClockRateInKhz(int deviceId) {
char gcn[256];
GetGcnArchName(deviceId, gcn);
if(strncmp("gfx94", gcn, 5) == 0)
return 1.0E5;
else
return 2.5E4;
}
bool IsArchMatch(char const* arch, char const* target) { return (strncmp(arch, target, strlen(target)) == 0); }
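// Example: IsArchMatch("gfx942", "gfx94") is true (prefix match), while
// IsArchMatch("gfx90a", "gfx94") is false.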
} // namespace sccl
#ifndef ARCHINFO_H_
#define ARCHINFO_H_
#include <string.h>
/*
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
*/
namespace sccl {
// Normalize a GCN arch name into its short form (drops ":"-separated feature suffixes)
void GcnArchNameFormat(char* gcnArchName, char* out);
// Convert a GCN arch number to its arch name
void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName);
// Get the GCN arch name of the given device ID
int GetGcnArchName(int deviceId, char* out);
// Get the wall-clock rate of the given device in kHz
double GetDeviceWallClockRateInKhz(int deviceId);
// Check whether an arch name matches the given target prefix
bool IsArchMatch(char const* arch, char const* target);
} // namespace sccl
#endif // ARCHINFO_H_
#include <algorithm>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <pwd.h>
#include "param.h"
#include "debug.h"
namespace sccl {
/**
* Get the current user's home directory
*
* @return Pointer to the home directory path, or NULL if the lookup fails
*/
const char* userHomeDir() {
struct passwd* pwUser = getpwuid(getuid());
return pwUser == NULL ? NULL : pwUser->pw_dir;
}
/**
* @brief Read environment variables from a file and set them
*
* Parses a config file with one "VAR=VALUE" entry per line, skips lines that
* start with '#', and sets each parsed variable in the current process
* environment.
*
* @param fileName Path of the environment config file
*/
void setEnvFile(const char* fileName) {
FILE* file = fopen(fileName, "r");
if(file == NULL)
return;
char* line = NULL;
char envVar[1024];
char envValue[1024];
size_t n = 0;
ssize_t read;
while((read = getline(&line, &n, file)) != -1) {
if(line[0] == '#')
continue;
if(line[read - 1] == '\n')
line[read - 1] = '\0';
int s = 0; // Env Var Size
while(line[s] != '\0' && line[s] != '=')
s++;
if(line[s] == '\0')
continue;
strncpy(envVar, line, std::min(1023, s));
envVar[std::min(1023, s)] = '\0';
s++;
strncpy(envValue, line + s, 1023);
envValue[1023] = '\0';
setenv(envVar, envValue, 0);
// printf("%s : %s->%s\n", fileName, envVar, envValue);
}
if(line)
free(line);
fclose(file);
}
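// Example file in the format parsed by setEnvFile. SCCL_SET_THREAD_NAME is a
// parameter defined later in this code via SCCL_PARAM; since setenv is called
// with overwrite=0, values already in the process environment take precedence.
//
//   # lines beginning with '#' are ignored
//   SCCL_SET_THREAD_NAME=1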
/**
 * Load the SCCL configuration files.
 *
 * Lookup order:
 * 1. If the environment variable "SCCL_CONF_FILE" is set, load that file.
 * 2. Otherwise, load ".sccl.conf" from the user's home directory.
 * 3. In both cases, also attempt the system-wide "/etc/sccl.conf".
 *
 * Each file found is loaded via setEnvFile().
 */
static void initEnvFunc() {
char confFilePath[1024];
const char* userFile = getenv("SCCL_CONF_FILE");
if(userFile && strlen(userFile) > 0) {
snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
setEnvFile(confFilePath);
} else {
const char* userDir = userHomeDir();
if(userDir) {
snprintf(confFilePath, sizeof(confFilePath), "%s/.sccl.conf", userDir);
setEnvFile(confFilePath);
}
}
snprintf(confFilePath, sizeof(confFilePath), "/etc/sccl.conf");
setEnvFile(confFilePath);
return;
}
/**
 * Initialize environment configuration (thread-safe).
 *
 * Uses pthread_once so that initEnvFunc runs exactly once, even when
 * called concurrently from multiple threads.
 */
void initEnv() {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, initEnvFunc);
return;
}
/**
 * @brief Load an integer parameter from the environment and cache it.
 *
 * Reads an int64 value from the given environment variable and caches it so
 * the variable is parsed only once. If the variable is unset or fails to
 * parse, the default value is used.
 *
 * @param env           Environment variable name
 * @param deftVal       Default value
 * @param uninitialized Sentinel value marking the cache as uninitialized
 * @param cache         Pointer used to cache the parameter value
 *
 * @note Thread-safe: a mutex guards the cache update.
 */
void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&mutex);
if(__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
const char* str = scclGetEnv(env);
int64_t value = deftVal;
if(str && strlen(str) > 0) {
errno = 0;
value = strtoll(str, nullptr, 0);
if(errno) {
value = deftVal;
INFO(SCCL_LOG_CODEALL, "Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
} else {
INFO(SCCL_LOG_TRANSPORT, "%s set by environment to %lld.", env, (long long)value);
}
}
__atomic_store_n(cache, value, __ATOMIC_RELAXED);
}
pthread_mutex_unlock(&mutex);
return;
}
/**
 * Get the value of an environment variable.
 *
 * @param name Environment variable name
 * @return The variable's value, or NULL if not found
 *
 * @note Calls initEnv() first so configuration files are loaded.
 */
const char* scclGetEnv(const char* name) {
initEnv();
return getenv(name);
}
#define SCCL_THREAD_NAMELEN 16
SCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
/**
 * @brief Set the name of the given thread.
 *
 * Uses the GNU extension pthread_setname_np. The name is built from a
 * printf-style format string and truncated to SCCL_THREAD_NAMELEN bytes
 * (including the terminator). Only takes effect when _GNU_SOURCE is defined
 * and scclParamSetThreadName() returns 1.
 *
 * @param thread Thread whose name is set
 * @param fmt    printf-style format string for the name
 * @param ...    Format arguments
 */
void scclSetThreadName(pthread_t thread, const char* fmt, ...) {
// pthread_setname_np is a non-standard GNU extension and
// requires the following feature-test macro.
#ifdef _GNU_SOURCE
// Return immediately unless the thread-naming feature is enabled.
if(scclParamSetThreadName() != 1)
return;
// Build the thread name from the format string and arguments.
char threadName[SCCL_THREAD_NAMELEN];
va_list vargs;
va_start(vargs, fmt);
vsnprintf(threadName, SCCL_THREAD_NAMELEN, fmt, vargs);
va_end(vargs);
// Apply the name with pthread_setname_np.
pthread_setname_np(thread, threadName);
#endif
}
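// Illustrative call (only effective with _GNU_SOURCE and SCCL_SET_THREAD_NAME=1;
// `proxyThread` and `rank` are placeholder names):
//
//   scclSetThreadName(proxyThread, "SCCLProxy-%d", rank);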
} // namespace sccl
#pragma once
#include <pthread.h>
#include <stdint.h>
namespace sccl {
// Returns the current user's home directory path
const char* userHomeDir();
// Loads environment variables from the given file
void setEnvFile(const char* fileName);
// Initializes the environment configuration (thread-safe, runs once)
void initEnv();
// Gets the environment variable with the given name
const char* scclGetEnv(const char* name);
// Loads a parameter; if the environment variable is unset, the default value is used
void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
// Sets a thread's name using a printf-style format string
void scclSetThreadName(pthread_t thread, const char* fmt, ...);
/**
 * Define an SCCL parameter accessor.
 * This macro creates a function that returns the parameter's value.
 * The value is read from an environment variable; if the variable is unset,
 * the default value is used.
 *
 * @param name    Parameter name (suffix of the generated scclParam##name function)
 * @param env     Environment variable name, without the "SCCL_" prefix
 * @param deftVal Default value
 *
 * @return The parameter's value
 *
 * Implementation:
 * - A constexpr sentinel marks the value as uninitialized.
 * - static_assert ensures the default differs from the sentinel.
 * - A static cache variable holds the value, initially the sentinel.
 * - __builtin_expect and __atomic_load_n check whether the cache is still uninitialized.
 * - If so, scclLoadParam loads the value into the cache.
 * - The cached value is returned.
 */
#define SCCL_PARAM(name, env, deftVal) \
int64_t scclParam##name() { \
constexpr int64_t uninitialized = INT64_MIN; \
static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
static int64_t cache = uninitialized; \
if(__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
scclLoadParam("SCCL_" env, deftVal, uninitialized, &cache); \
} \
return cache; \
}
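// Usage sketch: the declaration below would generate int64_t scclParamFooBar(),
// which reads SCCL_FOO_BAR from the environment with a default of 42. "FooBar"
// and "FOO_BAR" are placeholder names for illustration.
//
//   SCCL_PARAM(FooBar, "FOO_BAR", 42);
//   ...
//   if(scclParamFooBar() > 0) { /* feature enabled */ }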
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "utils.h"
// #include "core.h"
// #include "nvmlwrap.h"
#include <dirent.h>
#include <fstream>
#include <stdlib.h>
namespace sccl {
// // Get current Compute Capability
// int scclCudaCompCap() {
// int cudaDev;
// if(cudaGetDevice(&cudaDev) != cudaSuccess)
// return 0;
// int ccMajor, ccMinor;
// if(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess)
// return 0;
// if(cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess)
// return 0;
// return ccMajor * 10 + ccMinor;
// }
// scclResult_t int64ToBusId(int64_t id, char* busId) {
// sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
// return scclSuccess;
// }
// scclResult_t busIdToInt64(const char* busId, int64_t* id) {
// char hexStr[17]; // Longest possible int64 hex string + null terminator.
// int hexOffset = 0;
// for(int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
// char c = busId[i];
// if(c == '.' || c == ':')
// continue;
// if((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
// hexStr[hexOffset++] = busId[i];
// } else
// break;
// }
// hexStr[hexOffset] = '\0';
// *id = strtol(hexStr, NULL, 16);
// return scclSuccess;
// }
// // Convert a logical cudaDev index to the NVML device minor number
// scclResult_t getBusId(int cudaDev, int64_t* busId) {
// // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
// // format. Still need to allocate proper space in case PCI domain goes
// // higher.
// char busIdStr[] = "00000000:00:00.0";
// CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
// NCCLCHECK(busIdToInt64(busIdStr, busId));
// return scclSuccess;
// }
// scclResult_t getHostName(char* hostname, int maxlen, const char delim) {
// if(gethostname(hostname, maxlen) != 0) {
// strncpy(hostname, "unknown", maxlen);
// return scclSystemError;
// }
// int i = 0;
// while((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
// i++;
// hostname[i] = '\0';
// return scclSuccess;
// }
// uint64_t getHash(const char* string, int n) {
// // Based on DJB2a, result = result * 33 ^ char
// uint64_t result = 5381;
// for(int c = 0; c < n; c++) {
// result = ((result << 5) + result) ^ string[c];
// }
// return result;
// }
// /* Generate a hash of the unique identifying string for this host
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
// *
// * This string can be overridden by using the NCCL_HOSTID env var.
// */
// #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
// uint64_t getHostHash(void) {
// char hostHash[1024];
// char* hostId;
// // Fall back is the full hostname if something fails
// (void)getHostName(hostHash, sizeof(hostHash), '\0');
// int offset = strlen(hostHash);
// if((hostId = getenv("NCCL_HOSTID")) != NULL) {
// INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
// strncpy(hostHash, hostId, sizeof(hostHash));
// } else {
// FILE* file = fopen(HOSTID_FILE, "r");
// if(file != NULL) {
// char* p;
// if(fscanf(file, "%ms", &p) == 1) {
// strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
// free(p);
// }
// }
// fclose(file);
// }
// // Make sure the string is terminated
// hostHash[sizeof(hostHash) - 1] = '\0';
// TRACE(NCCL_INIT, "unique hostname '%s'", hostHash);
// return getHash(hostHash, strlen(hostHash));
// }
// /* Generate a hash of the unique identifying string for this process
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $$ $(readlink /proc/self/ns/pid)
// */
// uint64_t getPidHash(void) {
// char pname[1024];
// // Start off with our pid ($$)
// sprintf(pname, "%ld", (long)getpid());
// int plen = strlen(pname);
// int len = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen);
// if(len < 0)
// len = 0;
// pname[plen + len] = '\0';
// TRACE(NCCL_INIT, "unique PID '%s'", pname);
// return getHash(pname, strlen(pname));
// }
// int parseStringList(const char* string, struct netIf* ifList, int maxList) {
// if(!string)
// return 0;
// const char* ptr = string;
// int ifNum = 0;
// int ifC = 0;
// char c;
// do {
// c = *ptr;
// if(c == ':') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = atoi(ptr + 1);
// ifNum++;
// ifC = 0;
// }
// while(c != ',' && c != '\0')
// c = *(++ptr);
// } else if(c == ',' || c == '\0') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = -1;
// ifNum++;
// ifC = 0;
// }
// } else {
// ifList[ifNum].prefix[ifC] = c;
// ifC++;
// }
// ptr++;
// } while(ifNum < maxList && c);
// return ifNum;
// }
// static bool matchIf(const char* string, const char* ref, bool matchExact) {
// // Make sure to include '\0' in the exact case
// int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
// return strncmp(string, ref, matchLen) == 0;
// }
// static bool matchPort(const int port1, const int port2) {
// if(port1 == -1)
// return true;
// if(port2 == -1)
// return true;
// if(port1 == port2)
// return true;
// return false;
// }
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
// // Make an exception for the case where no user list is defined
// if(listSize == 0)
// return true;
// for(int i = 0; i < listSize; i++) {
// if(matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
// return true;
// }
// }
// return false;
// }
// __thread struct scclThreadSignal scclThreadSignalLocalInstance = scclThreadSignalStaticInitializer();
// void* scclMemoryStack::allocateSpilled(struct scclMemoryStack* me, size_t size, size_t align) {
// // `me->hunks` points to the top of the stack non-empty hunks. Hunks above
// // this (reachable via `->above`) are empty.
// struct Hunk* top = me->topFrame.hunk;
// size_t mallocSize = 0;
// // If we have lots of space left in hunk but that wasn't enough then we'll
// // allocate the object unhunked.
// if(me->topFrame.end - me->topFrame.bumper >= 8 << 10)
// goto unhunked;
// // If we have another hunk (which must be empty) waiting above this one and
// // the object fits then use that.
// if(top && top->above) {
// struct Hunk* top1 = top->above;
// uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align - 1) & -uintptr_t(align);
// if(uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
// me->topFrame.hunk = top1;
// me->topFrame.bumper = uobj + size;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// { // If the next hunk we're going to allocate wouldn't be big enough but the
// // Unhunk proxy fits in the current hunk then go allocate as unhunked.
// size_t nextSize = (top ? top->size : 0) + (64 << 10);
// constexpr size_t maxAlign = 64;
// if(nextSize < sizeof(struct Hunk) + maxAlign + size) {
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// if(uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
// goto unhunked;
// }
// // At this point we must need another hunk, either to fit the object
// // itself or its Unhunk proxy.
// mallocSize = nextSize;
// INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// struct Hunk* top1 = (struct Hunk*)malloc(mallocSize);
// if(top1 == nullptr)
// goto malloc_exhausted;
// top1->size = nextSize;
// top1->above = nullptr;
// if(top)
// top->above = top1;
// top = top1;
// me->topFrame.hunk = top;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
// me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
// }
// { // Try to fit object in the new top hunk.
// uintptr_t uobj = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// if(uobj + size <= me->topFrame.end) {
// me->topFrame.bumper = uobj + size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band
// // to keep track of it.
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
// me->topFrame.bumper = uproxy + sizeof(Unhunk);
// proxy->next = me->topFrame.unhunks;
// me->topFrame.unhunks = proxy;
// mallocSize = size;
// proxy->obj = malloc(mallocSize);
// INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// if(proxy->obj == nullptr)
// goto malloc_exhausted;
// return proxy->obj;
// }
// malloc_exhausted:
// WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
// abort();
// }
// void scclMemoryStackDestruct(struct scclMemoryStack* me) {
// // Free unhunks first because both the frames and unhunk proxies lie within the hunks.
// struct scclMemoryStack::Frame* f = &me->topFrame;
// while(f != nullptr) {
// struct scclMemoryStack::Unhunk* u = f->unhunks;
// while(u != nullptr) {
// free(u->obj);
// u = u->next;
// }
// f = f->below;
// }
// // Free hunks
// struct scclMemoryStack::Hunk* h = me->stub.above;
// while(h != nullptr) {
// struct scclMemoryStack::Hunk* h1 = h->above;
// free(h);
// h = h1;
// }
// }
// typedef struct {
// pid_t pid;
// pid_t ppid;
// char pcmdLine[4096];
// char cmdLine[4096];
// } appConfigOptimizeArg_t;
// static bool barrier_Flag;
// int maxGPUs = -1;
// int initInfo() {
// /* get barrier_Flag */
// uint32_t index = 0;
// appConfigOptimizeArg_t args = {0};
// args.pid = getpid();
// args.ppid = getppid();
// std::string cmdLinePath = "/proc/" + std::to_string(args.ppid) + "/cmdline";
// std::ifstream cmdLineFile;
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.pcmdLine, sizeof(args.pcmdLine));
// cmdLineFile.close();
// cmdLinePath = "/proc/" + std::to_string(args.pid) + "/cmdline";
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.cmdLine, sizeof(args.cmdLine));
// cmdLineFile.close();
// if(memmem(args.cmdLine, sizeof(args.cmdLine), "sccl_context_test", strlen("sccl_context_test")) ||
// memmem(args.pcmdLine, sizeof(args.pcmdLine), "sccl_context_test", strlen("sccl_context_test"))) {
// barrier_Flag = true;
// } else {
// barrier_Flag = false;
// }
// INFO(NCCL_INIT, "Init config for sccl_context_test: %d", barrier_Flag);
// /* get maximum number of GPUs in all NUMA nodes */
// if(maxGPUs == -1) {
// int gpuCount[32] = {0}; // Assume MAX_NUMA_NODES=32
// int deviceCount;
// hipGetDeviceCount(&deviceCount);
// // Get numbers of GPUs in all NUMA nodes in system
// for(int i = 1; i <= deviceCount; ++i) {
// char path[256];
// snprintf(path, sizeof(path), "/sys/class/drm/card%d/device/numa_node", i);
// FILE* fp = fopen(path, "r");
// if(fp == NULL) {
// perror("Error opening NUMA node file");
// continue;
// }
// int numaNode;
// if(fscanf(fp, "%d", &numaNode) == 1 && numaNode >= 0 && numaNode < 32) {
// gpuCount[numaNode]++;
// }
// fclose(fp);
// }
// // Find maximum number of GPUs in all NUMA nodes
// for(int i = 0; i < 32; ++i) {
// if(gpuCount[i] > maxGPUs) {
// maxGPUs = gpuCount[i];
// }
// }
// INFO(NCCL_INIT, "Maximum number of GPUs in any NUMA node: %d\n", maxGPUs);
// }
// return 0;
// }
// bool getBarrierFlag() { return barrier_Flag; }
// int getNumaMaxGpus() { return maxGPUs; }
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_UTILS_H_
#define NCCL_UTILS_H_
#include "check.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <new>
namespace sccl {
// int ncclCudaCompCap();
// scclResult_t int64ToBusId(int64_t id, char* busId);
// scclResult_t busIdToInt64(const char* busId, int64_t* id);
// ncclResult_t getBusId(int cudaDev, int64_t* busId);
// ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
// uint64_t getHash(const char* string, int n);
// uint64_t getHostHash();
// uint64_t getPidHash();
// ncclResult_t getRandomData(void* buffer, size_t bytes);
// struct netIf {
// char prefix[64];
// int port;
// };
// int parseStringList(const char* string, struct netIf* ifList, int maxList);
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
// static long log2i(long n) {
// long l = 0;
// while(n >>= 1)
// l++;
// return l;
// }
// inline uint64_t clockNano() {
// struct timespec ts;
// clock_gettime(CLOCK_MONOTONIC, &ts);
// return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
// }
// /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
// * return -1 */
// inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
// ncclResult_t ret = ncclSuccess;
// if(bytes > 0) {
// const size_t one = 1UL;
// FILE* fp = fopen("/dev/urandom", "r");
// if(buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
// ret = ncclSystemError;
// if(fp)
// fclose(fp);
// }
// return ret;
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename Int>
// inline void ncclAtomicRefCountIncrement(Int* refs) {
// __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED);
// }
// template <typename Int>
// inline Int ncclAtomicRefCountDecrement(Int* refs) {
// return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
// }
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
// * granularity of LIFO is not per object, instead frames containing many objects
// * are pushed and popped. Therefor deallocation is extremely cheap since its
// * done at the frame granularity.
// *
// * The initial state of the stack is with one frame, the "nil" frame, which
// * cannot be popped. Therefor objects allocated in the nil frame cannot be
// * deallocated sooner than stack destruction.
// */
// struct ncclMemoryStack;
// void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
// void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
// void ncclMemoryStackPush(struct ncclMemoryStack* me);
// void ncclMemoryStackPop(struct ncclMemoryStack* me);
// template <typename T>
// T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n = 1);
// int initInfo();
// bool getBarrierFlag();
// int getNumaMaxGpus();
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for
// * a pool instance to ever hold objects whose type have differing
// * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
// * a backing `ncclMemoryStack` passed during Alloc(). If memory
// * backing any currently held object is deallocated then it is an error to do
// * anything other than reconstruct it, after which it is a valid empty pool.
// */
// struct ncclMemoryPool;
// // Equivalent to zero-initialization
// void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
// template <typename T>
// T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
// template <typename T>
// void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
// void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
// * field is given via the `next` template argument.
// *
// * Example:
// * struct Foo {
// * struct Foo *next1, *next2; // can be a member of two lists at once
// * };
// * ncclIntruQueue<Foo, &Foo::next1> list1;
// * ncclIntruQueue<Foo, &Foo::next2> list2;
// */
// template <typename T, T* T::* next>
// struct ncclIntruQueue;
// template <typename T, T* T::* next>
// void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x);
// template <typename T, T* T::* next>
// T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* memPool);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
// * and "cond" fields are part of the public interface.
// */
// struct ncclThreadSignal {
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
// // returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();
// void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
// void ncclThreadSignalDestruct(struct ncclThreadSignal* me);
// // A convenience instance per-thread.
// extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc;
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me);
// // Enqueue element. Returns true if queue is not abandoned. Even if queue is
// // abandoned the element enqueued, so the caller needs to make arrangements for
// // the queue to be tended.
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T, next>* me, T* x);
// // Dequeue all elements at a glance. If there aren't any and `waitSome` is
// // true then this call will wait until it can return a non empty list.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T, next>* me, bool waitSome);
// // Dequeue all elements and set queue to abandoned state.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T, next>* me);
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryStack {
// struct Hunk {
// struct Hunk* above; // reverse stack pointer
// size_t size; // size of this allocation (including this header struct)
// };
// struct Unhunk { // proxy header for objects allocated out-of-hunk
// struct Unhunk* next;
// void* obj;
// };
// struct Frame {
// struct Hunk* hunk; // top of non-empty hunks
// uintptr_t bumper, end; // points into top hunk
// struct Unhunk* unhunks;
// struct Frame* below;
// };
// static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
// static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);
// struct Hunk stub;
// struct Frame topFrame;
// };
// inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
// me->stub.above = nullptr;
// me->stub.size = 0;
// me->topFrame.hunk = &me->stub;
// me->topFrame.bumper = 0;
// me->topFrame.end = 0;
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = nullptr;
// }
// inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
// uintptr_t o = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// void* obj;
// if(__builtin_expect(o + size <= me->topFrame.end, true)) {
// me->topFrame.bumper = o + size;
// obj = reinterpret_cast<void*>(o);
// } else {
// obj = allocateSpilled(me, size, align);
// }
// return obj;
// }
// template <typename T>
// inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
// void* obj = ncclMemoryStack::allocate(me, n * sizeof(T), alignof(T));
// memset(obj, 0, n * sizeof(T));
// return (T*)obj;
// }
// inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
// using Frame = ncclMemoryStack::Frame;
// Frame tmp = me->topFrame;
// Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
// *snapshot = tmp; // C++ struct assignment
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = snapshot;
// }
// inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
// ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
// while(un != nullptr) {
// free(un->obj);
// un = un->next;
// }
// me->topFrame = *me->topFrame.below; // C++ struct assignment
// }
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryPool {
// struct Cell {
// Cell* next;
// };
// template <int Size, int Align>
// union CellSized {
// Cell cell;
// alignas(Align) char space[Size];
// };
// struct Cell* head;
// struct Cell* tail; // meaningful only when head != nullptr
// };
// inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; }
// template <typename T>
// inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
// using Cell = ncclMemoryPool::Cell;
// using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
// Cell* cell;
// if(__builtin_expect(me->head != nullptr, true)) {
// cell = me->head;
// me->head = cell->next;
// } else {
// // Use the internal allocate() since it doesn't memset to 0 yet.
// cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
// }
// memset(cell, 0, sizeof(T));
// return reinterpret_cast<T*>(cell);
// }
// template <typename T>
// inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
// using Cell = ncclMemoryPool::Cell;
// Cell* cell = reinterpret_cast<Cell*>(obj);
// cell->next = me->head;
// if(me->head == nullptr)
// me->tail = cell;
// me->head = cell;
// }
// inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
// if(from->head != nullptr) {
// from->tail->next = me->head;
// if(me->head == nullptr)
// me->tail = from->tail;
// me->head = from->head;
// from->head = nullptr;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueue {
// T *head, *tail;
// };
// template <typename T, T* T::* next>
// inline void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me) {
// me->head = nullptr;
// me->tail = nullptr;
// }
// template <typename T, T* T::* next>
// inline bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me) {
// return me->head == nullptr;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me) {
// return me->head;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTail(ncclIntruQueue<T, next>* me) {
// return me->tail;
// }
// template <typename T, T* T::* next>
// inline void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x) {
// x->*next = nullptr;
// (me->head ? me->tail->*next : me->head) = x;
// me->tail = x;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// return ans;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// if(ans != nullptr) {
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// }
// return ans;
// }
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* pool) {
// T* head = me->head;
// me->head = nullptr;
// me->tail = nullptr;
// while(head != nullptr) {
// T* tmp = head->*next;
// ncclMemoryPoolFree(pool, tmp);
// head = tmp;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; }
// inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
// pthread_mutex_init(&me->mutex, nullptr);
// pthread_cond_init(&me->cond, nullptr);
// }
// inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
// pthread_mutex_destroy(&me->mutex);
// pthread_cond_destroy(&me->cond);
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc {
// T* head;
// uintptr_t tail;
// struct ncclThreadSignal* waiting;
// };
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me) {
// me->head = nullptr;
// me->tail = 0x0;
// me->waiting = nullptr;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me) {
// return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T, next>* me, T* x) {
// __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
// T* prev = reinterpret_cast<T*>(utail);
// T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
// __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
// if(utail == 0x1) { // waiting
// __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
// // This lock/unlock is essential to ensure we don't race ahead of the consumer
// // and signal the cond before they begin waiting on it.
// struct ncclThreadSignal* waiting = me->waiting;
// pthread_mutex_lock(&waiting->mutex);
// pthread_mutex_unlock(&waiting->mutex);
// pthread_cond_broadcast(&waiting->cond);
// }
// return utail != 0x2; // not abandoned
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T, next>* me, bool waitSome) {
// T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head == nullptr) {
// if(!waitSome)
// return nullptr;
// uint64_t t0 = clockNano();
// bool sleeping = false;
// do {
// if(clockNano() - t0 >= 10 * 1000) { // spin for first 10us
// struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
// pthread_mutex_lock(&waitSignal->mutex);
// uintptr_t expected = sleeping ? 0x1 : 0x0;
// uintptr_t desired = 0x1;
// me->waiting = waitSignal; // release done by successful compare exchange
// if(__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
// sleeping = true;
// pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
// }
// pthread_mutex_unlock(&waitSignal->mutex);
// }
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// } while(head == nullptr);
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// int spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T, next>* me) {
// uintptr_t expected = 0x0;
// if(__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// return nullptr;
// } else {
// int spins = 0;
// T* head;
// while(true) {
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// static inline long get_now_ns(void) {
// struct timespec time;
// if(clock_gettime(CLOCK_MONOTONIC, &time) != 0) {
// return 0;
// }
// return time.tv_sec * 1000000000L + time.tv_nsec;
// }
// static inline void thread_bind_cpu(int coreid) {
// cpu_set_t cpuset;
// CPU_ZERO(&cpuset);
// CPU_SET(coreid, &cpuset);
// pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// }
} // namespace sccl
#endif