"vscode:/vscode.git/clone" did not exist on "5d9c03890bb79fe81d8ff923b30613e699f4674a"
Commit 571a75b5 authored by lishen

Complete node construction for the whole network, and GPU-to-GPU physical path search

parent 379c4128
@@ -48,8 +48,8 @@ static __thread int tid = -1; // Thread-local
static int pid = -1; // Current process ID; defaults to -1
static FILE* scclDebugFile = stdout; // File pointer for the debug output stream; defaults to standard output (stdout)
-static uint64_t scclDebugMask = SCCL_LOG_TOPO | SCCL_LOG_BOOTSTRAP; // Default debug sub-system mask is INIT and ENV
+static uint64_t scclDebugMask = SCCL_LOG_GRAPH /*| SCCL_LOG_BOOTSTRAP*/; // Default debug sub-system mask is INIT and ENV
static int scclDebugLevel = -1; // Initialized to -1, meaning unset
// Define this variable at the top of the file or at another appropriate location
static int scclDebugPos = -1; // Initialized to -1, meaning unset
@@ -135,10 +135,10 @@ static void scclDebugInit() {
mask = SCCL_LOG_TOPO;
} else if(strcasecmp(subsys, "BOOTSTRAP") == 0) {
mask = SCCL_LOG_BOOTSTRAP;
-} else if(strcasecmp(subsys, "TRANSPORT") == 0) {
-mask = SCCL_LOG_TRANSPORT;
} else if(strcasecmp(subsys, "GRAPH") == 0) {
mask = SCCL_LOG_GRAPH;
+} else if(strcasecmp(subsys, "TRANSPORT") == 0) {
+mask = SCCL_LOG_TRANSPORT;
} else if(strcasecmp(subsys, "CONNECT") == 0) {
mask = SCCL_LOG_CONNECT;
} else if(strcasecmp(subsys, "P2P") == 0) {
@@ -245,9 +245,9 @@ void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, const ch
char buffer[1024];
size_t len = 0;
if constexpr(level == SCCL_LOG_WARN) {
-len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%s:%d SCCL WARN ", hostname, pid, tid, filepath, filefunc, line);
+len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%d - %s SCCL WARN ", hostname, pid, tid, filepath, line, filefunc);
} else if constexpr(level == SCCL_LOG_INFO) {
-len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%s:%d SCCL INFO ", hostname, pid, tid, filepath, filefunc, line);
+len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%d - %s SCCL INFO ", hostname, pid, tid, filepath, line, filefunc);
}
if(len) {
...
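
The argument reorder above changes the log prefix from a file:function:line layout to file:line - function. With illustrative values (hostname node01, pid 12345, tid 12346, and a hypothetical call site in src/graph/topo.cc), an INFO line would now begin:

node01:12345:12346 src/graph/topo.cc:87 - scclTopoCompute SCCL INFO <message>
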
#pragma once
#include <stdint.h>
#include <cstddef>   // size_t
#include <new>       // placement new used by push_back
#include <stdexcept> // std::overflow_error thrown on overflow
namespace sccl {
// Implements std::span-like behavior: views a raw byte buffer as a typed array.
// Mirrors the std::vector interface, except that ByteSpanVector operates on the
// underlying elements in place, whereas std::vector stores copies.
template <typename T>
class ByteSpanVector {
public:
// Constructor: takes a pointer to the backing storage and its capacity in bytes
ByteSpanVector(void* data, size_t capacity) : data_(reinterpret_cast<T*>(data)), capacity_(capacity / sizeof(T)), size_(0) {}
// Returns a pointer to the underlying data
T* data() const { return data_; }
// Returns the number of elements written so far
size_t size() const { return size_; }
// Returns the maximum number of elements the buffer can hold
size_t capacity() const { return capacity_; }
// Checks whether the container is empty
bool empty() const { return size_ == 0; }
// Checks whether the container is full
bool full() const { return size_ == capacity_; }
// Appends an element at the end
void push_back(const T& value) {
if(size_ < capacity_) {
new(data_ + size_) T(value);
++size_;
} else {
// Out of capacity: throw (growing the buffer would be an alternative)
throw std::overflow_error("ByteSpanVector push_back capacity exceeded");
}
}
// Element access by index: returns a pointer, or nullptr when out of range
T* operator[](size_t index) {
if(index < size_) {
return &(data_[index]);
} else {
return nullptr; // out of range
}
}
const T* operator[](size_t index) const {
if(index < size_) {
return &(data_[index]);
} else {
return nullptr; // out of range
}
}
private:
T* data_;
size_t capacity_;
size_t size_;
};
template <typename T>
class ByteSpanArray {
public:
// Constructor: takes a void* pointer to the data and the total size in bytes
ByteSpanArray(void* data, size_t size) : data_(reinterpret_cast<T*>(data)), size_(size / sizeof(T)) {}
// Returns the number of elements in the view
size_t size() const { return size_; }
// Element access by index: returns a T*, or nullptr when the index is out of range
T* operator[](size_t index) {
if(index < size_) {
return data_ + index;
} else {
return nullptr;
}
}
const T* operator[](size_t index) const {
if(index < size_) {
return data_ + index;
} else {
return nullptr;
}
}
private:
T* data_;
size_t size_;
};
} // namespace sccl
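
A minimal usage sketch of the two views (the header's filename is not shown in the diff, so "byte_span.h" and the Node struct below are illustrative assumptions):

#include <cstdio>
#include "byte_span.h" // hypothetical filename for the header above

struct Node { int id; int type; }; // illustrative element type

int main() {
    alignas(Node) unsigned char storage[8 * sizeof(Node)];
    // In-place typed vector over the raw bytes: capacity is given in bytes.
    sccl::ByteSpanVector<Node> nodes(storage, sizeof(storage));
    nodes.push_back({0, 1});
    nodes.push_back({1, 2});
    // operator[] hands back a pointer into the buffer, so writes mutate in place.
    if(Node* n = nodes[0]) n->type = 3;
    std::printf("size=%zu capacity=%zu first.type=%d\n", nodes.size(), nodes.capacity(), nodes[0]->type);
    // Fixed-length typed view over the same bytes.
    sccl::ByteSpanArray<Node> view(storage, 2 * sizeof(Node));
    std::printf("view[1].id=%d\n", view[1]->id);
    return 0;
}
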
@@ -5,382 +5,11 @@
************************************************************************/
#include "utils.h"
// #include "core.h"
// #include "nvmlwrap.h"
#include <dirent.h>
#include <fstream>
#include <stdlib.h>
namespace sccl {
///
// // Get current Compute Capability
// int scclCudaCompCap() {
// int hipDev;
// if(cudaGetDevice(&hipDev) != cudaSuccess)
// return 0;
// int ccMajor, ccMinor;
// if(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, hipDev) != cudaSuccess)
// return 0;
// if(cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, hipDev) != cudaSuccess)
// return 0;
// return ccMajor * 10 + ccMinor;
// }
// scclResult_t int64ToBusId(int64_t id, char* busId) {
// sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
// return scclSuccess;
// }
// scclResult_t busIdToInt64(const char* busId, int64_t* id) {
// char hexStr[17]; // Longest possible int64 hex string + null terminator.
// int hexOffset = 0;
// for(int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
// char c = busId[i];
// if(c == '.' || c == ':')
// continue;
// if((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
// hexStr[hexOffset++] = busId[i];
// } else
// break;
// }
// hexStr[hexOffset] = '\0';
// *id = strtol(hexStr, NULL, 16);
// return scclSuccess;
// }
// // Convert a logical hipDev index to the NVML device minor number
// scclResult_t getBusId(int hipDev, int64_t* busId) {
// // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
// // format. Still need to allocate proper space in case PCI domain goes
// // higher.
// char busIdStr[] = "00000000:00:00.0";
// CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev));
// NCCLCHECK(busIdToInt64(busIdStr, busId));
// return scclSuccess;
// }
// scclResult_t getHostName(char* hostname, int maxlen, const char delim) {
// if(gethostname(hostname, maxlen) != 0) {
// strncpy(hostname, "unknown", maxlen);
// return scclSystemError;
// }
// int i = 0;
// while((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
// i++;
// hostname[i] = '\0';
// return scclSuccess;
// }
// uint64_t getHash(const char* string, int n) {
// // Based on DJB2a, result = result * 33 ^ char
// uint64_t result = 5381;
// for(int c = 0; c < n; c++) {
// result = ((result << 5) + result) ^ string[c];
// }
// return result;
// }
// /* Generate a hash of the unique identifying string for this host
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
// *
// * This string can be overridden by using the NCCL_HOSTID env var.
// */
// #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
// uint64_t getHostHash(void) {
// char hostHash[1024];
// char* hostId;
// // Fall back is the full hostname if something fails
// (void)getHostName(hostHash, sizeof(hostHash), '\0');
// int offset = strlen(hostHash);
// if((hostId = getenv("NCCL_HOSTID")) != NULL) {
// INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
// strncpy(hostHash, hostId, sizeof(hostHash));
// } else {
// FILE* file = fopen(HOSTID_FILE, "r");
// if(file != NULL) {
// char* p;
// if(fscanf(file, "%ms", &p) == 1) {
// strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
// free(p);
// }
// }
// fclose(file);
// }
// // Make sure the string is terminated
// hostHash[sizeof(hostHash) - 1] = '\0';
// TRACE(NCCL_INIT, "unique hostname '%s'", hostHash);
// return getHash(hostHash, strlen(hostHash));
// }
// /* Generate a hash of the unique identifying string for this process
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $$ $(readlink /proc/self/ns/pid)
// */
// uint64_t getPidHash(void) {
// char pname[1024];
// // Start off with our pid ($$)
// sprintf(pname, "%ld", (long)getpid());
// int plen = strlen(pname);
// int len = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen);
// if(len < 0)
// len = 0;
// pname[plen + len] = '\0';
// TRACE(NCCL_INIT, "unique PID '%s'", pname);
// return getHash(pname, strlen(pname));
// }
// int parseStringList(const char* string, struct netIf* ifList, int maxList) {
// if(!string)
// return 0;
// const char* ptr = string;
// int ifNum = 0;
// int ifC = 0;
// char c;
// do {
// c = *ptr;
// if(c == ':') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = atoi(ptr + 1);
// ifNum++;
// ifC = 0;
// }
// while(c != ',' && c != '\0')
// c = *(++ptr);
// } else if(c == ',' || c == '\0') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = -1;
// ifNum++;
// ifC = 0;
// }
// } else {
// ifList[ifNum].prefix[ifC] = c;
// ifC++;
// }
// ptr++;
// } while(ifNum < maxList && c);
// return ifNum;
// }
// static bool matchIf(const char* string, const char* ref, bool matchExact) {
// // Make sure to include '\0' in the exact case
// int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
// return strncmp(string, ref, matchLen) == 0;
// }
// static bool matchPort(const int port1, const int port2) {
// if(port1 == -1)
// return true;
// if(port2 == -1)
// return true;
// if(port1 == port2)
// return true;
// return false;
// }
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
// // Make an exception for the case where no user list is defined
// if(listSize == 0)
// return true;
// for(int i = 0; i < listSize; i++) {
// if(matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
// return true;
// }
// }
// return false;
// }
// __thread struct scclThreadSignal scclThreadSignalLocalInstance = scclThreadSignalStaticInitializer();
// void* scclMemoryStack::allocateSpilled(struct scclMemoryStack* me, size_t size, size_t align) {
// // `me->hunks` points to the top of the stack non-empty hunks. Hunks above
// // this (reachable via `->above`) are empty.
// struct Hunk* top = me->topFrame.hunk;
// size_t mallocSize = 0;
// // If we have lots of space left in hunk but that wasn't enough then we'll
// // allocate the object unhunked.
// if(me->topFrame.end - me->topFrame.bumper >= 8 << 10)
// goto unhunked;
// // If we have another hunk (which must be empty) waiting above this one and
// // the object fits then use that.
// if(top && top->above) {
// struct Hunk* top1 = top->above;
// uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align - 1) & -uintptr_t(align);
// if(uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
// me->topFrame.hunk = top1;
// me->topFrame.bumper = uobj + size;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// { // If the next hunk we're going to allocate wouldn't be big enough but the
// // Unhunk proxy fits in the current hunk then go allocate as unhunked.
// size_t nextSize = (top ? top->size : 0) + (64 << 10);
// constexpr size_t maxAlign = 64;
// if(nextSize < sizeof(struct Hunk) + maxAlign + size) {
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// if(uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
// goto unhunked;
// }
// // At this point we must need another hunk, either to fit the object
// // itself or its Unhunk proxy.
// mallocSize = nextSize;
// INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// struct Hunk* top1 = (struct Hunk*)malloc(mallocSize);
// if(top1 == nullptr)
// goto malloc_exhausted;
// top1->size = nextSize;
// top1->above = nullptr;
// if(top)
// top->above = top1;
// top = top1;
// me->topFrame.hunk = top;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
// me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
// }
// { // Try to fit object in the new top hunk.
// uintptr_t uobj = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// if(uobj + size <= me->topFrame.end) {
// me->topFrame.bumper = uobj + size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band
// // to keep track of it.
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
// me->topFrame.bumper = uproxy + sizeof(Unhunk);
// proxy->next = me->topFrame.unhunks;
// me->topFrame.unhunks = proxy;
// mallocSize = size;
// proxy->obj = malloc(mallocSize);
// INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// if(proxy->obj == nullptr)
// goto malloc_exhausted;
// return proxy->obj;
// }
// malloc_exhausted:
// WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
// abort();
// }
// void scclMemoryStackDestruct(struct scclMemoryStack* me) {
// // Free unhunks first because both the frames and unhunk proxies lie within the hunks.
// struct scclMemoryStack::Frame* f = &me->topFrame;
// while(f != nullptr) {
// struct scclMemoryStack::Unhunk* u = f->unhunks;
// while(u != nullptr) {
// free(u->obj);
// u = u->next;
// }
// f = f->below;
// }
// // Free hunks
// struct scclMemoryStack::Hunk* h = me->stub.above;
// while(h != nullptr) {
// struct scclMemoryStack::Hunk* h1 = h->above;
// free(h);
// h = h1;
// }
// }
// typedef struct {
// pid_t pid;
// pid_t ppid;
// char pcmdLine[4096];
// char cmdLine[4096];
// } appConfigOptimizeArg_t;
// static bool barrier_Flag;
// int maxGPUs = -1;
// int initInfo() {
// /* get barrier_Flag */
// uint32_t index = 0;
// appConfigOptimizeArg_t args = {0};
// args.pid = getpid();
// args.ppid = getppid();
// std::string cmdLinePath = "/proc/" + std::to_string(args.ppid) + "/cmdline";
// std::ifstream cmdLineFile;
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.pcmdLine, sizeof(args.pcmdLine));
// cmdLineFile.close();
// cmdLinePath = "/proc/" + std::to_string(args.pid) + "/cmdline";
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.cmdLine, sizeof(args.cmdLine));
// cmdLineFile.close();
// if(memmem(args.cmdLine, sizeof(args.cmdLine), "sccl_context_test", strlen("sccl_context_test")) ||
// memmem(args.pcmdLine, sizeof(args.pcmdLine), "sccl_context_test", strlen("sccl_context_test"))) {
// barrier_Flag = true;
// } else {
// barrier_Flag = false;
// }
// INFO(NCCL_INIT, "Init config for sccl_context_test: %d", barrier_Flag);
// /* get maximum number of GPUs in all NUMA nodes */
// if(maxGPUs == -1) {
// int gpuCount[32] = {0}; // Assume MAX_NUMA_NODES=32
// int deviceCount;
// hipGetDeviceCount(&deviceCount);
// // Get numbers of GPUs in all NUMA nodes in system
// for(int i = 1; i <= deviceCount; ++i) {
// char path[256];
// snprintf(path, sizeof(path), "/sys/class/drm/card%d/device/numa_node", i);
// FILE* fp = fopen(path, "r");
// if(fp == NULL) {
// perror("Error opening NUMA node file");
// continue;
// }
// int numaNode;
// if(fscanf(fp, "%d", &numaNode) == 1 && numaNode >= 0 && numaNode < 32) {
// gpuCount[numaNode]++;
// }
// fclose(fp);
// }
// // Find maximum number of GPUs in all NUMA nodes
// for(int i = 0; i < 32; ++i) {
// if(gpuCount[i] > maxGPUs) {
// maxGPUs = gpuCount[i];
// }
// }
// INFO(NCCL_INIT, "Maximum number of GPUs in any NUMA node: %d\n", maxGPUs);
// }
// return 0;
// }
// bool getBarrierFlag() { return barrier_Flag; }
// int getNumaMaxGpus() { return maxGPUs; }
} // namespace sccl
@@ -13,522 +13,5 @@ static inline void thread_bind_cpu(int coreid) {
CPU_SET(coreid, &cpuset);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}
////
// int ncclCudaCompCap();
// scclResult_t int64ToBusId(int64_t id, char* busId);
// scclResult_t busIdToInt64(const char* busId, int64_t* id);
// ncclResult_t getBusId(int hipDev, int64_t* busId);
// ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
// uint64_t getHash(const char* string, int n);
// uint64_t getHostHash();
// uint64_t getPidHash();
// ncclResult_t getRandomData(void* buffer, size_t bytes);
// struct netIf {
// char prefix[64];
// int port;
// };
// int parseStringList(const char* string, struct netIf* ifList, int maxList);
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
// static long log2i(long n) {
// long l = 0;
// while(n >>= 1)
// l++;
// return l;
// }
// inline uint64_t clockNano() {
// struct timespec ts;
// clock_gettime(CLOCK_MONOTONIC, &ts);
// return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
// }
// /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
// * return -1 */
// inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
// ncclResult_t ret = ncclSuccess;
// if(bytes > 0) {
// const size_t one = 1UL;
// FILE* fp = fopen("/dev/urandom", "r");
// if(buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
// ret = ncclSystemError;
// if(fp)
// fclose(fp);
// }
// return ret;
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename Int>
// inline void ncclAtomicRefCountIncrement(Int* refs) {
// __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED);
// }
// template <typename Int>
// inline Int ncclAtomicRefCountDecrement(Int* refs) {
// return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
// }
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
// * granularity of LIFO is not per object, instead frames containing many objects
// * are pushed and popped. Therefor deallocation is extremely cheap since its
// * done at the frame granularity.
// *
// * The initial state of the stack is with one frame, the "nil" frame, which
// * cannot be popped. Therefor objects allocated in the nil frame cannot be
// * deallocated sooner than stack destruction.
// */
// struct ncclMemoryStack;
// void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
// void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
// void ncclMemoryStackPush(struct ncclMemoryStack* me);
// void ncclMemoryStackPop(struct ncclMemoryStack* me);
// template <typename T>
// T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n = 1);
// int initInfo();
// bool getBarrierFlag();
// int getNumaMaxGpus();
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for
// * a pool instance to ever hold objects whose type have differing
// * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
// * a backing `ncclMemoryStack` passed during Alloc(). If memory
// * backing any currently held object is deallocated then it is an error to do
// * anything other than reconstruct it, after which it is a valid empty pool.
// */
// struct ncclMemoryPool;
// // Equivalent to zero-initialization
// void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
// template <typename T>
// T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
// template <typename T>
// void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
// void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
// * field is given via the `next` template argument.
// *
// * Example:
// * struct Foo {
// * struct Foo *next1, *next2; // can be a member of two lists at once
// * };
// * ncclIntruQueue<Foo, &Foo::next1> list1;
// * ncclIntruQueue<Foo, &Foo::next2> list2;
// */
// template <typename T, T* T::* next>
// struct ncclIntruQueue;
// template <typename T, T* T::* next>
// void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x);
// template <typename T, T* T::* next>
// T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* memPool);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
// * and "cond" fields are part of the public interface.
// */
// struct ncclThreadSignal {
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
// // returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();
// void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
// void ncclThreadSignalDestruct(struct ncclThreadSignal* me);
// // A convenience instance per-thread.
// extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc;
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me);
// // Enqueue element. Returns true if queue is not abandoned. Even if queue is
// // abandoned the element enqueued, so the caller needs to make arrangements for
// // the queue to be tended.
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T, next>* me, T* x);
// // Dequeue all elements at a glance. If there aren't any and `waitSome` is
// // true then this call will wait until it can return a non empty list.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T, next>* me, bool waitSome);
// // Dequeue all elements and set queue to abandoned state.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T, next>* me);
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryStack {
// struct Hunk {
// struct Hunk* above; // reverse stack pointer
// size_t size; // size of this allocation (including this header struct)
// };
// struct Unhunk { // proxy header for objects allocated out-of-hunk
// struct Unhunk* next;
// void* obj;
// };
// struct Frame {
// struct Hunk* hunk; // top of non-empty hunks
// uintptr_t bumper, end; // points into top hunk
// struct Unhunk* unhunks;
// struct Frame* below;
// };
// static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
// static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);
// struct Hunk stub;
// struct Frame topFrame;
// };
// inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
// me->stub.above = nullptr;
// me->stub.size = 0;
// me->topFrame.hunk = &me->stub;
// me->topFrame.bumper = 0;
// me->topFrame.end = 0;
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = nullptr;
// }
// inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
// uintptr_t o = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// void* obj;
// if(__builtin_expect(o + size <= me->topFrame.end, true)) {
// me->topFrame.bumper = o + size;
// obj = reinterpret_cast<void*>(o);
// } else {
// obj = allocateSpilled(me, size, align);
// }
// return obj;
// }
// template <typename T>
// inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
// void* obj = ncclMemoryStack::allocate(me, n * sizeof(T), alignof(T));
// memset(obj, 0, n * sizeof(T));
// return (T*)obj;
// }
// inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
// using Frame = ncclMemoryStack::Frame;
// Frame tmp = me->topFrame;
// Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
// *snapshot = tmp; // C++ struct assignment
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = snapshot;
// }
// inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
// ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
// while(un != nullptr) {
// free(un->obj);
// un = un->next;
// }
// me->topFrame = *me->topFrame.below; // C++ struct assignment
// }
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryPool {
// struct Cell {
// Cell* next;
// };
// template <int Size, int Align>
// union CellSized {
// Cell cell;
// alignas(Align) char space[Size];
// };
// struct Cell* head;
// struct Cell* tail; // meaningful only when head != nullptr
// };
// inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; }
// template <typename T>
// inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
// using Cell = ncclMemoryPool::Cell;
// using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
// Cell* cell;
// if(__builtin_expect(me->head != nullptr, true)) {
// cell = me->head;
// me->head = cell->next;
// } else {
// // Use the internal allocate() since it doesn't memset to 0 yet.
// cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
// }
// memset(cell, 0, sizeof(T));
// return reinterpret_cast<T*>(cell);
// }
// template <typename T>
// inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
// using Cell = ncclMemoryPool::Cell;
// Cell* cell = reinterpret_cast<Cell*>(obj);
// cell->next = me->head;
// if(me->head == nullptr)
// me->tail = cell;
// me->head = cell;
// }
// inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
// if(from->head != nullptr) {
// from->tail->next = me->head;
// if(me->head == nullptr)
// me->tail = from->tail;
// me->head = from->head;
// from->head = nullptr;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueue {
// T *head, *tail;
// };
// template <typename T, T* T::* next>
// inline void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me) {
// me->head = nullptr;
// me->tail = nullptr;
// }
// template <typename T, T* T::* next>
// inline bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me) {
// return me->head == nullptr;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me) {
// return me->head;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTail(ncclIntruQueue<T, next>* me) {
// return me->tail;
// }
// template <typename T, T* T::* next>
// inline void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x) {
// x->*next = nullptr;
// (me->head ? me->tail->*next : me->head) = x;
// me->tail = x;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// return ans;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// if(ans != nullptr) {
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// }
// return ans;
// }
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* pool) {
// T* head = me->head;
// me->head = nullptr;
// me->tail = nullptr;
// while(head != nullptr) {
// T* tmp = head->*next;
// ncclMemoryPoolFree(pool, tmp);
// head = tmp;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; }
// inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
// pthread_mutex_init(&me->mutex, nullptr);
// pthread_cond_init(&me->cond, nullptr);
// }
// inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
// pthread_mutex_destroy(&me->mutex);
// pthread_cond_destroy(&me->cond);
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc {
// T* head;
// uintptr_t tail;
// struct ncclThreadSignal* waiting;
// };
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me) {
// me->head = nullptr;
// me->tail = 0x0;
// me->waiting = nullptr;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me) {
// return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T, next>* me, T* x) {
// __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
// T* prev = reinterpret_cast<T*>(utail);
// T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
// __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
// if(utail == 0x1) { // waiting
// __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
// // This lock/unlock is essential to ensure we don't race ahead of the consumer
// // and signal the cond before they begin waiting on it.
// struct ncclThreadSignal* waiting = me->waiting;
// pthread_mutex_lock(&waiting->mutex);
// pthread_mutex_unlock(&waiting->mutex);
// pthread_cond_broadcast(&waiting->cond);
// }
// return utail != 0x2; // not abandoned
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T, next>* me, bool waitSome) {
// T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head == nullptr) {
// if(!waitSome)
// return nullptr;
// uint64_t t0 = clockNano();
// bool sleeping = false;
// do {
// if(clockNano() - t0 >= 10 * 1000) { // spin for first 10us
// struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
// pthread_mutex_lock(&waitSignal->mutex);
// uintptr_t expected = sleeping ? 0x1 : 0x0;
// uintptr_t desired = 0x1;
// me->waiting = waitSignal; // release done by successful compare exchange
// if(__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
// sleeping = true;
// pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
// }
// pthread_mutex_unlock(&waitSignal->mutex);
// }
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// } while(head == nullptr);
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// int spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T, next>* me) {
// uintptr_t expected = 0x0;
// if(__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// return nullptr;
// } else {
// int spins = 0;
// T* head;
// while(true) {
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// static inline long get_now_ns(void) {
// struct timespec time;
// if(clock_gettime(CLOCK_MONOTONIC, &time) != 0) {
// return 0;
// }
// return time.tv_sec * 1000000000L + time.tv_nsec;
// }
// static inline void thread_bind_cpu(int coreid) {
// cpu_set_t cpuset;
// CPU_ZERO(&cpuset);
// CPU_SET(coreid, &cpuset);
// pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// }
} // namespace sccl
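
The only code this header keeps is the thread_bind_cpu helper shown at the top of the hunk. A minimal usage sketch (core id and program scaffolding are illustrative; build with g++ -pthread):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for CPU_ZERO/CPU_SET and pthread_setaffinity_np
#endif
#include <pthread.h>
#include <sched.h>
#include <cstdio>

// Same helper this diff keeps: pin the calling thread to one CPU core.
static inline void thread_bind_cpu(int coreid) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(coreid, &cpuset);
    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}

static void* worker(void*) {
    thread_bind_cpu(1); // illustrative core id
    std::puts("worker pinned to core 1");
    return nullptr;
}

int main() {
    pthread_t t;
    pthread_create(&t, nullptr, worker, nullptr);
    pthread_join(t, nullptr);
    return 0;
}
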