"vscode:/vscode.git/clone" did not exist on "5d9c03890bb79fe81d8ff923b30613e699f4674a"
Commit 571a75b5 authored by lishen

Complete node construction for the whole network, and GPU-to-GPU physical path search

parent 379c4128
@@ -48,8 +48,8 @@ static __thread int tid = -1; // Thread-local
static int pid = -1; // Current process ID; defaults to -1
static FILE* scclDebugFile = stdout; // File pointer for the debug output stream; defaults to standard output (stdout)
-static uint64_t scclDebugMask = SCCL_LOG_TOPO | SCCL_LOG_BOOTSTRAP; // Default debug sub-system mask is INIT and ENV
+static uint64_t scclDebugMask = SCCL_LOG_GRAPH /*| SCCL_LOG_BOOTSTRAP*/; // Default debug sub-system mask is INIT and ENV
static int scclDebugLevel = -1; // Initialized to -1, meaning unset
// Define this variable at the top of the file or at another appropriate location
static int scclDebugPos = -1; // Initialized to -1, meaning unset
@@ -135,10 +135,10 @@ static void scclDebugInit() {
mask = SCCL_LOG_TOPO;
} else if(strcasecmp(subsys, "BOOTSTRAP") == 0) {
mask = SCCL_LOG_BOOTSTRAP;
-} else if(strcasecmp(subsys, "TRANSPORT") == 0) {
-mask = SCCL_LOG_TRANSPORT;
} else if(strcasecmp(subsys, "GRAPH") == 0) {
mask = SCCL_LOG_GRAPH;
+} else if(strcasecmp(subsys, "TRANSPORT") == 0) {
+mask = SCCL_LOG_TRANSPORT;
} else if(strcasecmp(subsys, "CONNECT") == 0) {
mask = SCCL_LOG_CONNECT;
} else if(strcasecmp(subsys, "P2P") == 0) {
@@ -245,9 +245,9 @@ void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, const ch
char buffer[1024];
size_t len = 0;
if constexpr(level == SCCL_LOG_WARN) {
-len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%s:%d SCCL WARN ", hostname, pid, tid, filepath, filefunc, line);
+len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%d - %s SCCL WARN ", hostname, pid, tid, filepath, line, filefunc);
} else if constexpr(level == SCCL_LOG_INFO) {
-len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%s:%d SCCL INFO ", hostname, pid, tid, filepath, filefunc, line);
+len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%d - %s SCCL INFO ", hostname, pid, tid, filepath, line, filefunc);
}
if(len) {
...
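
The argument reorder above changes the log prefix from a file:function:line layout to file:line - function. With illustrative values (hostname node01, pid 12345, tid 12346, and a hypothetical call site in src/graph/topo.cc), an INFO line would now begin:

node01:12345:12346 src/graph/topo.cc:87 - scclTopoCompute SCCL INFO <message>
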
#pragma once
#include <stdint.h>
#include <cstddef>   // size_t
#include <new>       // placement new used by push_back
#include <stdexcept> // std::overflow_error thrown on overflow
namespace sccl {
// Implements std::span-like behavior: views a raw byte buffer as a typed array.
// Mirrors the std::vector interface, except that ByteSpanVector operates on the
// underlying elements in place, whereas std::vector stores copies.
template <typename T>
class ByteSpanVector {
public:
// Constructor: takes a pointer to the backing storage and its capacity in bytes
ByteSpanVector(void* data, size_t capacity) : data_(reinterpret_cast<T*>(data)), capacity_(capacity / sizeof(T)), size_(0) {}
// Returns a pointer to the underlying data
T* data() const { return data_; }
// Returns the number of elements written so far
size_t size() const { return size_; }
// Returns the maximum number of elements the buffer can hold
size_t capacity() const { return capacity_; }
// Checks whether the container is empty
bool empty() const { return size_ == 0; }
// Checks whether the container is full
bool full() const { return size_ == capacity_; }
// Appends an element at the end
void push_back(const T& value) {
if(size_ < capacity_) {
new(data_ + size_) T(value);
++size_;
} else {
// Out of capacity: throw (growing the buffer would be an alternative)
throw std::overflow_error("ByteSpanVector push_back capacity exceeded");
}
}
// Element access by index: returns a pointer, or nullptr when out of range
T* operator[](size_t index) {
if(index < size_) {
return &(data_[index]);
} else {
return nullptr; // out of range
}
}
const T* operator[](size_t index) const {
if(index < size_) {
return &(data_[index]);
} else {
return nullptr; // out of range
}
}
private:
T* data_;
size_t capacity_;
size_t size_;
};
template <typename T>
class ByteSpanArray {
public:
// Constructor: takes a void* pointer to the data and the total size in bytes
ByteSpanArray(void* data, size_t size) : data_(reinterpret_cast<T*>(data)), size_(size / sizeof(T)) {}
// Returns the number of elements in the view
size_t size() const { return size_; }
// Element access by index: returns a T*, or nullptr when the index is out of range
T* operator[](size_t index) {
if(index < size_) {
return data_ + index;
} else {
return nullptr;
}
}
const T* operator[](size_t index) const {
if(index < size_) {
return data_ + index;
} else {
return nullptr;
}
}
private:
T* data_;
size_t size_;
};
} // namespace sccl
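
A minimal usage sketch of the two views (the header's filename is not shown in the diff, so "byte_span.h" and the Node struct below are illustrative assumptions):

#include <cstdio>
#include "byte_span.h" // hypothetical filename for the header above

struct Node { int id; int type; }; // illustrative element type

int main() {
    alignas(Node) unsigned char storage[8 * sizeof(Node)];
    // In-place typed vector over the raw bytes: capacity is given in bytes.
    sccl::ByteSpanVector<Node> nodes(storage, sizeof(storage));
    nodes.push_back({0, 1});
    nodes.push_back({1, 2});
    // operator[] hands back a pointer into the buffer, so writes mutate in place.
    if(Node* n = nodes[0]) n->type = 3;
    std::printf("size=%zu capacity=%zu first.type=%d\n", nodes.size(), nodes.capacity(), nodes[0]->type);
    // Fixed-length typed view over the same bytes.
    sccl::ByteSpanArray<Node> view(storage, 2 * sizeof(Node));
    std::printf("view[1].id=%d\n", view[1]->id);
    return 0;
}
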
@@ -5,382 +5,11 @@
************************************************************************/
#include "utils.h"
// #include "core.h"
// #include "nvmlwrap.h"
#include <dirent.h>
#include <fstream>
#include <stdlib.h>
namespace sccl {
///
// // Get current Compute Capability
// int scclCudaCompCap() {
// int hipDev;
// if(cudaGetDevice(&hipDev) != cudaSuccess)
// return 0;
// int ccMajor, ccMinor;
// if(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, hipDev) != cudaSuccess)
// return 0;
// if(cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, hipDev) != cudaSuccess)
// return 0;
// return ccMajor * 10 + ccMinor;
// }
// scclResult_t int64ToBusId(int64_t id, char* busId) {
// sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
// return scclSuccess;
// }
// scclResult_t busIdToInt64(const char* busId, int64_t* id) {
// char hexStr[17]; // Longest possible int64 hex string + null terminator.
// int hexOffset = 0;
// for(int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
// char c = busId[i];
// if(c == '.' || c == ':')
// continue;
// if((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
// hexStr[hexOffset++] = busId[i];
// } else
// break;
// }
// hexStr[hexOffset] = '\0';
// *id = strtol(hexStr, NULL, 16);
// return scclSuccess;
// }
// // Convert a logical hipDev index to the NVML device minor number
// scclResult_t getBusId(int hipDev, int64_t* busId) {
// // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
// // format. Still need to allocate proper space in case PCI domain goes
// // higher.
// char busIdStr[] = "00000000:00:00.0";
// CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev));
// NCCLCHECK(busIdToInt64(busIdStr, busId));
// return scclSuccess;
// }
// scclResult_t getHostName(char* hostname, int maxlen, const char delim) {
// if(gethostname(hostname, maxlen) != 0) {
// strncpy(hostname, "unknown", maxlen);
// return scclSystemError;
// }
// int i = 0;
// while((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
// i++;
// hostname[i] = '\0';
// return scclSuccess;
// }
// uint64_t getHash(const char* string, int n) {
// // Based on DJB2a, result = result * 33 ^ char
// uint64_t result = 5381;
// for(int c = 0; c < n; c++) {
// result = ((result << 5) + result) ^ string[c];
// }
// return result;
// }
// /* Generate a hash of the unique identifying string for this host
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
// *
// * This string can be overridden by using the NCCL_HOSTID env var.
// */
// #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
// uint64_t getHostHash(void) {
// char hostHash[1024];
// char* hostId;
// // Fall back is the full hostname if something fails
// (void)getHostName(hostHash, sizeof(hostHash), '\0');
// int offset = strlen(hostHash);
// if((hostId = getenv("NCCL_HOSTID")) != NULL) {
// INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
// strncpy(hostHash, hostId, sizeof(hostHash));
// } else {
// FILE* file = fopen(HOSTID_FILE, "r");
// if(file != NULL) {
// char* p;
// if(fscanf(file, "%ms", &p) == 1) {
// strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
// free(p);
// }
// }
// fclose(file);
// }
// // Make sure the string is terminated
// hostHash[sizeof(hostHash) - 1] = '\0';
// TRACE(NCCL_INIT, "unique hostname '%s'", hostHash);
// return getHash(hostHash, strlen(hostHash));
// }
// /* Generate a hash of the unique identifying string for this process
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $$ $(readlink /proc/self/ns/pid)
// */
// uint64_t getPidHash(void) {
// char pname[1024];
// // Start off with our pid ($$)
// sprintf(pname, "%ld", (long)getpid());
// int plen = strlen(pname);
// int len = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen);
// if(len < 0)
// len = 0;
// pname[plen + len] = '\0';
// TRACE(NCCL_INIT, "unique PID '%s'", pname);
// return getHash(pname, strlen(pname));
// }
// int parseStringList(const char* string, struct netIf* ifList, int maxList) {
// if(!string)
// return 0;
// const char* ptr = string;
// int ifNum = 0;
// int ifC = 0;
// char c;
// do {
// c = *ptr;
// if(c == ':') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = atoi(ptr + 1);
// ifNum++;
// ifC = 0;
// }
// while(c != ',' && c != '\0')
// c = *(++ptr);
// } else if(c == ',' || c == '\0') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = -1;
// ifNum++;
// ifC = 0;
// }
// } else {
// ifList[ifNum].prefix[ifC] = c;
// ifC++;
// }
// ptr++;
// } while(ifNum < maxList && c);
// return ifNum;
// }
// static bool matchIf(const char* string, const char* ref, bool matchExact) {
// // Make sure to include '\0' in the exact case
// int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
// return strncmp(string, ref, matchLen) == 0;
// }
// static bool matchPort(const int port1, const int port2) {
// if(port1 == -1)
// return true;
// if(port2 == -1)
// return true;
// if(port1 == port2)
// return true;
// return false;
// }
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
// // Make an exception for the case where no user list is defined
// if(listSize == 0)
// return true;
// for(int i = 0; i < listSize; i++) {
// if(matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
// return true;
// }
// }
// return false;
// }
// __thread struct scclThreadSignal scclThreadSignalLocalInstance = scclThreadSignalStaticInitializer();
// void* scclMemoryStack::allocateSpilled(struct scclMemoryStack* me, size_t size, size_t align) {
// // `me->hunks` points to the top of the stack non-empty hunks. Hunks above
// // this (reachable via `->above`) are empty.
// struct Hunk* top = me->topFrame.hunk;
// size_t mallocSize = 0;
// // If we have lots of space left in hunk but that wasn't enough then we'll
// // allocate the object unhunked.
// if(me->topFrame.end - me->topFrame.bumper >= 8 << 10)
// goto unhunked;
// // If we have another hunk (which must be empty) waiting above this one and
// // the object fits then use that.
// if(top && top->above) {
// struct Hunk* top1 = top->above;
// uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align - 1) & -uintptr_t(align);
// if(uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
// me->topFrame.hunk = top1;
// me->topFrame.bumper = uobj + size;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// { // If the next hunk we're going to allocate wouldn't be big enough but the
// // Unhunk proxy fits in the current hunk then go allocate as unhunked.
// size_t nextSize = (top ? top->size : 0) + (64 << 10);
// constexpr size_t maxAlign = 64;
// if(nextSize < sizeof(struct Hunk) + maxAlign + size) {
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// if(uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
// goto unhunked;
// }
// // At this point we must need another hunk, either to fit the object
// // itself or its Unhunk proxy.
// mallocSize = nextSize;
// INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// struct Hunk* top1 = (struct Hunk*)malloc(mallocSize);
// if(top1 == nullptr)
// goto malloc_exhausted;
// top1->size = nextSize;
// top1->above = nullptr;
// if(top)
// top->above = top1;
// top = top1;
// me->topFrame.hunk = top;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
// me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
// }
// { // Try to fit object in the new top hunk.
// uintptr_t uobj = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// if(uobj + size <= me->topFrame.end) {
// me->topFrame.bumper = uobj + size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band
// // to keep track of it.
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
// me->topFrame.bumper = uproxy + sizeof(Unhunk);
// proxy->next = me->topFrame.unhunks;
// me->topFrame.unhunks = proxy;
// mallocSize = size;
// proxy->obj = malloc(mallocSize);
// INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// if(proxy->obj == nullptr)
// goto malloc_exhausted;
// return proxy->obj;
// }
// malloc_exhausted:
// WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
// abort();
// }
// void scclMemoryStackDestruct(struct scclMemoryStack* me) {
// // Free unhunks first because both the frames and unhunk proxies lie within the hunks.
// struct scclMemoryStack::Frame* f = &me->topFrame;
// while(f != nullptr) {
// struct scclMemoryStack::Unhunk* u = f->unhunks;
// while(u != nullptr) {
// free(u->obj);
// u = u->next;
// }
// f = f->below;
// }
// // Free hunks
// struct scclMemoryStack::Hunk* h = me->stub.above;
// while(h != nullptr) {
// struct scclMemoryStack::Hunk* h1 = h->above;
// free(h);
// h = h1;
// }
// }
// typedef struct {
// pid_t pid;
// pid_t ppid;
// char pcmdLine[4096];
// char cmdLine[4096];
// } appConfigOptimizeArg_t;
// static bool barrier_Flag;
// int maxGPUs = -1;
// int initInfo() {
// /* get barrier_Flag */
// uint32_t index = 0;
// appConfigOptimizeArg_t args = {0};
// args.pid = getpid();
// args.ppid = getppid();
// std::string cmdLinePath = "/proc/" + std::to_string(args.ppid) + "/cmdline";
// std::ifstream cmdLineFile;
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.pcmdLine, sizeof(args.pcmdLine));
// cmdLineFile.close();
// cmdLinePath = "/proc/" + std::to_string(args.pid) + "/cmdline";
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.cmdLine, sizeof(args.cmdLine));
// cmdLineFile.close();
// if(memmem(args.cmdLine, sizeof(args.cmdLine), "sccl_context_test", strlen("sccl_context_test")) ||
// memmem(args.pcmdLine, sizeof(args.pcmdLine), "sccl_context_test", strlen("sccl_context_test"))) {
// barrier_Flag = true;
// } else {
// barrier_Flag = false;
// }
// INFO(NCCL_INIT, "Init config for sccl_context_test: %d", barrier_Flag);
// /* get maximum number of GPUs in all NUMA nodes */
// if(maxGPUs == -1) {
// int gpuCount[32] = {0}; // Assume MAX_NUMA_NODES=32
// int deviceCount;
// hipGetDeviceCount(&deviceCount);
// // Get numbers of GPUs in all NUMA nodes in system
// for(int i = 1; i <= deviceCount; ++i) {
// char path[256];
// snprintf(path, sizeof(path), "/sys/class/drm/card%d/device/numa_node", i);
// FILE* fp = fopen(path, "r");
// if(fp == NULL) {
// perror("Error opening NUMA node file");
// continue;
// }
// int numaNode;
// if(fscanf(fp, "%d", &numaNode) == 1 && numaNode >= 0 && numaNode < 32) {
// gpuCount[numaNode]++;
// }
// fclose(fp);
// }
// // Find maximum number of GPUs in all NUMA nodes
// for(int i = 0; i < 32; ++i) {
// if(gpuCount[i] > maxGPUs) {
// maxGPUs = gpuCount[i];
// }
// }
// INFO(NCCL_INIT, "Maximum number of GPUs in any NUMA node: %d\n", maxGPUs);
// }
// return 0;
// }
// bool getBarrierFlag() { return barrier_Flag; }
// int getNumaMaxGpus() { return maxGPUs; }
} // namespace sccl
@@ -13,522 +13,5 @@ static inline void thread_bind_cpu(int coreid) {
CPU_SET(coreid, &cpuset);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}
////
// int ncclCudaCompCap();
// scclResult_t int64ToBusId(int64_t id, char* busId);
// scclResult_t busIdToInt64(const char* busId, int64_t* id);
// ncclResult_t getBusId(int hipDev, int64_t* busId);
// ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
// uint64_t getHash(const char* string, int n);
// uint64_t getHostHash();
// uint64_t getPidHash();
// ncclResult_t getRandomData(void* buffer, size_t bytes);
// struct netIf {
// char prefix[64];
// int port;
// };
// int parseStringList(const char* string, struct netIf* ifList, int maxList);
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
// static long log2i(long n) {
// long l = 0;
// while(n >>= 1)
// l++;
// return l;
// }
// inline uint64_t clockNano() {
// struct timespec ts;
// clock_gettime(CLOCK_MONOTONIC, &ts);
// return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
// }
// /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
// * return -1 */
// inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
// ncclResult_t ret = ncclSuccess;
// if(bytes > 0) {
// const size_t one = 1UL;
// FILE* fp = fopen("/dev/urandom", "r");
// if(buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
// ret = ncclSystemError;
// if(fp)
// fclose(fp);
// }
// return ret;
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename Int>
// inline void ncclAtomicRefCountIncrement(Int* refs) {
// __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED);
// }
// template <typename Int>
// inline Int ncclAtomicRefCountDecrement(Int* refs) {
// return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
// }
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
// * granularity of LIFO is not per object, instead frames containing many objects
// * are pushed and popped. Therefor deallocation is extremely cheap since its
// * done at the frame granularity.
// *
// * The initial state of the stack is with one frame, the "nil" frame, which
// * cannot be popped. Therefor objects allocated in the nil frame cannot be
// * deallocated sooner than stack destruction.
// */
// struct ncclMemoryStack;
// void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
// void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
// void ncclMemoryStackPush(struct ncclMemoryStack* me);
// void ncclMemoryStackPop(struct ncclMemoryStack* me);
// template <typename T>
// T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n = 1);
// int initInfo();
// bool getBarrierFlag();
// int getNumaMaxGpus();
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for
// * a pool instance to ever hold objects whose type have differing
// * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
// * a backing `ncclMemoryStack` passed during Alloc(). If memory
// * backing any currently held object is deallocated then it is an error to do
// * anything other than reconstruct it, after which it is a valid empty pool.
// */
// struct ncclMemoryPool;
// // Equivalent to zero-initialization
// void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
// template <typename T>
// T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
// template <typename T>
// void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
// void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
// * field is given via the `next` template argument.
// *
// * Example:
// * struct Foo {
// * struct Foo *next1, *next2; // can be a member of two lists at once
// * };
// * ncclIntruQueue<Foo, &Foo::next1> list1;
// * ncclIntruQueue<Foo, &Foo::next2> list2;
// */
// template <typename T, T* T::* next>
// struct ncclIntruQueue;
// template <typename T, T* T::* next>
// void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x);
// template <typename T, T* T::* next>
// T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* memPool);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
// * and "cond" fields are part of the public interface.
// */
// struct ncclThreadSignal {
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
// // returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();
// void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
// void ncclThreadSignalDestruct(struct ncclThreadSignal* me);
// // A convenience instance per-thread.
// extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc;
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me);
// // Enqueue element. Returns true if queue is not abandoned. Even if queue is
// // abandoned the element enqueued, so the caller needs to make arrangements for
// // the queue to be tended.
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T, next>* me, T* x);
// // Dequeue all elements at a glance. If there aren't any and `waitSome` is
// // true then this call will wait until it can return a non empty list.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T, next>* me, bool waitSome);
// // Dequeue all elements and set queue to abandoned state.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T, next>* me);
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryStack {
// struct Hunk {
// struct Hunk* above; // reverse stack pointer
// size_t size; // size of this allocation (including this header struct)
// };
// struct Unhunk { // proxy header for objects allocated out-of-hunk
// struct Unhunk* next;
// void* obj;
// };
// struct Frame {
// struct Hunk* hunk; // top of non-empty hunks
// uintptr_t bumper, end; // points into top hunk
// struct Unhunk* unhunks;
// struct Frame* below;
// };
// static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
// static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);
// struct Hunk stub;
// struct Frame topFrame;
// };
// inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
// me->stub.above = nullptr;
// me->stub.size = 0;
// me->topFrame.hunk = &me->stub;
// me->topFrame.bumper = 0;
// me->topFrame.end = 0;
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = nullptr;
// }
// inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
// uintptr_t o = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// void* obj;
// if(__builtin_expect(o + size <= me->topFrame.end, true)) {
// me->topFrame.bumper = o + size;
// obj = reinterpret_cast<void*>(o);
// } else {
// obj = allocateSpilled(me, size, align);
// }
// return obj;
// }
// template <typename T>
// inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
// void* obj = ncclMemoryStack::allocate(me, n * sizeof(T), alignof(T));
// memset(obj, 0, n * sizeof(T));
// return (T*)obj;
// }
// inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
// using Frame = ncclMemoryStack::Frame;
// Frame tmp = me->topFrame;
// Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
// *snapshot = tmp; // C++ struct assignment
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = snapshot;
// }
// inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
// ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
// while(un != nullptr) {
// free(un->obj);
// un = un->next;
// }
// me->topFrame = *me->topFrame.below; // C++ struct assignment
// }
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryPool {
// struct Cell {
// Cell* next;
// };
// template <int Size, int Align>
// union CellSized {
// Cell cell;
// alignas(Align) char space[Size];
// };
// struct Cell* head;
// struct Cell* tail; // meaningful only when head != nullptr
// };
// inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; }
// template <typename T>
// inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
// using Cell = ncclMemoryPool::Cell;
// using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
// Cell* cell;
// if(__builtin_expect(me->head != nullptr, true)) {
// cell = me->head;
// me->head = cell->next;
// } else {
// // Use the internal allocate() since it doesn't memset to 0 yet.
// cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
// }
// memset(cell, 0, sizeof(T));
// return reinterpret_cast<T*>(cell);
// }
// template <typename T>
// inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
// using Cell = ncclMemoryPool::Cell;
// Cell* cell = reinterpret_cast<Cell*>(obj);
// cell->next = me->head;
// if(me->head == nullptr)
// me->tail = cell;
// me->head = cell;
// }
// inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
// if(from->head != nullptr) {
// from->tail->next = me->head;
// if(me->head == nullptr)
// me->tail = from->tail;
// me->head = from->head;
// from->head = nullptr;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueue {
// T *head, *tail;
// };
// template <typename T, T* T::* next>
// inline void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me) {
// me->head = nullptr;
// me->tail = nullptr;
// }
// template <typename T, T* T::* next>
// inline bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me) {
// return me->head == nullptr;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me) {
// return me->head;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTail(ncclIntruQueue<T, next>* me) {
// return me->tail;
// }
// template <typename T, T* T::* next>
// inline void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x) {
// x->*next = nullptr;
// (me->head ? me->tail->*next : me->head) = x;
// me->tail = x;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// return ans;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// if(ans != nullptr) {
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// }
// return ans;
// }
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* pool) {
// T* head = me->head;
// me->head = nullptr;
// me->tail = nullptr;
// while(head != nullptr) {
// T* tmp = head->*next;
// ncclMemoryPoolFree(pool, tmp);
// head = tmp;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; }
// inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
// pthread_mutex_init(&me->mutex, nullptr);
// pthread_cond_init(&me->cond, nullptr);
// }
// inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
// pthread_mutex_destroy(&me->mutex);
// pthread_cond_destroy(&me->cond);
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc {
// T* head;
// uintptr_t tail;
// struct ncclThreadSignal* waiting;
// };
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me) {
// me->head = nullptr;
// me->tail = 0x0;
// me->waiting = nullptr;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me) {
// return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T, next>* me, T* x) {
// __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
// T* prev = reinterpret_cast<T*>(utail);
// T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
// __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
// if(utail == 0x1) { // waiting
// __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
// // This lock/unlock is essential to ensure we don't race ahead of the consumer
// // and signal the cond before they begin waiting on it.
// struct ncclThreadSignal* waiting = me->waiting;
// pthread_mutex_lock(&waiting->mutex);
// pthread_mutex_unlock(&waiting->mutex);
// pthread_cond_broadcast(&waiting->cond);
// }
// return utail != 0x2; // not abandoned
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T, next>* me, bool waitSome) {
// T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head == nullptr) {
// if(!waitSome)
// return nullptr;
// uint64_t t0 = clockNano();
// bool sleeping = false;
// do {
// if(clockNano() - t0 >= 10 * 1000) { // spin for first 10us
// struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
// pthread_mutex_lock(&waitSignal->mutex);
// uintptr_t expected = sleeping ? 0x1 : 0x0;
// uintptr_t desired = 0x1;
// me->waiting = waitSignal; // release done by successful compare exchange
// if(__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
// sleeping = true;
// pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
// }
// pthread_mutex_unlock(&waitSignal->mutex);
// }
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// } while(head == nullptr);
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// int spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T, next>* me) {
// uintptr_t expected = 0x0;
// if(__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// return nullptr;
// } else {
// int spins = 0;
// T* head;
// while(true) {
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// static inline long get_now_ns(void) {
// struct timespec time;
// if(clock_gettime(CLOCK_MONOTONIC, &time) != 0) {
// return 0;
// }
// return time.tv_sec * 1000000000L + time.tv_nsec;
// }
// static inline void thread_bind_cpu(int coreid) {
// cpu_set_t cpuset;
// CPU_ZERO(&cpuset);
// CPU_SET(coreid, &cpuset);
// pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// }
} // namespace sccl
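
The only code this header keeps is the thread_bind_cpu helper shown at the top of the hunk. A minimal usage sketch (core id and program scaffolding are illustrative; build with g++ -pthread):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for CPU_ZERO/CPU_SET and pthread_setaffinity_np
#endif
#include <pthread.h>
#include <sched.h>
#include <cstdio>

// Same helper this diff keeps: pin the calling thread to one CPU core.
static inline void thread_bind_cpu(int coreid) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(coreid, &cpuset);
    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}

static void* worker(void*) {
    thread_bind_cpu(1); // illustrative core id
    std::puts("worker pinned to core 1");
    return nullptr;
}

int main() {
    pthread_t t;
    pthread_create(&t, nullptr, worker, nullptr);
    pthread_join(t, nullptr);
    return 0;
}
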