/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_UTILS_H_
#define NCCL_UTILS_H_

#include "check.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <pthread.h>

namespace sccl {

// int ncclCudaCompCap();
// scclResult_t int64ToBusId(int64_t id, char* busId);
// scclResult_t busIdToInt64(const char* busId, int64_t* id);
// ncclResult_t getBusId(int cudaDev, int64_t* busId);
// ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
// uint64_t getHash(const char* string, int n);
// uint64_t getHostHash();
// uint64_t getPidHash();
// ncclResult_t getRandomData(void* buffer, size_t bytes);

// struct netIf {
//   char prefix[64];
//   int port;
// };

// int parseStringList(const char* string, struct netIf* ifList, int maxList);
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

// static long log2i(long n) {
//   long l = 0;
//   while(n >>= 1)
//     l++;
//   return l;
// }

// inline uint64_t clockNano() {
//   struct timespec ts;
//   clock_gettime(CLOCK_MONOTONIC, &ts);
//   return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
// }

// /* Fill `bytes` bytes of `buffer` with random data from /dev/urandom. Returns
//  * ncclSuccess on success, ncclSystemError otherwise. */
// inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
//   ncclResult_t ret = ncclSuccess;
//   if(bytes > 0) {
//     const size_t one = 1UL;
//     FILE* fp = fopen("/dev/urandom", "r");
//     if(buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
//       ret = ncclSystemError;
//     if(fp)
//       fclose(fp);
//   }
//   return ret;
// }

// ////////////////////////////////////////////////////////////////////////////////

// template <typename Int>
// inline void ncclAtomicRefCountIncrement(Int* refs) {
//   __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED);
// }

// template <typename Int>
// inline Int ncclAtomicRefCountDecrement(Int* refs) {
//   return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
// }

// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryStack: Pools memory for fast LIFO-ordered allocation. Note that
//  * the granularity of LIFO is not per object; instead, frames containing many
//  * objects are pushed and popped. Deallocation is therefore extremely cheap,
//  * since it is done at frame granularity.
//  *
//  * The initial state of the stack is a single frame, the "nil" frame, which
//  * cannot be popped. Therefore objects allocated in the nil frame cannot be
//  * deallocated sooner than stack destruction.
//  */
// struct ncclMemoryStack;

// void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
// void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
// void ncclMemoryStackPush(struct ncclMemoryStack* me);
// void ncclMemoryStackPop(struct ncclMemoryStack* me);
// template <typename T>
// T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n = 1);
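// Illustrative usage sketch of the ncclMemoryStack API, in the same spirit as the
// ncclIntruQueue example further below. `Foo` is a hypothetical payload type, not
// part of this header:
//
//   struct Foo { int x; };
//   ncclMemoryStack stack;
//   ncclMemoryStackConstruct(&stack);
//   ncclMemoryStackPush(&stack);                      // open a frame
//   Foo* objs = ncclMemoryStackAlloc<Foo>(&stack, 8); // zero-initialized array of 8
//   ncclMemoryStackPop(&stack);                       // releases everything allocated since the push
//   ncclMemoryStackDestruct(&stack);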
// int initInfo();
// bool getBarrierFlag();
// int getNumaMaxGpus();

// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for
//  * a pool instance to ever hold objects whose types have differing
//  * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
//  * a backing `ncclMemoryStack` passed during Alloc(). If memory
//  * backing any currently held object is deallocated, then it is an error to do
//  * anything other than reconstruct the pool, after which it is a valid empty pool.
//  */
// struct ncclMemoryPool;

// // Equivalent to zero-initialization
// void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
// template <typename T>
// T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
// template <typename T>
// void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
// void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);

// ////////////////////////////////////////////////////////////////////////////////
// /* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
//  * field is given via the `next` template argument.
//  *
//  * Example:
//  *   struct Foo {
//  *     struct Foo *next1, *next2; // can be a member of two lists at once
//  *   };
//  *   ncclIntruQueue<Foo, &Foo::next1> list1;
//  *   ncclIntruQueue<Foo, &Foo::next2> list2;
//  */
// template <typename T, T* T::*next>
// struct ncclIntruQueue;

// template <typename T, T* T::*next>
// void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::*next>
// bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::*next>
// T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::*next>
// void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x);
// template <typename T, T* T::*next>
// T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::*next>
// T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::*next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* memPool);

// ////////////////////////////////////////////////////////////////////////////////
// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
//  * and "cond" fields are part of the public interface.
//  */
// struct ncclThreadSignal {
//   pthread_mutex_t mutex;
//   pthread_cond_t cond;
// };

// // returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();

// void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
// void ncclThreadSignalDestruct(struct ncclThreadSignal* me);

// // A convenience instance per-thread.
// extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;

// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::*next>
// struct ncclIntruQueueMpsc;

// template <typename T, T* T::*next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me);
// template <typename T, T* T::*next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me);
// // Enqueue element. Returns true if the queue is not abandoned. Even if the queue
// // is abandoned the element is still enqueued, so the caller needs to make
// // arrangements for the queue to be tended.
// template <typename T, T* T::*next>
// bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T, next>* me, T* x);
// // Dequeue all elements at a glance. If there aren't any and `waitSome` is
// // true then this call will wait until it can return a non-empty list.
// template <typename T, T* T::*next>
// T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T, next>* me, bool waitSome);
// // Dequeue all elements and set the queue to the abandoned state.
// template <typename T, T* T::*next>
// T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T, next>* me);
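// Illustrative producer/consumer usage sketch of ncclIntruQueueMpsc (hypothetical
// `Work` type with an intrusive `next` pointer; not part of this header):
//
//   struct Work { struct Work* next; int payload; };
//   ncclIntruQueueMpsc<Work, &Work::next> q;
//   ncclIntruQueueMpscConstruct(&q);
//
//   Producers (any number of threads):
//     bool tended = ncclIntruQueueMpscEnqueue(&q, w);
//     // If `tended` is false the queue was abandoned; `w` is still linked in,
//     // so the producer must arrange for the queue to be drained.
//
//   Consumer (a single thread); waitSome=true sleeps on this thread's
//   ncclThreadSignalLocalInstance until at least one element arrives:
//     Work* batch = ncclIntruQueueMpscDequeueAll(&q, /*waitSome=*/true);
//     for(Work* w = batch; w != nullptr;) { Work* nxt = w->next; /* process w */ w = nxt; }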
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryStack {
//   struct Hunk {
//     struct Hunk* above; // reverse stack pointer
//     size_t size;        // size of this allocation (including this header struct)
//   };
//   struct Unhunk { // proxy header for objects allocated out-of-hunk
//     struct Unhunk* next;
//     void* obj;
//   };
//   struct Frame {
//     struct Hunk* hunk;     // top of non-empty hunks
//     uintptr_t bumper, end; // points into top hunk
//     struct Unhunk* unhunks;
//     struct Frame* below;
//   };

//   static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
//   static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);

//   struct Hunk stub;
//   struct Frame topFrame;
// };

// inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
//   me->stub.above = nullptr;
//   me->stub.size = 0;
//   me->topFrame.hunk = &me->stub;
//   me->topFrame.bumper = 0;
//   me->topFrame.end = 0;
//   me->topFrame.unhunks = nullptr;
//   me->topFrame.below = nullptr;
// }

// inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
//   uintptr_t o = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
//   void* obj;
//   if(__builtin_expect(o + size <= me->topFrame.end, true)) {
//     me->topFrame.bumper = o + size;
//     obj = reinterpret_cast<void*>(o);
//   } else {
//     obj = allocateSpilled(me, size, align);
//   }
//   return obj;
// }

// template <typename T>
// inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
//   void* obj = ncclMemoryStack::allocate(me, n * sizeof(T), alignof(T));
//   memset(obj, 0, n * sizeof(T));
//   return (T*)obj;
// }

// inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
//   using Frame = ncclMemoryStack::Frame;
//   Frame tmp = me->topFrame;
//   Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
//   *snapshot = tmp; // C++ struct assignment
//   me->topFrame.unhunks = nullptr;
//   me->topFrame.below = snapshot;
// }

// inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
//   ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
//   while(un != nullptr) {
//     free(un->obj);
//     un = un->next;
//   }
//   me->topFrame = *me->topFrame.below; // C++ struct assignment
// }

// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryPool {
//   struct Cell {
//     Cell* next;
//   };
//   template <size_t Size, size_t Align>
//   union CellSized {
//     Cell cell;
//     alignas(Align) char space[Size];
//   };
//   struct Cell* head;
//   struct Cell* tail; // meaningful only when head != nullptr
// };

// inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; }

// template <typename T>
// inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
//   using Cell = ncclMemoryPool::Cell;
//   using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
//   Cell* cell;
//   if(__builtin_expect(me->head != nullptr, true)) {
//     cell = me->head;
//     me->head = cell->next;
//   } else {
//     // Use the internal allocate() since it doesn't memset to 0 yet.
//     cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
//   }
//   memset(cell, 0, sizeof(T));
//   return reinterpret_cast<T*>(cell);
// }

// template <typename T>
// inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
//   using Cell = ncclMemoryPool::Cell;
//   Cell* cell = reinterpret_cast<Cell*>(obj);
//   cell->next = me->head;
//   if(me->head == nullptr)
//     me->tail = cell;
//   me->head = cell;
// }

// inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
//   if(from->head != nullptr) {
//     from->tail->next = me->head;
//     if(me->head == nullptr)
//       me->tail = from->tail;
//     me->head = from->head;
//     from->head = nullptr;
//   }
// }
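// Illustrative usage sketch of ncclMemoryPool backed by an ncclMemoryStack
// (hypothetical `Conn` type; not part of this header). All objects drawn from one
// pool must share the same (sizeof, alignof) pair:
//
//   struct Conn { int fd; };
//   ncclMemoryStack stack; // backing storage; must outlive the pool's objects
//   ncclMemoryPool pool;
//   ncclMemoryStackConstruct(&stack);
//   ncclMemoryPoolConstruct(&pool);
//   Conn* c = ncclMemoryPoolAlloc<Conn>(&pool, &stack); // zero-initialized
//   ncclMemoryPoolFree(&pool, c);                       // returns c to the free list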
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::*next>
// struct ncclIntruQueue {
//   T *head, *tail;
// };

// template <typename T, T* T::*next>
// inline void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me) {
//   me->head = nullptr;
//   me->tail = nullptr;
// }

// template <typename T, T* T::*next>
// inline bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me) {
//   return me->head == nullptr;
// }

// template <typename T, T* T::*next>
// inline T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me) {
//   return me->head;
// }

// template <typename T, T* T::*next>
// inline T* ncclIntruQueueTail(ncclIntruQueue<T, next>* me) {
//   return me->tail;
// }

// template <typename T, T* T::*next>
// inline void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x) {
//   x->*next = nullptr;
//   (me->head ? me->tail->*next : me->head) = x;
//   me->tail = x;
// }

// template <typename T, T* T::*next>
// inline T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me) {
//   T* ans = me->head;
//   me->head = ans->*next;
//   if(me->head == nullptr)
//     me->tail = nullptr;
//   return ans;
// }

// template <typename T, T* T::*next>
// inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me) {
//   T* ans = me->head;
//   if(ans != nullptr) {
//     me->head = ans->*next;
//     if(me->head == nullptr)
//       me->tail = nullptr;
//   }
//   return ans;
// }

// template <typename T, T* T::*next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* pool) {
//   T* head = me->head;
//   me->head = nullptr;
//   me->tail = nullptr;
//   while(head != nullptr) {
//     T* tmp = head->*next;
//     ncclMemoryPoolFree(pool, head); // free the current node, then advance
//     head = tmp;
//   }
// }
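// Illustrative FIFO usage sketch of ncclIntruQueue, reusing the `Foo` example from
// the declaration comment above (hypothetical type; `fooA`, `fooB` are existing Foo*):
//
//   ncclIntruQueue<Foo, &Foo::next1> q;
//   ncclIntruQueueConstruct(&q);
//   ncclIntruQueueEnqueue(&q, fooA);
//   ncclIntruQueueEnqueue(&q, fooB);
//   Foo* first = ncclIntruQueueDequeue(&q);    // fooA; must not be called on an empty queue
//   Foo* maybe = ncclIntruQueueTryDequeue(&q); // fooB, or nullptr when empty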
// ////////////////////////////////////////////////////////////////////////////////
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; }

// inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
//   pthread_mutex_init(&me->mutex, nullptr);
//   pthread_cond_init(&me->cond, nullptr);
// }

// inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
//   pthread_mutex_destroy(&me->mutex);
//   pthread_cond_destroy(&me->cond);
// }

// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::*next>
// struct ncclIntruQueueMpsc {
//   T* head;
//   uintptr_t tail;
//   struct ncclThreadSignal* waiting;
// };

// template <typename T, T* T::*next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me) {
//   me->head = nullptr;
//   me->tail = 0x0;
//   me->waiting = nullptr;
// }

// template <typename T, T* T::*next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me) {
//   return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
// }

// template <typename T, T* T::*next>
// bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T, next>* me, T* x) {
//   __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
//   uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
//   T* prev = reinterpret_cast<T*>(utail);
//   T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
//   __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
//   if(utail == 0x1) { // waiting
//     __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
//     // This lock/unlock is essential to ensure we don't race ahead of the consumer
//     // and signal the cond before they begin waiting on it.
//     struct ncclThreadSignal* waiting = me->waiting;
//     pthread_mutex_lock(&waiting->mutex);
//     pthread_mutex_unlock(&waiting->mutex);
//     pthread_cond_broadcast(&waiting->cond);
//   }
//   return utail != 0x2; // not abandoned
// }

// template <typename T, T* T::*next>
// T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T, next>* me, bool waitSome) {
//   T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
//   if(head == nullptr) {
//     if(!waitSome)
//       return nullptr;
//     uint64_t t0 = clockNano();
//     bool sleeping = false;
//     do {
//       if(clockNano() - t0 >= 10 * 1000) { // spin for first 10us
//         struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
//         pthread_mutex_lock(&waitSignal->mutex);
//         uintptr_t expected = sleeping ? 0x1 : 0x0;
//         uintptr_t desired = 0x1;
//         me->waiting = waitSignal; // release done by successful compare exchange
//         if(__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
//           sleeping = true;
//           pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
//         }
//         pthread_mutex_unlock(&waitSignal->mutex);
//       }
//       head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
//     } while(head == nullptr);
//   }

//   __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
//   uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
//   T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
//   T* x = head;
//   while(x != tail) {
//     T* x1;
//     int spins = 0;
//     while(true) {
//       x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
//       if(x1 != nullptr)
//         break;
//       if(++spins == 1024) {
//         spins = 1024 - 1;
//         sched_yield();
//       }
//     }
//     x = x1;
//   }
//   return head;
// }

// template <typename T, T* T::*next>
// T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T, next>* me) {
//   uintptr_t expected = 0x0;
//   if(__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
//     return nullptr;
//   } else {
//     int spins = 0;
//     T* head;
//     while(true) {
//       head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
//       if(head != nullptr)
//         break;
//       if(++spins == 1024) {
//         spins = 1024 - 1;
//         sched_yield();
//       }
//     }
//     __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
//     uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
//     T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
//     T* x = head;
//     while(x != tail) {
//       T* x1;
//       spins = 0;
//       while(true) {
//         x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
//         if(x1 != nullptr)
//           break;
//         if(++spins == 1024) {
//           spins = 1024 - 1;
//           sched_yield();
//         }
//       }
//       x = x1;
//     }
//     return head;
//   }
// }

// ////////////////////////////////////////////////////////////////////////////////
// static inline long get_now_ns(void) {
//   struct timespec time;
//   if(clock_gettime(CLOCK_MONOTONIC, &time) != 0) {
//     return 0;
//   }
//   return time.tv_sec * 1000000000L + time.tv_nsec;
// }

// static inline void thread_bind_cpu(int coreid) {
//   cpu_set_t cpuset;
//   CPU_ZERO(&cpuset);
//   CPU_SET(coreid, &cpuset);
//   pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// }

} // namespace sccl

#endif