utils.h

/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_UTILS_H_
#define NCCL_UTILS_H_

#include "check.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <new>

namespace sccl {

// int ncclCudaCompCap();

// scclResult_t int64ToBusId(int64_t id, char* busId);
// scclResult_t busIdToInt64(const char* busId, int64_t* id);

// ncclResult_t getBusId(int hipDev, int64_t* busId);

// ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
// uint64_t getHash(const char* string, int n);
// uint64_t getHostHash();
// uint64_t getPidHash();
// ncclResult_t getRandomData(void* buffer, size_t bytes);

// struct netIf {
//     char prefix[64];
//     int port;
// };

// int parseStringList(const char* string, struct netIf* ifList, int maxList);
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

// static long log2i(long n) {
//     long l = 0;
//     while(n >>= 1)
//         l++;
//     return l;
// }

// inline uint64_t clockNano() {
//     struct timespec ts;
//     clock_gettime(CLOCK_MONOTONIC, &ts);
//     return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
// }

// /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
//  * return -1 */
// inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
//     ncclResult_t ret = ncclSuccess;
//     if(bytes > 0) {
//         const size_t one = 1UL;
//         FILE* fp         = fopen("/dev/urandom", "r");
//         if(buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
//             ret = ncclSystemError;
//         if(fp)
//             fclose(fp);
//     }
//     return ret;
// }

// ////////////////////////////////////////////////////////////////////////////////

// template <typename Int>
// inline void ncclAtomicRefCountIncrement(Int* refs) {
//     __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED);
// }

// template <typename Int>
// inline Int ncclAtomicRefCountDecrement(Int* refs) {
//     return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
// }

// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
//  * granularity of LIFO is not per object, instead frames containing many objects
//  * are pushed and popped. Therefor deallocation is extremely cheap since its
//  * done at the frame granularity.
//  *
//  * The initial state of the stack is with one frame, the "nil" frame, which
//  * cannot be popped. Therefor objects allocated in the nil frame cannot be
//  * deallocated sooner than stack destruction.
//  */
// struct ncclMemoryStack;

// void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
// void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
// void ncclMemoryStackPush(struct ncclMemoryStack* me);
// void ncclMemoryStackPop(struct ncclMemoryStack* me);
// template <typename T>
// T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n = 1);
// int initInfo();
// bool getBarrierFlag();
// int getNumaMaxGpus();
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for
//  * a pool instance to ever hold objects whose type have differing
//  * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
//  * a backing `ncclMemoryStack` passed during Alloc(). If memory
//  * backing any currently held object is deallocated then it is an error to do
//  * anything other than reconstruct it, after which it is a valid empty pool.
//  */
// struct ncclMemoryPool;

// // Equivalent to zero-initialization
// void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
// template <typename T>
// T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
// template <typename T>
// void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
// void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);

// ////////////////////////////////////////////////////////////////////////////////
// /* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
//  * field is given via the `next` template argument.
//  *
//  * Example:
//  *   struct Foo {
//  *     struct Foo *next1, *next2; // can be a member of two lists at once
//  *   };
//  *   ncclIntruQueue<Foo, &Foo::next1> list1;
//  *   ncclIntruQueue<Foo, &Foo::next2> list2;
//  */
// template <typename T, T* T::* next>
// struct ncclIntruQueue;

// template <typename T, T* T::* next>
// void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x);
// template <typename T, T* T::* next>
// T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* memPool);

// ////////////////////////////////////////////////////////////////////////////////
// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
//  * and "cond" fields are part of the public interface.
//  */
// struct ncclThreadSignal {
//     pthread_mutex_t mutex;
//     pthread_cond_t cond;
// };

// // returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();

// void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
// void ncclThreadSignalDestruct(struct ncclThreadSignal* me);

// // A convenience instance per-thread.
// extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;

// ////////////////////////////////////////////////////////////////////////////////

// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc;

// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me);
// // Enqueue element. Returns true if queue is not abandoned. Even if queue is
// // abandoned the element enqueued, so the caller needs to make arrangements for
// // the queue to be tended.
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T, next>* me, T* x);
// // Dequeue all elements at a glance. If there aren't any and `waitSome` is
// // true then this call will wait until it can return a non empty list.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T, next>* me, bool waitSome);
// // Dequeue all elements and set queue to abandoned state.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T, next>* me);

// ////////////////////////////////////////////////////////////////////////////////

// struct ncclMemoryStack {
//     struct Hunk {
//         struct Hunk* above; // reverse stack pointer
//         size_t size;        // size of this allocation (including this header struct)
//     };
//     struct Unhunk { // proxy header for objects allocated out-of-hunk
//         struct Unhunk* next;
//         void* obj;
//     };
//     struct Frame {
//         struct Hunk* hunk;     // top of non-empty hunks
//         uintptr_t bumper, end; // points into top hunk
//         struct Unhunk* unhunks;
//         struct Frame* below;
//     };

//     static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
//     static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);

//     struct Hunk stub;
//     struct Frame topFrame;
// };

// inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
//     me->stub.above       = nullptr;
//     me->stub.size        = 0;
//     me->topFrame.hunk    = &me->stub;
//     me->topFrame.bumper  = 0;
//     me->topFrame.end     = 0;
//     me->topFrame.unhunks = nullptr;
//     me->topFrame.below   = nullptr;
// }

// inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
//     uintptr_t o = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
//     void* obj;
//     if(__builtin_expect(o + size <= me->topFrame.end, true)) {
//         me->topFrame.bumper = o + size;
//         obj                 = reinterpret_cast<void*>(o);
//     } else {
//         obj = allocateSpilled(me, size, align);
//     }
//     return obj;
// }

// template <typename T>
// inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
//     void* obj = ncclMemoryStack::allocate(me, n * sizeof(T), alignof(T));
//     memset(obj, 0, n * sizeof(T));
//     return (T*)obj;
// }

// inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
//     using Frame          = ncclMemoryStack::Frame;
//     Frame tmp            = me->topFrame;
//     Frame* snapshot      = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
//     *snapshot            = tmp; // C++ struct assignment
//     me->topFrame.unhunks = nullptr;
//     me->topFrame.below   = snapshot;
// }

// inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
//     ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
//     while(un != nullptr) {
//         free(un->obj);
//         un = un->next;
//     }
//     me->topFrame = *me->topFrame.below; // C++ struct assignment
// }

// ////////////////////////////////////////////////////////////////////////////////

// struct ncclMemoryPool {
//     struct Cell {
//         Cell* next;
//     };
//     template <int Size, int Align>
//     union CellSized {
//         Cell cell;
//         alignas(Align) char space[Size];
//     };
//     struct Cell* head;
//     struct Cell* tail; // meaningful only when head != nullptr
// };

// inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; }

// template <typename T>
// inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
//     using Cell      = ncclMemoryPool::Cell;
//     using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
//     Cell* cell;
//     if(__builtin_expect(me->head != nullptr, true)) {
//         cell     = me->head;
//         me->head = cell->next;
//     } else {
//         // Use the internal allocate() since it doesn't memset to 0 yet.
//         cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
//     }
//     memset(cell, 0, sizeof(T));
//     return reinterpret_cast<T*>(cell);
// }

// template <typename T>
// inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
//     using Cell = ncclMemoryPool::Cell;
//     Cell* cell = reinterpret_cast<Cell*>(obj);
//     cell->next = me->head;
//     if(me->head == nullptr)
//         me->tail = cell;
//     me->head = cell;
// }

// inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
//     if(from->head != nullptr) {
//         from->tail->next = me->head;
//         if(me->head == nullptr)
//             me->tail = from->tail;
//         me->head   = from->head;
//         from->head = nullptr;
//     }
// }

// ////////////////////////////////////////////////////////////////////////////////

// template <typename T, T* T::* next>
// struct ncclIntruQueue {
//     T *head, *tail;
// };

// template <typename T, T* T::* next>
// inline void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me) {
//     me->head = nullptr;
//     me->tail = nullptr;
// }

// template <typename T, T* T::* next>
// inline bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me) {
//     return me->head == nullptr;
// }

// template <typename T, T* T::* next>
// inline T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me) {
//     return me->head;
// }

// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTail(ncclIntruQueue<T, next>* me) {
//     return me->tail;
// }

// template <typename T, T* T::* next>
// inline void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x) {
//     x->*next                                = nullptr;
//     (me->head ? me->tail->*next : me->head) = x;
//     me->tail                                = x;
// }

// template <typename T, T* T::* next>
// inline T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me) {
//     T* ans   = me->head;
//     me->head = ans->*next;
//     if(me->head == nullptr)
//         me->tail = nullptr;
//     return ans;
// }

// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me) {
//     T* ans = me->head;
//     if(ans != nullptr) {
//         me->head = ans->*next;
//         if(me->head == nullptr)
//             me->tail = nullptr;
//     }
//     return ans;
// }

// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* pool) {
//     T* head  = me->head;
//     me->head = nullptr;
//     me->tail = nullptr;
//     while(head != nullptr) {
//         T* tmp = head->*next;
//         ncclMemoryPoolFree(pool, tmp);
//         head = tmp;
//     }
// }

// ////////////////////////////////////////////////////////////////////////////////

// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; }

// inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
//     pthread_mutex_init(&me->mutex, nullptr);
//     pthread_cond_init(&me->cond, nullptr);
// }

// inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
//     pthread_mutex_destroy(&me->mutex);
//     pthread_cond_destroy(&me->cond);
// }

// ////////////////////////////////////////////////////////////////////////////////

// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc {
//     T* head;
//     uintptr_t tail;
//     struct ncclThreadSignal* waiting;
// };

// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me) {
//     me->head    = nullptr;
//     me->tail    = 0x0;
//     me->waiting = nullptr;
// }

// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me) {
//     return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
// }

// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T, next>* me, T* x) {
//     __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
//     uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
//     T* prev         = reinterpret_cast<T*>(utail);
//     T** prevNext    = utail <= 0x2 ? &me->head : &(prev->*next);
//     __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
//     if(utail == 0x1) {                           // waiting
//         __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
//         // This lock/unlock is essential to ensure we don't race ahead of the consumer
//         // and signal the cond before they begin waiting on it.
//         struct ncclThreadSignal* waiting = me->waiting;
//         pthread_mutex_lock(&waiting->mutex);
//         pthread_mutex_unlock(&waiting->mutex);
//         pthread_cond_broadcast(&waiting->cond);
//     }
//     return utail != 0x2; // not abandoned
// }

// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T, next>* me, bool waitSome) {
//     T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
//     if(head == nullptr) {
//         if(!waitSome)
//             return nullptr;
//         uint64_t t0   = clockNano();
//         bool sleeping = false;
//         do {
//             if(clockNano() - t0 >= 10 * 1000) { // spin for first 10us
//                 struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
//                 pthread_mutex_lock(&waitSignal->mutex);
//                 uintptr_t expected = sleeping ? 0x1 : 0x0;
//                 uintptr_t desired  = 0x1;
//                 me->waiting        = waitSignal; // release done by successful compare exchange
//                 if(__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
//                     sleeping = true;
//                     pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
//                 }
//                 pthread_mutex_unlock(&waitSignal->mutex);
//             }
//             head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
//         } while(head == nullptr);
//     }

//     __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
//     uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
//     T* tail         = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
//     T* x            = head;
//     while(x != tail) {
//         T* x1;
//         int spins = 0;
//         while(true) {
//             x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
//             if(x1 != nullptr)
//                 break;
//             if(++spins == 1024) {
//                 spins = 1024 - 1;
//                 sched_yield();
//             }
//         }
//         x = x1;
//     }
//     return head;
// }

// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T, next>* me) {
//     uintptr_t expected = 0x0;
//     if(__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
//         return nullptr;
//     } else {
//         int spins = 0;
//         T* head;
//         while(true) {
//             head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
//             if(head != nullptr)
//                 break;
//             if(++spins == 1024) {
//                 spins = 1024 - 1;
//                 sched_yield();
//             }
//         }
//         __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
//         uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
//         T* tail         = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
//         T* x            = head;
//         while(x != tail) {
//             T* x1;
//             spins = 0;
//             while(true) {
//                 x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
//                 if(x1 != nullptr)
//                     break;
//                 if(++spins == 1024) {
//                     spins = 1024 - 1;
//                     sched_yield();
//                 }
//             }
//             x = x1;
//         }
//         return head;
//     }
// }

// ////////////////////////////////////////////////////////////////////////////////
// static inline long get_now_ns(void) {
//     struct timespec time;
//     if(clock_gettime(CLOCK_MONOTONIC, &time) != 0) {
//         return 0;
//     }

//     return time.tv_sec * 1000000000L + time.tv_nsec;
// }

// static inline void thread_bind_cpu(int coreid) {
//     cpu_set_t cpuset;
//     CPU_ZERO(&cpuset);
//     CPU_SET(coreid, &cpuset);
//     pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// }

} // namespace sccl

#endif