utils.cc

/*************************************************************************
 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "utils.h"
// #include "core.h"
// #include "nvmlwrap.h"
#include <dirent.h>
#include <fstream>

#include <stdlib.h>

namespace sccl {

// // Get current Compute Capability
// int scclCudaCompCap() {
//     int cudaDev;
//     if(cudaGetDevice(&cudaDev) != cudaSuccess)
//         return 0;
//     int ccMajor, ccMinor;
//     if(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess)
//         return 0;
//     if(cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess)
//         return 0;
//     return ccMajor * 10 + ccMinor;
// }

// scclResult_t int64ToBusId(int64_t id, char* busId) {
//     sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
//     return scclSuccess;
// }

// scclResult_t busIdToInt64(const char* busId, int64_t* id) {
//     char hexStr[17]; // Longest possible int64 hex string + null terminator.
//     int hexOffset = 0;
//     for(int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
//         char c = busId[i];
//         if(c == '.' || c == ':')
//             continue;
//         if((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
//             hexStr[hexOffset++] = busId[i];
//         } else
//             break;
//     }
//     hexStr[hexOffset] = '\0';
//     *id               = strtol(hexStr, NULL, 16);
//     return scclSuccess;
// }

// // Convert a logical cudaDev index to the NVML device minor number
// scclResult_t getBusId(int cudaDev, int64_t* busId) {
//     // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
//     // format. Still need to allocate proper space in case PCI domain goes
//     // higher.
//     char busIdStr[] = "00000000:00:00.0";
//     CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
//     NCCLCHECK(busIdToInt64(busIdStr, busId));
//     return scclSuccess;
// }

// scclResult_t getHostName(char* hostname, int maxlen, const char delim) {
//     if(gethostname(hostname, maxlen) != 0) {
//         strncpy(hostname, "unknown", maxlen);
//         return scclSystemError;
//     }
//     int i = 0;
//     while((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
//         i++;
//     hostname[i] = '\0';
//     return scclSuccess;
// }

// uint64_t getHash(const char* string, int n) {
//     // Based on DJB2a, result = result * 33 ^ char
//     uint64_t result = 5381;
//     for(int c = 0; c < n; c++) {
//         result = ((result << 5) + result) ^ string[c];
//     }
//     return result;
// }

// /* Generate a hash of the unique identifying string for this host
//  * that will be unique for both bare-metal and container instances
//  * Equivalent of a hash of;
//  *
//  * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
//  *
//  * This string can be overridden by using the NCCL_HOSTID env var.
//  */
// #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
// uint64_t getHostHash(void) {
//     char hostHash[1024];
//     char* hostId;

//     // Fall back is the full hostname if something fails
//     (void)getHostName(hostHash, sizeof(hostHash), '\0');
//     int offset = strlen(hostHash);

//     if((hostId = getenv("NCCL_HOSTID")) != NULL) {
//         INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
//         strncpy(hostHash, hostId, sizeof(hostHash));
//     } else {
//         FILE* file = fopen(HOSTID_FILE, "r");
//         if(file != NULL) {
//             char* p;
//             if(fscanf(file, "%ms", &p) == 1) {
//                 strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
//                 free(p);
//             }
//         }
//         fclose(file);
//     }

//     // Make sure the string is terminated
//     hostHash[sizeof(hostHash) - 1] = '\0';

//     TRACE(NCCL_INIT, "unique hostname '%s'", hostHash);

//     return getHash(hostHash, strlen(hostHash));
// }

// /* Generate a hash of the unique identifying string for this process
//  * that will be unique for both bare-metal and container instances
//  * Equivalent of a hash of;
//  *
//  * $$ $(readlink /proc/self/ns/pid)
//  */
// uint64_t getPidHash(void) {
//     char pname[1024];
//     // Start off with our pid ($$)
//     sprintf(pname, "%ld", (long)getpid());
//     int plen = strlen(pname);
//     int len  = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen);
//     if(len < 0)
//         len = 0;

//     pname[plen + len] = '\0';
//     TRACE(NCCL_INIT, "unique PID '%s'", pname);

//     return getHash(pname, strlen(pname));
// }

// int parseStringList(const char* string, struct netIf* ifList, int maxList) {
//     if(!string)
//         return 0;

//     const char* ptr = string;

//     int ifNum = 0;
//     int ifC   = 0;
//     char c;
//     do {
//         c = *ptr;
//         if(c == ':') {
//             if(ifC > 0) {
//                 ifList[ifNum].prefix[ifC] = '\0';
//                 ifList[ifNum].port        = atoi(ptr + 1);
//                 ifNum++;
//                 ifC = 0;
//             }
//             while(c != ',' && c != '\0')
//                 c = *(++ptr);
//         } else if(c == ',' || c == '\0') {
//             if(ifC > 0) {
//                 ifList[ifNum].prefix[ifC] = '\0';
//                 ifList[ifNum].port        = -1;
//                 ifNum++;
//                 ifC = 0;
//             }
//         } else {
//             ifList[ifNum].prefix[ifC] = c;
//             ifC++;
//         }
//         ptr++;
//     } while(ifNum < maxList && c);
//     return ifNum;
// }

// static bool matchIf(const char* string, const char* ref, bool matchExact) {
//     // Make sure to include '\0' in the exact case
//     int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
//     return strncmp(string, ref, matchLen) == 0;
// }

// static bool matchPort(const int port1, const int port2) {
//     if(port1 == -1)
//         return true;
//     if(port2 == -1)
//         return true;
//     if(port1 == port2)
//         return true;
//     return false;
// }

// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
//     // Make an exception for the case where no user list is defined
//     if(listSize == 0)
//         return true;

//     for(int i = 0; i < listSize; i++) {
//         if(matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
//             return true;
//         }
//     }
//     return false;
// }

// __thread struct scclThreadSignal scclThreadSignalLocalInstance = scclThreadSignalStaticInitializer();

// void* scclMemoryStack::allocateSpilled(struct scclMemoryStack* me, size_t size, size_t align) {
//     // `me->hunks` points to the top of the stack non-empty hunks. Hunks above
//     // this (reachable via `->above`) are empty.
//     struct Hunk* top  = me->topFrame.hunk;
//     size_t mallocSize = 0;

//     // If we have lots of space left in hunk but that wasn't enough then we'll
//     // allocate the object unhunked.
//     if(me->topFrame.end - me->topFrame.bumper >= 8 << 10)
//         goto unhunked;

//     // If we have another hunk (which must be empty) waiting above this one and
//     // the object fits then use that.
//     if(top && top->above) {
//         struct Hunk* top1 = top->above;
//         uintptr_t uobj    = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align - 1) & -uintptr_t(align);
//         if(uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
//             me->topFrame.hunk   = top1;
//             me->topFrame.bumper = uobj + size;
//             me->topFrame.end    = reinterpret_cast<uintptr_t>(top1) + top1->size;
//             return reinterpret_cast<void*>(uobj);
//         }
//     }

//     { // If the next hunk we're going to allocate wouldn't be big enough but the
//         // Unhunk proxy fits in the current hunk then go allocate as unhunked.
//         size_t nextSize           = (top ? top->size : 0) + (64 << 10);
//         constexpr size_t maxAlign = 64;
//         if(nextSize < sizeof(struct Hunk) + maxAlign + size) {
//             uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
//             if(uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
//                 goto unhunked;
//         }

//         // At this point we must need another hunk, either to fit the object
//         // itself or its Unhunk proxy.
//         mallocSize = nextSize;
//         INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
//         struct Hunk* top1 = (struct Hunk*)malloc(mallocSize);
//         if(top1 == nullptr)
//             goto malloc_exhausted;
//         top1->size  = nextSize;
//         top1->above = nullptr;
//         if(top)
//             top->above = top1;
//         top                 = top1;
//         me->topFrame.hunk   = top;
//         me->topFrame.end    = reinterpret_cast<uintptr_t>(top) + nextSize;
//         me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
//     }

//     { // Try to fit object in the new top hunk.
//         uintptr_t uobj = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
//         if(uobj + size <= me->topFrame.end) {
//             me->topFrame.bumper = uobj + size;
//             return reinterpret_cast<void*>(uobj);
//         }
//     }

// unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band
//     // to keep track of it.
//     uintptr_t uproxy     = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
//     Unhunk* proxy        = reinterpret_cast<Unhunk*>(uproxy);
//     me->topFrame.bumper  = uproxy + sizeof(Unhunk);
//     proxy->next          = me->topFrame.unhunks;
//     me->topFrame.unhunks = proxy;
//     mallocSize           = size;
//     proxy->obj           = malloc(mallocSize);
//     INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
//     if(proxy->obj == nullptr)
//         goto malloc_exhausted;
//     return proxy->obj;
// }

// malloc_exhausted:
//     WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
//     abort();
// }

// void scclMemoryStackDestruct(struct scclMemoryStack* me) {
//     // Free unhunks first because both the frames and unhunk proxies lie within the hunks.
//     struct scclMemoryStack::Frame* f = &me->topFrame;
//     while(f != nullptr) {
//         struct scclMemoryStack::Unhunk* u = f->unhunks;
//         while(u != nullptr) {
//             free(u->obj);
//             u = u->next;
//         }
//         f = f->below;
//     }
//     // Free hunks
//     struct scclMemoryStack::Hunk* h = me->stub.above;
//     while(h != nullptr) {
//         struct scclMemoryStack::Hunk* h1 = h->above;
//         free(h);
//         h = h1;
//     }
// }

// typedef struct {
//     pid_t pid;
//     pid_t ppid;
//     char pcmdLine[4096];
//     char cmdLine[4096];
// } appConfigOptimizeArg_t;

// static bool barrier_Flag;
// int maxGPUs = -1;

// int initInfo() {
//     /* get barrier_Flag */
//     uint32_t index              = 0;
//     appConfigOptimizeArg_t args = {0};
//     args.pid                    = getpid();
//     args.ppid                   = getppid();
//     std::string cmdLinePath     = "/proc/" + std::to_string(args.ppid) + "/cmdline";
//     std::ifstream cmdLineFile;

//     cmdLineFile.open(cmdLinePath.c_str());
//     cmdLineFile.read(args.pcmdLine, sizeof(args.pcmdLine));
//     cmdLineFile.close();
//     cmdLinePath = "/proc/" + std::to_string(args.pid) + "/cmdline";
//     cmdLineFile.open(cmdLinePath.c_str());
//     cmdLineFile.read(args.cmdLine, sizeof(args.cmdLine));
//     cmdLineFile.close();

//     if(memmem(args.cmdLine, sizeof(args.cmdLine), "sccl_context_test", strlen("sccl_context_test")) ||
//        memmem(args.pcmdLine, sizeof(args.pcmdLine), "sccl_context_test", strlen("sccl_context_test"))) {
//         barrier_Flag = true;
//     } else {
//         barrier_Flag = false;
//     }

//     INFO(NCCL_INIT, "Init config for sccl_context_test: %d", barrier_Flag);

//     /* get maximum number of GPUs in all NUMA nodes */
//     if(maxGPUs == -1) {
//         int gpuCount[32] = {0}; // Assume MAX_NUMA_NODES=32
//         int deviceCount;
//         hipGetDeviceCount(&deviceCount);

//         // Get numbers of GPUs in all NUMA nodes in system
//         for(int i = 1; i <= deviceCount; ++i) {
//             char path[256];
//             snprintf(path, sizeof(path), "/sys/class/drm/card%d/device/numa_node", i);
//             FILE* fp = fopen(path, "r");
//             if(fp == NULL) {
//                 perror("Error opening NUMA node file");
//                 continue;
//             }

//             int numaNode;
//             if(fscanf(fp, "%d", &numaNode) == 1 && numaNode >= 0 && numaNode < 32) {
//                 gpuCount[numaNode]++;
//             }
//             fclose(fp);
//         }

//         // Find maximum number of GPUs in all NUMA nodes
//         for(int i = 0; i < 32; ++i) {
//             if(gpuCount[i] > maxGPUs) {
//                 maxGPUs = gpuCount[i];
//             }
//         }

//         INFO(NCCL_INIT, "Maximum number of GPUs in any NUMA node: %d\n", maxGPUs);
//     }
//     return 0;
// }

// bool getBarrierFlag() { return barrier_Flag; }

// int getNumaMaxGpus() { return maxGPUs; }

} // namespace sccl