/************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "utils.h" #include "core.h" #include "nvmlwrap.h" #include #include // Get current Compute Capability int ncclCudaCompCap() { int cudaDev; if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; int ccMajor, ccMinor; if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0; return ccMajor*10+ccMinor; } ncclResult_t int64ToBusId(int64_t id, char* busId) { sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); return ncclSuccess; } ncclResult_t busIdToInt64(const char* busId, int64_t* id) { char hexStr[17]; // Longest possible int64 hex string + null terminator. int hexOffset = 0; for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) { char c = busId[i]; if (c == '.' || c == ':') continue; if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) { hexStr[hexOffset++] = busId[i]; } else break; } hexStr[hexOffset] = '\0'; *id = strtol(hexStr, NULL, 16); return ncclSuccess; } // Convert a logical cudaDev index to the NVML device minor number ncclResult_t getBusId(int cudaDev, int64_t *busId) { // On most systems, the PCI bus ID comes back as in the 0000:00:00.0 // format. Still need to allocate proper space in case PCI domain goes // higher. char busIdStr[] = "00000000:00:00.0"; CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev)); NCCLCHECK(busIdToInt64(busIdStr, busId)); return ncclSuccess; } ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { if (gethostname(hostname, maxlen) != 0) { strncpy(hostname, "unknown", maxlen); return ncclSystemError; } int i = 0; while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++; hostname[i] = '\0'; return ncclSuccess; } uint64_t getHash(const char* string, int n) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (int c = 0; c < n; c++) { result = ((result << 5) + result) ^ string[c]; } return result; } /* Generate a hash of the unique identifying string for this host * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * * $(hostname)$(cat /proc/sys/kernel/random/boot_id) * * This string can be overridden by using the NCCL_HOSTID env var. */ #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" uint64_t getHostHash(void) { char hostHash[1024]; char *hostId; // Fall back is the full hostname if something fails (void) getHostName(hostHash, sizeof(hostHash), '\0'); int offset = strlen(hostHash); if ((hostId = getenv("NCCL_HOSTID")) != NULL) { INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId); strncpy(hostHash, hostId, sizeof(hostHash)); } else { FILE *file = fopen(HOSTID_FILE, "r"); if (file != NULL) { char *p; if (fscanf(file, "%ms", &p) == 1) { strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); free(p); } } fclose(file); } // Make sure the string is terminated hostHash[sizeof(hostHash)-1]='\0'; TRACE(NCCL_INIT,"unique hostname '%s'", hostHash); return getHash(hostHash, strlen(hostHash)); } /* Generate a hash of the unique identifying string for this process * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * * $$ $(readlink /proc/self/ns/pid) */ uint64_t getPidHash(void) { char pname[1024]; // Start off with our pid ($$) sprintf(pname, "%ld", (long) getpid()); int plen = strlen(pname); int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen); if (len < 0) len = 0; pname[plen+len]='\0'; TRACE(NCCL_INIT,"unique PID '%s'", pname); return getHash(pname, strlen(pname)); } int parseStringList(const char* string, struct netIf* ifList, int maxList) { if (!string) return 0; const char* ptr = string; int ifNum = 0; int ifC = 0; char c; do { c = *ptr; if (c == ':') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; ifList[ifNum].port = atoi(ptr+1); ifNum++; ifC = 0; } while (c != ',' && c != '\0') c = *(++ptr); } else if (c == ',' || c == '\0') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; ifList[ifNum].port = -1; ifNum++; ifC = 0; } } else { ifList[ifNum].prefix[ifC] = c; ifC++; } ptr++; } while (ifNum < maxList && c); return ifNum; } static bool matchIf(const char* string, const char* ref, bool matchExact) { // Make sure to include '\0' in the exact case int matchLen = matchExact ? strlen(string) + 1 : strlen(ref); return strncmp(string, ref, matchLen) == 0; } static bool matchPort(const int port1, const int port2) { if (port1 == -1) return true; if (port2 == -1) return true; if (port1 == port2) return true; return false; } bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { // Make an exception for the case where no user list is defined if (listSize == 0) return true; for (int i=0; ihunks` points to the top of the stack non-empty hunks. Hunks above // this (reachable via `->above`) are empty. struct Hunk* top = me->topFrame.hunk; size_t mallocSize = 0; // If we have lots of space left in hunk but that wasn't enough then we'll // allocate the object unhunked. if (me->topFrame.end - me->topFrame.bumper >= 8<<10) goto unhunked; // If we have another hunk (which must be empty) waiting above this one and // the object fits then use that. if (top && top->above) { struct Hunk* top1 = top->above; uintptr_t uobj = (reinterpret_cast(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align); if (uobj + size <= reinterpret_cast(top1) + top1->size) { me->topFrame.hunk = top1; me->topFrame.bumper = uobj + size; me->topFrame.end = reinterpret_cast(top1) + top1->size; return reinterpret_cast(uobj); } } { // If the next hunk we're going to allocate wouldn't be big enough but the // Unhunk proxy fits in the current hunk then go allocate as unhunked. size_t nextSize = (top ? top->size : 0) + (64<<10); constexpr size_t maxAlign = 64; if (nextSize < sizeof(struct Hunk) + maxAlign + size) { uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end) goto unhunked; } // At this point we must need another hunk, either to fit the object // itself or its Unhunk proxy. mallocSize = nextSize; INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); struct Hunk *top1 = (struct Hunk*)malloc(mallocSize); if (top1 == nullptr) goto malloc_exhausted; top1->size = nextSize; top1->above = nullptr; if (top) top->above = top1; top = top1; me->topFrame.hunk = top; me->topFrame.end = reinterpret_cast(top) + nextSize; me->topFrame.bumper = reinterpret_cast(top) + sizeof(struct Hunk); } { // Try to fit object in the new top hunk. uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align); if (uobj + size <= me->topFrame.end) { me->topFrame.bumper = uobj + size; return reinterpret_cast(uobj); } } unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band // to keep track of it. uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk)); Unhunk* proxy = reinterpret_cast(uproxy); me->topFrame.bumper = uproxy + sizeof(Unhunk); proxy->next = me->topFrame.unhunks; me->topFrame.unhunks = proxy; mallocSize = size; proxy->obj = malloc(mallocSize); INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize); if (proxy->obj == nullptr) goto malloc_exhausted; return proxy->obj; } malloc_exhausted: WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize); abort(); } void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { // Free unhunks first because both the frames and unhunk proxies lie within the hunks. struct ncclMemoryStack::Frame* f = &me->topFrame; while (f != nullptr) { struct ncclMemoryStack::Unhunk* u = f->unhunks; while (u != nullptr) { free(u->obj); u = u->next; } f = f->below; } // Free hunks struct ncclMemoryStack::Hunk* h = me->stub.above; while (h != nullptr) { struct ncclMemoryStack::Hunk *h1 = h->above; free(h); h = h1; } } typedef struct { pid_t pid; pid_t ppid; char pcmdLine[4096]; char cmdLine[4096]; } appConfigOptimizeArg_t; static bool vaspOptimizeFlag; static bool barrier_Flag; int initAppInfo() { uint32_t index = 0; appConfigOptimizeArg_t args = {0}; args.pid = getpid(); args.ppid = getppid(); std::string cmdLinePath = "/proc/" + std::to_string(args.ppid) + "/cmdline"; std::ifstream cmdLineFile; cmdLineFile.open(cmdLinePath.c_str()); cmdLineFile.read(args.pcmdLine, sizeof(args.pcmdLine)); cmdLineFile.close(); cmdLinePath = "/proc/" + std::to_string(args.pid) + "/cmdline"; cmdLineFile.open(cmdLinePath.c_str()); cmdLineFile.read(args.cmdLine, sizeof(args.cmdLine)); cmdLineFile.close(); if(memmem(args.cmdLine, sizeof(args.cmdLine), "vasp_", strlen("vasp_")) || memmem(args.pcmdLine, sizeof(args.pcmdLine), "vasp_", strlen("vasp_"))){ vaspOptimizeFlag = true; } else{ vaspOptimizeFlag = false; } if(memmem(args.cmdLine, sizeof(args.cmdLine), "nccl_context_test", strlen("nccl_context_test")) || memmem(args.pcmdLine, sizeof(args.pcmdLine), "nccl_context_test", strlen("nccl_context_test"))){ barrier_Flag = true; }else{ barrier_Flag = false; } INFO(NCCL_INIT, "Init config for app VASP: %d", vaspOptimizeFlag); INFO(NCCL_INIT, "Init config for nccl_context_test: %d", barrier_Flag); return 0; } bool getVaspOptimizeFlag(){ return vaspOptimizeFlag; } bool getBarrierFlag(){ return barrier_Flag; }