/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

// Tuning model: fills the per-communicator tables (maxThreads, bandwidths,
// latencies, threadThresholds) that the launch-time algorithm/protocol
// selection consults, and converts a (algorithm, protocol) pair plus a
// message size into a predicted execution time.
//
// NOTE(review): this file appears to have been damaged in transit — several
// spans between a '<' and the next '>' look like they were stripped as if
// they were HTML tags (e.g. loop headers such as "for (int i=0; i<nelems;"
// now read "for (int i=0; i..."). Every such spot is flagged below with a
// NOTE(review) comment; the surviving tokens are preserved verbatim.
// TODO: restore the missing spans from the upstream RCCL tuning source.

#include "core.h"
#include "devcomm.h"
#include "comm.h"
#include "topo.h"

// Env-tunable thread counts; the default of -2 means "unset, use the
// computed default" (getNthreads treats any value <= 0 as unset).
NCCL_PARAM(Nthreads, "NTHREADS", -2);
NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);

// Validate a user-requested thread count.
//   name     - env var name, used only in warning messages
//   env      - the value read from the environment (<= 0 means unset)
//   min/max  - acceptable bounds
//   def      - default used when env is unset
//   WarpSize - required alignment of the thread count
// Returns def when unset; otherwise env, unless it is misaligned or out of
// bounds, in which case a WARN is emitted and the nearest bound is used
// (misaligned values fall back to max, not to a rounded value).
static int getNthreads(const char* name, int env, int min, int max, int def, int WarpSize) {
  int nt = env;
  if (nt > 0) {
    if (nt % WarpSize != 0) {
      WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WarpSize);
      nt = max;
    } else if (nt > max) {
      WARN("Invalid %s %d (maximum %d).", name, nt, max);
      nt = max;
    } else if (nt < min) {
      WARN("Invalid %s %d (minimum %d).", name, nt, min);
      nt = min;
    }
  } else {
    nt = def;
  }
  return nt;
}

// Parse a comma-separated element list (as used by NCCL_PROTO / NCCL_ALGO)
// into per-element 0/1 flags in `list`. A leading '^' inverts the meaning:
// listed elements are disabled (set=0) and all others enabled (def=1).
ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
  int def, set;
  if (str[0] == '^') {
    def = 1; set = 0; str++;
  } else {
    def = 0; set = 1;
  }
  // NOTE(review): text appears corrupted from here on — presumably the rest
  // of this loop, the end of parseList, the tuning lookup tables (llMaxBws,
  // perChMaxTreeBws, baseLat, ... — referenced later but not declared in the
  // surviving text) and the head of a getNetOverhead(comm) helper were lost.
  // The garbled tokens follow verbatim; the code below up to the next '}'
  // is evidently the tail of that getNetOverhead helper.
  for (int i=0; itopo, &cpuArch, &cpuVendor, &cpuModel));
  // Per-message host-side network overhead by CPU type:
  // Intel x86 -> 1.0, AMD x86 -> 2.0, everything else -> 1.0.
  if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
  if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
  else return 1.0;
}

// Populate comm->maxThreads, comm->bandwidths, comm->latencies and
// comm->threadThresholds for every (collective, algorithm, protocol)
// combination, from the topology graphs, the device compute-capability
// range [minCompCap, maxCompCap], and user environment overrides
// (NCCL_NTHREADS, NCCL_LL128_NTHREADS, NCCL_PROTO, NCCL_ALGO,
// NCCL_THREAD_THRESHOLDS).
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
  // Use fewer SIMPLE threads when the ring is PCI-bandwidth-bound.
  int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // HIP build: bounds and alignment scale with the device warp size.
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
    comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
      getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS, comm->WarpSize);
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] =
    comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
      comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_LL] =
        getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS, comm->WarpSize);
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] =
    comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
      getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), 4*comm->WarpSize, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS, comm->WarpSize);
#else
    // CUDA build: fixed WARP_SIZE; getNthreads takes no warp-size argument
    // here — NOTE(review): this overload differs from the HIP path above;
    // presumably a 5-arg variant exists upstream. TODO confirm.
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
  comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
    comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
      comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] =
        comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS;
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] =
    comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
      getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] =
    comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
      getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
#endif

  int nNodes = comm->nNodes;
  int nRanks = comm->nRanks;
  if (nRanks <= 1) return ncclSuccess;  // single rank: nothing to tune

  // Index into the per-compute-capability tuning tables.
  int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
  int cpuArch, cpuVendor, cpuModel;
  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
  // Second table index: 0 for 1 node, 1 for 2 nodes, 2 for more.
  int index2 = nNodes <= 2 ? nNodes-1 : 2;
  // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
  int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
  double llMaxBw = llMaxBws[index1][index2];
  double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
  double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
  double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
  // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
  //if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
  float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount

  int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
  // NOTE(review): garbled — loop bound and the "intraHw[a] = graphs[a]->"
  // part of the statement appear to have been stripped. TODO restore.
  for (int a=0; atypeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
  // NOTE(review): garbled — the hw[] initialization, the per-collective
  // loop header, and the start of the nsteps expression are missing here.
  for (int a=0; a 1 ? 2*nNodes :0) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 : nNodes;
  // NOTE(review): garbled — the per-algorithm/per-protocol loop headers and
  // the declaration of `bw` (graph intra- vs inter-node bandwidth pick) are
  // truncated.
  for (int a=0; abwIntra : graphs[a]->bwInter;
  // A measured/configured baseBw on the topology overrides the graph-derived
  // per-channel bandwidth.
  float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
  //INFO(NCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", ncclAlgoStr[a], ncclProtoStr[p], busBw, comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);

  // Various model refinements
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  // HIP: scale by the measured RCCL tuning-table ratio; row 0 is used for
  // 1-2 nodes, row 1 for larger clusters.
  if (nNodes <= 2)
    busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
  else
    busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
#else
  // CUDA: protocol/algorithm specific discounts and per-channel caps.
  if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
  if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
  if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
  if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
  if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
  if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
  if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
  if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
  if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
    // Collnet+Direct requires all GPUs to have a local NIC to work at full speed
    float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
    factor -= (factor-1)/2;
    busBw /= factor;
  }
#endif
  if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;

  // Convert bus BW to algorithm BW
  float ratio;
  if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
  else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0;
  else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1));
  else ratio = .5;
  comm->bandwidths[coll][a][p] = busBw * ratio;

  // Latency model: base launch latency plus per-hop latencies, with the
  // intra-node hop type chosen from the graph (NVLink vs PCI) and the
  // inter-node latency optionally overridden by the graph.
  comm->latencies[coll][a][p] = baseLat[a][p];
  float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
  float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
  //if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
  if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
  if (a == NCCL_ALGO_RING) {
    float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
    if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
      if (graphs[a]->sameChannels) {
        comm->latencies[coll][a][p] += lat;
      } else {
        if (p == NCCL_PROTO_SIMPLE) lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][NCCL_ALGO_TREE][p];
        // Add some chunk latency, waiting for proper chunk modeling
        comm->latencies[coll][a][p] += nsteps*lat;
      }
    } else {
      // Inter-node rings still have to launch nsteps * net overhead.
      float netOverhead = 0.0;
      if (nNodes > 1) {
        netOverhead = getNetOverhead(comm);
        if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
      }
      intraLat = std::max(intraLat, netOverhead);
      comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
    }
  } else if (a == NCCL_ALGO_TREE) {
    comm->latencies[coll][a][p] += 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
  } else if (a == NCCL_ALGO_COLLNET_DIRECT) {
    comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat;  // Add 0.5 arity serialization latency
  } else if (a == NCCL_ALGO_COLLNET_CHAIN) {
    comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
  } else if (a == NCCL_ALGO_NVLS) {
    if (nNodes > 1) comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
  } else if (a == NCCL_ALGO_NVLS_TREE) {
    comm->latencies[coll][a][p] += 2*(nNodes-1)*rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
  }
  // NOTE(review): these closing braces end the (presumably) p/a/coll loops
  // whose headers were lost above.
  } } }

  // Protocols/Algorithms enable/disable, and user overrides.
  // All are enabled except ll128 which is enabled by default only in certain cases.
  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };

  const char *protoStr = getenv("NCCL_PROTO");
  if (protoStr) {
    INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
    NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
  }
  const char *algoStr = getenv("NCCL_ALGO");
  if (algoStr) {
    INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
    NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
  }
  if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;

  // Disable CollNet if it is not supported
  if (comm->collNetSupport == 0) {
    algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
    algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
    if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
    // If user has hard set NCCL_ALGO=COLLNET, ignore it
    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
      algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
      if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
    }
  } else {
    // Disable CollNet+Direct if not on an NVSwitch system
    int nvsCount = 0;
    NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
    if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
  }

  // NOTE(review): garbled — the (c, a, p) loop headers, the pEnable setup
  // from protoEnable[], and the opening HIP-side preprocessor conditionals
  // for the LL128 enablement check are missing; tokens kept verbatim.
  for (int c=0; ctypeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled) ? 1 : 0;
#else
  pEnable = 0;
#endif
#else
  // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
  pEnable = 1;
  pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
  pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
  // LL128 requires a homogeneous compute-capability set.
  pEnable &= (minCompCap == maxCompCap);
  switch (minCompCap) {
    case 70: pEnable &= 1; break;
    case 80: pEnable &= 1; break;
    // CUDA 11.8 + 2-rank ring allreduce on Hopper is known-bad; disable.
    case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break;
    default: pEnable &= 0; break;
  }
#endif
  }
  // A disabled protocol is expressed as zero bandwidth so the time model
  // never selects it.
  if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
  // Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE.
  if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
  if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
  }

  // Rank 0 logs the resulting per-(coll,algo,proto) tuning tables.
  if (comm->rank == 0) {
    char line[1024];
    for (int block=0; block<2; block++) {
      sprintf(line, " Algorithm |");
      // NOTE(review): garbled — the header/body formatting loops over the
      // algorithm/protocol columns are truncated; tokens kept verbatim.
      for (int ba=0; bamaxThreads[a][p]);
      } }
      INFO(NCCL_TUNING, "%s", line);
      for (int c=0; clatencies[c][a][p], comm->bandwidths[c][a][p]);
      } }
      INFO(NCCL_TUNING, "%s", line);
    } } }

  // Set per-thread amount of work before we increase nThreads and nChannels
  // NOTE(review): garbled loop header ("athreadThresholds" — the loop bound
  // and "comm->" prefix were stripped); tokens kept verbatim.
  for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
    comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
  }
  // Ring LL threshold scales with the number of ranks.
  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
  comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 256;
  comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 256;

  // Override defaults with user env
  char* str = getenv("NCCL_THREAD_THRESHOLDS");
  if (str) {
    INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
    // Two rows: [0] = TREE thresholds, [1] = RING thresholds; -2 = "leave
    // the default in place" (only values >= 0 are applied below).
    ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
    sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
    for (int a=0; a<2; a++) {
      // NOTE(review): garbled — the protocol loop header and the
      // "if (t[a][p] >= 0)" guard were fused by the stripped span.
      for (int p=0; p= 0) comm->threadThresholds[a][p] = t[a][p];
      } } }

  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
    comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
    comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
    comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
    comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
    comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
    comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE],
    comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE],
    comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]);
  return ncclSuccess;
}

// Predict the execution time of `info` under (algorithm, protocol).
//   numPipeOps - number of aggregated operations sharing the launch
//   time (out) - predicted time; set to -1.0 when the combination is
//                disabled (bandwidth table entry is 0).
// Uses the bandwidth/latency tables filled by ncclTopoTuneModel, with
// size-dependent correction factors (RCCL tuning tables on HIP builds).
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
  float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
  float lat = info->comm->latencies[info->coll][algorithm][protocol];
  if (bw == 0) {
    *time = -1.0; return ncclSuccess;
  }
  // log2 of the message size in 64-byte units, used to index the
  // size-dependent correction tables.
  int logSize = log2i(info->nBytes>>6);
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  // HIP: apply measured per-size correction factors (clamped at index 26).
  if (algorithm == NCCL_ALGO_TREE) {
    if (logSize < 27) bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][logSize];
    else bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][26];
  } else if (algorithm == NCCL_ALGO_RING && info->comm->nNodes > 1) {
    if(logSize < 27) bw *= rcclTuningModel[info->comm->topo->tuning].ringCorrectionFactor[protocol][logSize];
    else bw *= rcclTuningModel[info->comm->topo->tuning].ringCorrectionFactor[protocol][26];
  }
#else
  if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
  // Scale bandwidth when the operation is restricted to fewer channels.
  if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
  if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
      && info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
    lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
  }
#endif
  // Tree pipelining saves latency in aggregation cases
  int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
  *time = lat * latCount + (info->nBytes) / (1000 * bw);
  return ncclSuccess;
}