#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
// NOTE(review): the two directives below had lost their header names in the
// original ("#include #include"). <math.h> (roundf) and <algorithm>
// (std::max/std::min) are the standard headers this file actually uses --
// confirm against upstream history.
#include <math.h>
#include <algorithm>
#include "rome_models.h"

namespace sccl {
namespace hardware {
namespace topology {
namespace detect {

// Tunable SCCL_CROSS_NIC; default 2. Interpreted in scclTopoCompute
// (a value of 2 is remapped there, see graph->crossNic handling).
SCCL_PARAM(CrossNic, "CROSS_NIC", 2);

// Initialize system->maxBw. This is the per-channel (i.e. per-SM)
// max bw.
//
// Returns the highest path bandwidth from 'gpu' to any node of the given
// 'type' (NET or GPU); nodes with no path (count == 0) are skipped.
static float getMaxBw(struct scclTopoSystem* system, struct scclTopoNode* gpu, int type) {
  float maxBw = 0.0;
  for(int i = 0; i < system->nodes[type].count; i++) {
    struct scclTopoLinkList* path = gpu->paths[type] + i;
    float bw = path->bw;
    if(path->count == 0) continue;  // no path to that node
    maxBw = std::max(maxBw, bw);
  }
  return maxBw;
}

// Total egress bandwidth of one GPU: the sum of all NVL link bandwidths
// compared against the PCI link bandwidth; the larger of the two wins.
// (PCI uses plain assignment, not accumulation: the last PCI link seen
// is taken as the PCI bandwidth.)
static float getTotalBw(struct scclTopoSystem* system, struct scclTopoNode* gpu) {
  float nvlinkBw = 0.0, pciBw = 0.0;
  for(int l = 0; l < gpu->nlinks; l++) {
    struct scclTopoLink* link = gpu->links + l;
    if(link->type == LINK_NVL) nvlinkBw += link->bw;
    if(link->type == LINK_PCI) pciBw = link->bw;
  }
  return std::max(pciBw, nvlinkBw);
}

// Precompute system->maxBw / system->totalBw over all GPUs, used by
// scclTopoCompute to pick a starting speed. Single-GPU systems without
// NICs short-circuit to LOC_BW.
scclResult_t scclTopoSearchInit(struct scclTopoSystem* system) {
  system->maxBw = 0.0;
  system->totalBw = 0.0;
  int inter = system->nodes[NET].count;
  if(inter == 0 && system->nodes[GPU].count == 1) {
    system->maxBw = LOC_BW;
    return scclSuccess;
  }
  for(int g = 0; g < system->nodes[GPU].count; g++) {
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
    // With NICs present, the relevant max is GPU->NET; otherwise GPU->GPU.
    system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU));
    system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu));
  }
  return scclSuccess;
}

// Find the link on node2 that points back to node1 (the reverse link).
// Returns scclInternalError if no such link exists.
static scclResult_t findRevLink(struct scclTopoNode* node1, struct scclTopoNode* node2, struct scclTopoLink** revLink) {
  for(int l = 0; l < node2->nlinks; l++) {
    struct scclTopoLink* link = node2->links + l;
    if(link->remNode == node1) {
      *revLink = link;
      return scclSuccess;
    }
  }
  WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
  return scclInternalError;
}

// This is unfortunately needed since manipulating floats often results in rounding errors.
#define SUB_ROUND(a, b) (a = roundf((a - b) * 1000) / 1000) static scclResult_t followPath(struct scclTopoLinkList* path, struct scclTopoNode* start, int maxSteps, float bw, int* steps) { float pciBw = bw; for(int step = 0; step < path->count; step++) { struct scclTopoNode* node = path->list[step]->remNode; if(node->type == CPU) { // Account for P2P inefficiency through Intel CPU RC if(path->type == PATH_PHB && start->type == GPU && node->cpu.arch == SCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) { pciBw = INTEL_P2P_OVERHEAD(bw); } } } struct scclTopoNode* node = start; for(int step = 0; step < maxSteps; step++) { struct scclTopoLink* link = path->list[step]; struct scclTopoLink* revLink = NULL; float fwBw = link->type == LINK_PCI ? pciBw : bw; float revBw = 0; if(link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) { if(revLink == NULL) SCCLCHECK(findRevLink(node, link->remNode, &revLink)); revBw += fwBw / 8; } if(link->remNode->type == CPU && link->type == LINK_NVL) { if(revLink == NULL) SCCLCHECK(findRevLink(node, link->remNode, &revLink)); revBw += fwBw; } if(link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return scclSuccess; } SUB_ROUND(link->bw, fwBw); if(revBw) SUB_ROUND(revLink->bw, revBw); node = link->remNode; } *steps = maxSteps; return scclSuccess; } // Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1). 
// Attempt to traverse from node (type1, index1) to node (type2, index2),
// charging (mult == 1) or refunding (mult == -1) the graph's bandwidth on
// every link of the precomputed path. On success *node is the destination;
// if the path type or bandwidth is insufficient, *node is left NULL and the
// partially-charged links are rewound.
static scclResult_t scclTopoFollowPath(
    struct scclTopoSystem* system, struct scclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct scclTopoNode** node) {
  // First handle easy cases
  *node = system->nodes[type2].nodes + index2;
  if(type1 == -1) return scclSuccess;  // no source: trivially "reachable"
  struct scclTopoNode* node1 = system->nodes[type1].nodes + index1;
  struct scclTopoLinkList* path = node1->paths[type2] + index2;
  struct scclTopoNode* node2 = system->nodes[type2].nodes + index2;
  struct scclTopoLinkList* revPath = node2->paths[type1] + index1;
  if(path == NULL) {
    WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
    return scclInternalError;
  }
  if(path->count == 0) return scclSuccess;  // same node, nothing to charge
  // Now check link type
  *node = NULL;
  // GPU<->GPU (and NVS) traffic is "intra"; anything else uses inter limits.
  int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
  float bw = intra ? graph->bwIntra : graph->bwInter;
  int type = intra ? graph->typeIntra : graph->typeInter;
  // Only enforce the path-type constraint when charging, not when undoing.
  if(mult == 1 && (path->type > type)) return scclSuccess;
  // Tree patterns use both directions, so the reverse path type must also
  // satisfy the constraint.
  if(mult == 1 &&
     (graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == SCCL_TOPO_PATTERN_TREE || graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) &&
     (revPath->type > type))
    return scclSuccess;
  bw *= mult;  // negative bw = refund
  // Check there is enough bandwidth on paths.
  int step = 0;
  SCCLCHECK(followPath(path, node1, path->count, bw, &step));
  if(step < path->count) goto rewind;
  // Enough bandwidth : return destination node.
  graph->nHops += mult * path->count;
  *node = system->nodes[type2].nodes + index2;
  return scclSuccess;
rewind:
  // Not enough bandwidth : rewind and exit.
  SCCLCHECK(followPath(path, node1, step, -bw, &step));
  return scclSuccess;
}

// Bandwidth of the GPU's own PCI attachment: min of the GPU->PCI link and
// the matching PCI->GPU return link. Returns -1 if no PCI link is found.
// (Note: link bandwidths are floats; the return value is truncated to int.)
static int gpuPciBw(struct scclTopoNode* gpu) {
  for(int l = 0; l < gpu->nlinks; l++) {
    struct scclTopoLink* gpuLink = gpu->links + l;
    if(gpuLink->type != LINK_PCI) continue;
    struct scclTopoNode* pci = gpuLink->remNode;
    // Inner 'l' deliberately shadows the outer loop index.
    for(int l = 0; l < pci->nlinks; l++) {
      struct scclTopoLink* pciLink = pci->links + l;
      if(pciLink->remNode != gpu) continue;
      return std::min(gpuLink->bw, pciLink->bw);
    }
  }
  return -1;
}

/* Choose the order in which we try next GPUs. This is critical for the search
   to quickly converge to the best solution even if it eventually times out. */
struct scclGpuScore {
  int g;           // Retain the index
  int startIndex;  // Least important
  int intraNhops;
  int intraBw;
  int interNhops;
  int interPciBw;
  int interBw;  // Most important
};

// qsort comparator: sorts best-first, keys ordered from most to least
// important (interBw desc, interPciBw desc, interNhops asc, startIndex desc,
// intraBw desc, intraNhops asc). The final ascending-startIndex tie-break is
// unreachable in practice since startIndex is already compared above.
static int cmpScore(const void* g1, const void* g2) {
  struct scclGpuScore* s1 = (struct scclGpuScore*)g1;
  struct scclGpuScore* s2 = (struct scclGpuScore*)g2;
  int d;
  if((d = (s2->interBw - s1->interBw))) return d;
  if((d = (s2->interPciBw - s1->interPciBw))) return d;
  if((d = (s1->interNhops - s2->interNhops))) return d;
  if((d = (s2->startIndex - s1->startIndex))) return d;
  if((d = (s2->intraBw - s1->intraBw))) return d;
  if((d = (s1->intraNhops - s2->intraNhops))) return d;
  return s1->startIndex - s2->startIndex;
}

// Returns 1 if any entry differs from the first on the intra-node keys,
// 0 if all entries have identical intra-node scores.
static int cmpIntraScores(struct scclGpuScore* scores, int count) {
  int intraBw = scores[0].intraBw;
  int intraNhops = scores[0].intraNhops;
  for(int i = 1; i < count; i++) {
    if(scores[i].intraBw != intraBw || scores[i].intraNhops != intraNhops) return 1;
  }
  return 0;
}

// Map a rank to its GPU node index in system->nodes[GPU].
static scclResult_t getGpuIndex(struct scclTopoSystem* system, int rank, int* index) {
  for(int g = 0; g < system->nodes[GPU].count; g++) {
    if(system->nodes[GPU].nodes[g].gpu.rank == rank) {
      *index = g;
      return scclSuccess;
    }
  }
  WARN("Could not find gpu rank %d", rank);
  return scclInternalError;
}

// Map a NET node id to its index in system->nodes[NET].
static scclResult_t getNetIndex(struct scclTopoSystem* system, int64_t id, int* index) {
  for(int n = 0; n < system->nodes[NET].count; n++) {
    if(system->nodes[NET].nodes[n].id == id) {
      *index = n;
      return scclSuccess;
    }
  }
  WARN("Could not find net id %lx", id);
  return scclInternalError;
}

// Return the GPU path list of the NET node used as entry point of the
// channel currently being built (graph->inter[nChannels*2]).
static scclResult_t getNetPaths(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoLinkList** netPaths) {
  int netId = graph->inter[graph->nChannels * 2];
  int n;
  SCCLCHECK(getNetIndex(system, netId, &n));
  *netPaths = system->nodes[NET].nodes[n].paths[GPU];
  return scclSuccess;
}

// Fill next[] with candidate GPU indices to try after 'gpu', best first.
// sortNet: 0 = ignore NET scores, 1 = include them, -1 = include them and
// reverse the order when all intra scores are equal (used when we will come
// back to the NIC later, to spread choices).
scclResult_t scclTopoSearchNextGpuSort(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
  const uint64_t flag = 1ULL << (graph->nChannels);  // per-channel "used" bit
  int ngpus = system->nodes[GPU].count;
  struct scclTopoLinkList* paths = gpu->paths[GPU];
  struct scclTopoLinkList* netPaths = NULL;
  if(sortNet) SCCLCHECK(getNetPaths(system, graph, &netPaths));
  struct scclGpuScore scores[SCCL_TOPO_MAX_NODES];
  memset(scores, 0, ngpus * sizeof(struct scclGpuScore));
  int start = gpu - system->nodes[GPU].nodes;
  int count = 0;
  for(int i = 1; i < ngpus; i++) {
    int g = (start + i) % ngpus;  // rotate so we start just after 'gpu'
    if(paths[g].count == 0) continue;  // There is no path to that GPU
    if(system->nodes[GPU].nodes[g].used & flag) continue;  // already in this channel
    scores[count].g = g;
    scores[count].startIndex = i;
    scores[count].intraNhops = paths[g].count;
    scores[count].intraBw = paths[g].bw;
    if(netPaths) {
      scores[count].interNhops = netPaths[g].count;
      scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes + g);
      scores[count].interBw = netPaths[g].bw;
    }
    count++;
  }
  // Sort GPUs
  qsort(scores, count, sizeof(struct scclGpuScore), cmpScore);
  // Check if all have the same intra-node score in which case we go reverse for sortNet = -1
  if(sortNet == -1 && cmpIntraScores(scores, count) == 0) {
    for(int i = 0; i < count; i++) next[i] = scores[count - 1 - i].g;
  } else {
    for(int i = 0; i < count; i++) next[i] = scores[i].g;
  }
  *countPtr = count;
  return scclSuccess;
}

scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time);

// Try to keep all searches within one second
#define SCCL_SEARCH_GLOBAL_TIMEOUT (5ULL << 16)
#define SCCL_SEARCH_TIMEOUT (1 << 14)
#define SCCL_SEARCH_TIMEOUT_TREE (1 << 14)
#define SCCL_SEARCH_TIMEOUT_SAMECHANNELS (1 << 8)

#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2

// When replaying the previous channel, return the GPU index that followed
// position 'step' in that channel's intra order.
scclResult_t scclTopoReplayGetGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, int step, int* g) {
  *g = -1;
  if(graph->nChannels == 0) return scclInternalError;  // no previous channel
  int ngpus = system->nodes[GPU].count;
  int nextRank = graph->intra[(graph->nChannels - 1) * ngpus + step + 1];
  for(int i = 0; i < ngpus; i++)
    if(system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
      *g = i;
      return scclSuccess;
    }
  if(*g == -1) return scclInternalError;
  return scclSuccess;
}

scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, struct scclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int* time);

// Try to extend the current channel to GPU 'g' coming from (type, index):
// charge the path, recurse, then mark/unmark the GPU and refund the path.
scclResult_t scclTopoSearchTryGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int* time, int type, int index, int g) {
  const uint64_t flag = 1ULL << (graph->nChannels);
  struct scclTopoNode* gpu;
  SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
  if(gpu) {
    gpu->used ^= flag;  // mark used for this channel while recursing
    SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time));
    gpu->used ^= flag;
    SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu));  // refund
  }
  return scclSuccess;
}

// Count how many consecutive (rank g -> rank n) hops in the graph's channels
// are served by a single direct NVL link (XGMI on AMD platforms).
static int scclTopoCountXGMI(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
  int count = 0;
  for(int c = 0; c < graph->nChannels; c++) {
    for(int i = 0; i < ngpus; i++) {
      int g = graph->intra[ngpus * c + i];                 // rank at this position
      int n = graph->intra[ngpus * c + ((i + 1) % ngpus)]; // next rank (ring-wise)
      struct scclTopoNode* node;
      int j;
      for(j = 0; j < ngpus; j++)
        if(system->nodes[GPU].nodes[j].gpu.rank == g) break;
      if(j < ngpus) {
        node = system->nodes[GPU].nodes + j;
        for(int k = 0; k < system->nodes[GPU].count; k++) {
          if(node->paths[GPU][k].count == 1) {  // direct one-hop path
            struct scclTopoLink* link = node->paths[GPU][k].list[0];
            struct scclTopoNode* remNode = link->remNode;
            if(remNode->gpu.rank == n) {
              if(link->type == LINK_NVL) count++;
            }
          }
        }
      }
    }
  }
  return count;
}

// NVLS pattern: check bandwidth NVS->every GPU and every GPU->NVS (GPU 'g'
// is charged double), recursing when both directions fit, then refund
// everything that was charged.
scclResult_t scclTopoSearchTryNvls(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int g, int ngpus, int* time) {
  struct scclTopoNode* nvs;
  struct scclTopoNode* gpu;
  int d0 = 0;  // See if there is enough bandwidth for NVS->GPU traffic
  do {
    SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu));
    d0++;
  } while(gpu && d0 < system->nodes[GPU].count);
  if(gpu == NULL) {
    d0--;  // last attempt failed and charged nothing: don't refund it
  } else {
    int d1 = 0;  // See if there is enough bandwidth for GPU->NVS traffic
    do {
      SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs));
      d1++;
    } while(nvs && d1 < system->nodes[GPU].count);
    if(nvs == NULL) {
      d1--;
    } else {
      // Both directions worked. Move on to the next path.
      SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
    }
    while(d1) {
      d1--;
      SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs));
    }
  }
  while(d0) {
    d0--;
    SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu));
  }
  return scclSuccess;
}

// Decide whether 'graph' is better than the best-so-far 'refGraph'.
// Sets *copy = 1 when the new graph should replace the reference.
scclResult_t scclTopoCompareGraphs(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* refGraph, int* copy) {
  // 1. Try to get the same nChannels between Rings and Trees
  if(graph->nChannels < graph->minChannels) return scclSuccess;
  if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
    // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
    if(graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
    return scclSuccess;
  }
  // 2. Try to get better bandwidth
  // Give a 15% perf bonus to paths not crossing nics
  float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
  if(graph->nChannels * graph->bwIntra > refGraph->nChannels * refGraph->bwIntra * target) {
    *copy = 1;
    return scclSuccess;
  }
  if(graph->nChannels * graph->bwIntra < refGraph->nChannels * refGraph->bwIntra * target) return scclSuccess;
  // 3. Less hops
  if(graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
  // 4. Prefer graph with more XGMI connections
  if(graph->nChannels == refGraph->nChannels && scclTopoCountXGMI(system, refGraph) < scclTopoCountXGMI(system, graph)) *copy = 1;
  return scclSuccess;
}

// Build a list of the best NETs to try.
//
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
// index when trying to get back to the NIC.
//
// The list is built the following way:
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
//    based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
//    might have been chosen by GPU 0 (case with multiple independent communicators per node)
// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
// Build nets[] (best NICs first) as described in the comment above.
// 'gpu' selects a single GPU index, or -1 for all GPUs.
scclResult_t scclTopoSelectNets(struct scclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
  int netCount = 0;
  int localNetCount;
  int* localNets;
  SCCLCHECK(scclCalloc(&localNets, system->nodes[NET].count));
  // Closest path types first: NICs reachable at distance t are appended
  // before NICs only reachable at distance t+1.
  for(int t = 0; t <= typeInter; t++) {
    for(int g = 0; g < system->nodes[GPU].count; g++) {
      if(gpu != -1 && gpu != g) continue;
      localNetCount = 0;
      // Renamed from 'gpu' (it shadowed the function parameter).
      struct scclTopoNode* gpuNode = system->nodes[GPU].nodes + g;
      struct scclTopoLinkList* paths = gpuNode->paths[NET];
      for(int n = 0; n < system->nodes[NET].count; n++) {
        if(paths[n].type == t) localNets[localNetCount++] = n;
      }
      if(localNetCount == 0) continue;
      // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
      // with multiple NICs don't use the same one as first choice.
      for(int r = 0; r < system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
        int net0 = localNets[0];
        for(int i = 0; i < localNetCount - 1; i++) localNets[i] = localNets[i + 1];
        localNets[localNetCount - 1] = net0;
      }
      // Append NICs to list (deduplicated).
      for(int i = 0; i < localNetCount; i++) {
        int n = localNets[i];
        int found = 0;
        // FIX: bounds check must come before the array read; the original
        // read nets[found] first, accessing one past the end when the NIC
        // was not already in the list.
        while(found < netCount && nets[found] != n) found++;
        if(found == netCount) nets[netCount++] = n;
      }
    }
  }
  *netCountRet = netCount;
  free(localNets);
  return scclSuccess;
}

// Recursive core of the search: place GPUs one per 'step' into the channel
// currently being built, backtracking via the used-flag and path refunds.
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, struct scclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int* time) {
  if((*time) <= 0) return scclSuccess;  // search budget exhausted
  (*time)--;

  int ngpus = system->nodes[GPU].count;
  if(step == ngpus) {
    // Determine whether we found a better solution or not
    int copy = 0;
    graph->nChannels++;
    // FIX: the last argument had been garbled into a '(c)' symbol by an
    // encoding pass; it must be the address of 'copy'.
    SCCLCHECK(scclTopoCompareGraphs(system, graph, saveGraph, &copy));
    if(copy) {
      memcpy(saveGraph, graph, sizeof(struct scclTopoGraph));
      if(graph->nChannels == graph->maxChannels) *time = -1;  // done: stop searching
    }
    if(graph->nChannels < graph->maxChannels) {
      SCCLCHECK(scclTopoSearchRec(system, graph, saveGraph, time));
    }
    graph->nChannels--;
    return
scclSuccess; } graph->intra[graph->nChannels * ngpus + step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; if(step == backToNet) { // first get back to NIC if(system->nodes[NET].count) { int startNetIndex; SCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels * 2], &startNetIndex)); struct scclTopoNode* startNet = system->nodes[NET].nodes + startNetIndex; int netcount; int* nets; SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count)); SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, g, nets, &netcount)); for(int i = 0; i < netcount; i++) { int n = nets[i]; struct scclTopoNode* net = system->nodes[NET].nodes + n; if(graph->pattern == SCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric if(graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; float bwInterSave = graph->bwInter; if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if(step == 0) nextBackToNet = 1; else if(net->id != graph->inter[graph->nChannels * 2 + 1]) continue; graph->bwInter /= 2; } SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if(net) { graph->inter[graph->nChannels * 2 + 1] = net->id; SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } free(nets); } } else if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) { SCCLCHECK(scclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time)); } else if(step < system->nodes[GPU].count - 1) { // Go to next GPU int next[SCCL_TOPO_MAX_NODES]; int count; if(forcedOrder == FORCED_ORDER_PCI) { // Try the PCI 
order next[0] = step + 1; count = 1; } else if(forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order SCCLCHECK(scclTopoReplayGetGpu(system, graph, step, next)); count = 1; } else { // Normal search SCCLCHECK(scclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step + 1 ? 1 : -1)); } for(int i = 0; i < count; i++) { SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, step + 1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i])); } } else if(step == backToFirstRank) { // Find first GPU and loop back to it int p; SCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels * ngpus], &p)); struct scclTopoNode* firstGpu; SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu)); if(firstGpu) { SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step + 1, backToNet, -1, forcedOrder, time)); SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu)); } } else { // Next path SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } return scclSuccess; } scclResult_t scclTopoSearchRecNet( struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { const int bw = graph->bwInter; int* nets; SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count)); int netcount; SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount)); for(int i = 0; i < netcount; i++) { int n = nets[i]; struct scclTopoNode* net = system->nodes[NET].nodes + n; struct scclTopoNode* gpu; if(graph->collNet && net->net.collSupport == 0) continue; if(net->net.bw < bw) continue; graph->inter[graph->nChannels * 2] = net->id; graph->latencyInter = net->net.latency; for(int i = 0; i < system->nodes[NET].count; i++) { if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { 
system->nodes[NET].nodes[i].net.bw -= bw; } } // NVLS needs to balance on all NICs if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) { SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels])); } else { if(graph->nChannels > 0) { // Try to replay the last channel int g; SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g)); SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if(graph->nChannels == 0 || graph->sameChannels == 0) { if(graph->nChannels == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long struct scclTopoLinkList* paths = net->paths[GPU]; int f = 0, f_gdr = 0; // find the first GPU that is closest to NIC for(int i = 0; i < system->nodes[GPU].count; i++) { if(paths[i].count <= paths[f].count) { // prefer GPU direct RDMA int gdr; SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr)); if(paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) { f = i; f_gdr = gdr; } } } int t = 1 << 10; SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if(t == -1) *time = -1; } // Then try the most local GPUs float maxBw = 0; int minHops = 0xfffffff; struct scclTopoLinkList* paths = net->paths[GPU]; for(int g = 0; g < system->nodes[GPU].count; g++) { if(paths[g].bw > maxBw) { maxBw = paths[g].bw; minHops = paths[g].count; } else if(paths[g].bw == maxBw && paths[g].count < minHops) { minHops = paths[g].count; } } if(maxBw >= bw) { // In the first loop, avoid using GPUs in both directions between channels (one channel // sending from that GPU and one channel receiving to that GPU), since that usually leads // to lower BW. 
for(int tryGpuBidir = 0; tryGpuBidir < 2; tryGpuBidir++) { for(int g = 0; g < system->nodes[GPU].count; g++) { if(paths[g].bw == maxBw && paths[g].count == minHops) { gpu = system->nodes[GPU].nodes + g; int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1; if(tryGpuBidir == gpuUsed) { SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } } } } } for(int i = 0; i < system->nodes[NET].count; i++) { if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { system->nodes[NET].nodes[i].net.bw += bw; } } } free(nets); return scclSuccess; } /* Search Patterns * * Intra-node * Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a * (=Split Tree Loop) * Tree : GPU a -> GPU b -> .. -> GPU x * (=Split Tree) * * Inter-node * Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic) * Tree : NET n -> GPU a -> GPU b -> .. -> GPU x * `--> NET n (or m if crossNic) * Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x * `--> NET n (or m if crossNic) * Split Tree Loop : NET n -> GPU a -> GPU b -> .. 
-> GPU x -> GPU a * `--> NET n (or m if crossNic) */ scclResult_t scclTopoSearchParams(struct scclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) { if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) { if(pattern == SCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count - 1; else if(pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1; else *backToNet = 0; *backToFirstRank = -1; } else { *backToNet = -1; if(pattern == SCCL_TOPO_PATTERN_RING) *backToFirstRank = system->nodes[GPU].count - 1; else *backToFirstRank = -1; } return scclSuccess; } scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time) { int backToNet, backToFirstRank; SCCLCHECK(scclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) { // Start from NET scclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { // Intra-node only. 
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) { SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels)); return scclSuccess; } else if(graph->nChannels == 0) { // Try PCI order first SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0)); } else { // Also try to replay previous channel int g; SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g)); SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g)); } if(graph->sameChannels == 0 || graph->nChannels == 0) { // Finally, try all other possibilities unless we are forced to use the same channels for(int g = 0; g < system->nodes[GPU].count; g++) { SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g)); } } } return scclSuccess; } /************************************/ /* User defined graph from XML file */ /************************************/ struct kvDict kvDictLinkType[] = {{"LOC", PATH_LOC}, {"NVL", PATH_NVL}, {"NVB", PATH_NVB}, {"PIX", PATH_PIX}, {"PXB", PATH_PXB}, {"PXN", PATH_PXN}, {"PHB", PATH_PHB}, {"SYS", PATH_SYS}, {NULL, 0}}; scclResult_t scclTopoGetChannelFromXml(struct scclXmlNode* xmlChannel, int c, struct scclTopoSystem* system, struct scclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int* inter = graph->inter + 2 * c; int* intra = graph->intra + ngpus * c; int n = 0, g = 0; for(int s = 0; s < xmlChannel->nSubs; s++) { struct scclXmlNode* sub = xmlChannel->subs[s]; int dev; SCCLCHECK(xmlGetAttrInt(sub, "dev", &dev)); if(strcmp(sub->name, "net") == 0) { inter[n++] = dev; } else if(strcmp(sub->name, "gpu") == 0) { int rank = -1; for(int g = 0; g < ngpus; g++) { if(system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank; } if(rank == -1) { WARN("XML Import Channel : dev %d not found.", dev); return scclSystemError; } 
intra[g++] = rank; } } return scclSuccess; } scclResult_t scclTopoGetGraphFromXmlSub(struct scclXmlNode* xmlGraph, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) { int id; SCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id)); if(graph->id != id) return scclSuccess; int crossNic; SCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic)); if(scclParamCrossNic() == 0 && crossNic == 1) return scclSuccess; graph->crossNic = crossNic; SCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern)); SCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); if(xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != scclSuccess) graph->latencyInter = 0.0; const char* str; SCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); SCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); SCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); SCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType)); SCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels)); for(int s = 0; s < xmlGraph->nSubs; s++) { SCCLCHECK(scclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph)); } *nChannels = xmlGraph->nSubs; return scclSuccess; } scclResult_t scclTopoGetGraphFromXml(struct scclXmlNode* xmlGraphs, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) { for(int s = 0; s < xmlGraphs->nSubs; s++) { SCCLCHECK(scclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels)); } return scclSuccess; } /* And the reverse : graph->xml */ scclResult_t scclTopoGetXmlFromChannel(struct scclTopoGraph* graph, int c, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) { struct scclXmlNode* xmlChannel; int ngpus = system->nodes[GPU].count; int* inter = graph->inter + 2 * c; int* intra = graph->intra + ngpus * c; SCCLCHECK(xmlAddNode(xml, 
parent, "channel", &xmlChannel)); struct scclXmlNode* node; if(system->nodes[NET].count) { SCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); SCCLCHECK(xmlSetAttrInt(node, "dev", inter[0])); } for(int g = 0; g < ngpus; g++) { SCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node)); int dev = -1; for(int i = 0; i < ngpus; i++) { if(system->nodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev; } if(dev == -1) { WARN("XML Export Channel : rank %d not found.", intra[g]); return scclInternalError; } SCCLCHECK(xmlSetAttrInt(node, "dev", dev)); } if(system->nodes[NET].count) { SCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); SCCLCHECK(xmlSetAttrInt(node, "dev", inter[1])); } return scclSuccess; } scclResult_t scclTopoGetXmlFromGraph(struct scclTopoGraph* graph, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) { struct scclXmlNode* xmlGraph; SCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph)); SCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id)); SCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern)); SCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic)); SCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra)); SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter)); SCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter)); const char* str; SCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); SCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str)); SCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType)); SCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str)); SCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels)); for(int c = 0; c < graph->nChannels; c++) { SCCLCHECK(scclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph)); } return scclSuccess; } scclResult_t scclTopoGetXmlFromGraphs(int ngraphs, struct scclTopoGraph** graphs, struct scclTopoSystem* 
system, struct scclXml* xml) { xml->maxIndex = 0; struct scclXmlNode* xmlGraphs; SCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs)); SCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", SCCL_GRAPH_XML_VERSION)); for(int g = 0; g < ngraphs; g++) { SCCLCHECK(scclTopoGetXmlFromGraph(graphs[g], system, xml, xmlGraphs)); } return scclSuccess; } #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) float speedArrayIntra[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12}; float speedArrayInter[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12}; #define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float)) #else float speedArrayIntra[] = {40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0}; float speedArrayInter[] = {48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12}; #define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float)) float sm90SpeedArrayIntra[] = {60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0}; float sm90SpeedArrayInter[] = {48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12}; #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra) / sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter) / sizeof(float)) #endif RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0); RCCL_PARAM(NChannels, "NCHANNELS", 0); scclResult_t scclTopoCompute(scclTopoSystem* system, struct scclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; graph->crossNic = scclParamCrossNic(); int crossNic = (system->nodes[NET].count > 1) && graph->crossNic && (graph->pattern == SCCL_TOPO_PATTERN_RING || graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) ? 
1 : 0;
  // Reset all search outputs before trying any strategy.
  graph->bwIntra = graph->bwInter = 0;
  graph->latencyInter = 0;
  if(graph->crossNic == 2) graph->crossNic = 0;
  graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
  graph->typeInter = PATH_PIX;
  graph->nChannels = 0;
  graph->nIntraChannels = 0;
  memset(graph->intraNets, 0, MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2 * sizeof(int));
  int trySameChannels = graph->pattern == SCCL_TOPO_PATTERN_NVLS ? 0 : 1;
  graph->sameChannels = trySameChannels;
  // Optional override: load the whole graph from an XML file given by env var.
  char* str = getenv("SCCL_GRAPH_FILE");
  if(str) {
    INFO(SCCL_ENV, "SCCL_GRAPH_FILE set by environment to %s", str);
    struct scclXml* xml;
    SCCLCHECK(scclCalloc(&xml, 1));
    SCCLCHECK(scclTopoGetXmlGraphFromFile(str, xml));
    int nChannels;
    SCCLCHECK(scclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
    INFO(SCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
    free(xml);
    if(graph->nChannels > 0) return scclSuccess;
  }
  str = getenv("SCCL_RINGS");
  char* strTrees = getenv("RCCL_TREES");
  if(str || strTrees) {
    // user supplied topo
    if(strTrees) {
      SCCLCHECK(parseGraphLight(strTrees, system, graph, NULL));
      system->treeDefined = true;
    }
    else {
      SCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
      int arch, vendor, model;
      SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
      // Remember Rome 4P2H systems so later passes can apply their limits.
      if(graph->nChannels && arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
        system->type |= RCCL_TOPO_4P2H_ROME;
      }
    }
  }
  else if(!rcclParamModelMatchingDisable() && !graph->collNet) {
    // Known platform models are matched before running the generic search.
    // try to match 8P6L
    SCCLCHECK(parseChordalRing(system, graph));
    if(graph->nChannels) return scclSuccess;
    // try to match Rome 4P2H
    SCCLCHECK(parseRome4P2H(system, graph));
    if(graph->nChannels) return scclSuccess;
    // try to match 1H16P
    SCCLCHECK(parse1H16P(system, graph));
    if(graph->nChannels) return scclSuccess;
    // try to match 4H4P
    SCCLCHECK(parse4H4P(system, graph));
  }
  if(graph->nChannels) return scclSuccess;
  if((graph->pattern == SCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) &&
(ngpus == system->nRanks)) {
    // limit single node max channels when searching ring graph on Rome
    graph->maxChannels = 2;
  }
  if(ngpus == 1)
    if(graph->pattern != SCCL_TOPO_PATTERN_RING) graph->pattern = SCCL_TOPO_PATTERN_TREE;
  int ccMin;
  SCCLCHECK(scclTopoGetCompCap(system, &ccMin, NULL));
  // NVLS needs an NVSwitch and SM90+; otherwise return with zero channels.
  if(graph->pattern == SCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return scclSuccess;
  // NOTE(review): duplicate of the ngpus==1 pattern adjustment above — harmless
  // but redundant; confirm whether one of the two can be removed.
  if(ngpus == 1)
    if(graph->pattern != SCCL_TOPO_PATTERN_RING) graph->pattern = SCCL_TOPO_PATTERN_TREE;
  if(system->nodes[NET].count == 0 && graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
    // Force intra-node NVLS algorithm to pull evenly from all GPUs.
    graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
  }
  // The search mutates a scratch copy; `graph` keeps the best result found.
  struct scclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct scclTopoGraph));
  // First try crossnic, then decrease bw and finally increase bwIntra.
  int nspeeds = 0;
  float* speedArray = NULL;
  if(system->nodes[NET].count == 0) {
    nspeeds = NSPEEDSINTRA;
    speedArray = speedArrayIntra;
  }
  else {
    nspeeds = NSPEEDSINTER;
    speedArray = speedArrayInter;
  }
  int pass = 1;
  int speedIndex = 0;
  float maxBw = system->maxBw;
  float totalBw = system->totalBw;
  // NOTE(review): when ngpus == 1 this multiplies by 1.0/0 (float inf) — the
  // subsequent while loop still terminates, but confirm this is intentional.
  if(ngpus == 1 || graph->pattern != SCCL_TOPO_PATTERN_RING) totalBw *= ngpus * 1.0 / (ngpus - 1);
  // Skip initial speeds that are obviously infeasible for this system.
  while((speedArray[speedIndex] > maxBw || speedArray[speedIndex] * graph->minChannels > totalBw) && speedIndex < nspeeds - 1) speedIndex++;
  tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
  int64_t globalTimeout = SCCL_SEARCH_GLOBAL_TIMEOUT;

search:
  // Per-attempt time budget depends on how constrained the attempt is.
  int time = tmpGraph.sameChannels ? SCCL_SEARCH_TIMEOUT_SAMECHANNELS : tmpGraph.pattern == SCCL_TOPO_PATTERN_TREE ?
SCCL_SEARCH_TIMEOUT_TREE : SCCL_SEARCH_TIMEOUT;
  tmpGraph.nChannels = 0;
  globalTimeout -= time;
  SCCLCHECK(scclTopoSearchRec(system, &tmpGraph, graph, &time));
// Disabled debug dump of each search attempt (kept verbatim; not compiled).
#if 0
  printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
  for (int c=0; cnChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; gintra[c*ngpus+g]); }
    printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
    printf("\n");
  }
#endif
  // Optimal solution, stop here
  if(time == -1) goto done;
  if(graph->nChannels * graph->bwInter >= system->totalBw) goto done;
  if(pass == 1) {
    // First pass, we don't have a solution yet ; try other options
    // Try having different channels
    if(tmpGraph.sameChannels == 1) {
      tmpGraph.sameChannels = 0;
      goto search;
    }
    tmpGraph.sameChannels = trySameChannels;
    // Refund unused time to the global budget; a full reset on perfect search.
    if(time != -1) globalTimeout += time;
    else globalTimeout = SCCL_SEARCH_GLOBAL_TIMEOUT;
    if(globalTimeout < 0 && graph->nChannels) goto done;
    tmpGraph.pattern = graph->pattern;
    // Relax the intra-node path type, one step at a time.
    int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
    if(tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
      tmpGraph.typeIntra += 1;
      goto search;
    }
    tmpGraph.typeIntra = ngpus == 1 ?
PATH_LOC : PATH_NVL;
    // Relax the inter-node path type, one step at a time.
    if(system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
      tmpGraph.typeInter += 1;
      goto search;
    }
    tmpGraph.typeInter = PATH_PIX;
    if(crossNic && tmpGraph.crossNic == 0) {
      // Try again with crossNic if permitted
      tmpGraph.crossNic = crossNic;
      goto search;
    }
    tmpGraph.crossNic = 0;
    // Decrease bw until we find a solution
    if((speedIndex < nspeeds - 1) && (graph->nChannels == 0 || (speedArray[speedIndex + 1] / graph->bwInter > .49))) {
      tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
      goto search;
    }
    // Out of options: reset the speed index for the second pass below.
    speedIndex = 0;
    while(speedArray[speedIndex] > maxBw && speedIndex < nspeeds - 1) speedIndex++;
    tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
  }

done:
  // We have a solution. Start from that solution and move to pass 2.
  if(pass == 1) {
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
    speedIndex = 0;
    while(speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds - 1) speedIndex++;
    tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
    tmpGraph.minChannels = graph->nChannels;
    pass = 2;
  }
  // 3.
// See if we can increase bwIntra for trees (2 nodes or collnet)
  if(pass == 2) {
    if(time != 0 && graph->pattern != SCCL_TOPO_PATTERN_RING && tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter * 2 && speedIndex > 0) {
      tmpGraph.bwIntra = speedArray[--speedIndex];
      goto search;
    }
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
  }
  // Nothing found: fall back to a trivial rank-order channel at minimal bw.
  if(graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
    WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
    for(int i = 0; i < ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
    graph->inter[0] = graph->inter[1] = 0;
    graph->bwIntra = graph->bwInter = 0.1;
    graph->typeIntra = graph->typeInter = PATH_SYS;
    graph->nChannels = 1;
  }
  if(graph->nChannels == 0) return scclSuccess;
  if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) return scclSuccess;
  if(graph->bwIntra < 25.0) return scclSuccess;
  if(ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return scclSuccess;
  // High-bandwidth result: duplicate channels (splitting bw between copies).
  int dupChannels = std::min(graph->nChannels * 2, graph->maxChannels);
  memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, (dupChannels - graph->nChannels) * ngpus * sizeof(int));
  memcpy(graph->inter + graph->nChannels * 2, graph->inter, (dupChannels - graph->nChannels) * 2 * sizeof(int));
  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
  graph->nChannels = dupChannels;
  // Optional user-forced channel count (NCHANNELS parameter).
  int nc = rcclParamNChannels();
  if(graph->nChannels > 0 && nc > 0 && nc <= MAXCHANNELS / 2 && nc > graph->nChannels) {
    int nChannels = nc - graph->nChannels;
    int nnets = system->nodes[NET].count;
    if(nnets <= 2) {
      // Few NICs: simply replicate the first channel.
      for(int i = 0; i < nChannels; ++i) {
        memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
        memcpy(graph->inter + graph->nChannels * 2, graph->inter, 2 * sizeof(int));
        memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
        graph->nChannels++;
      }
    }
else {
      // More than 2 NICs: spread the extra channels across the least-used NICs,
      // tracked with a per-NIC usage counter kept sorted ascending by use.
      typedef struct {
        int id;
        int used;
      } Net;
      Net nets[nnets];
      auto sortFunc = [](const void* a, const void* b) -> int { return ((Net*)a)->used - ((Net*)b)->used; };
      memset(nets, 0, nnets * sizeof(Net));
      for(int i = 0; i < nnets; ++i) {
        nets[i].id = system->nodes[NET].nodes[i].id;
      }
      // Seed usage counts from the channels found by the search.
      for(int i = 0; i < graph->nChannels; ++i) {
        for(int j = 0; j < nnets; ++j) {
          if(nets[j].id == *(graph->inter + i * 2) || nets[j].id == *(graph->inter + i * 2 + 1)) {
            nets[j].used++;
          }
        }
      }
      for(int i = 0; i < nChannels; ++i) {
        memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
        qsort(nets, nnets, sizeof(Net), sortFunc);
        *(graph->inter + graph->nChannels * 2) = nets[0].id;
        nets[0].used++;
        qsort(nets, nnets, sizeof(Net), sortFunc);
        if(graph->crossNic == 0 || graph->crossNic == 2) {
          // No crossNic: use the same (now least-used) NIC on both ends.
          *(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
          nets[0].used++;
          qsort(nets, nnets, sizeof(Net), sortFunc);
        }
        else {
          nets[0].used++;
          qsort(nets, nnets, sizeof(Net), sortFunc);
          *(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
        }
        nets[0].used++;
        memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
        graph->nChannels++;
      }
    }
    graph->bwIntra /= DIVUP(nc, graph->nChannels);
    graph->bwInter /= DIVUP(nc, graph->nChannels);
  }
  return scclSuccess;
}

// Log a human-readable summary of a search graph: one header line plus one
// line per channel listing its NET endpoints and GPU ordering.
scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
  INFO(SCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->bwIntra, graph->bwInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels);
  int ngpus = system->nodes[GPU].count;
  char line[1024];
  for(int c = 0; c < graph->nChannels; c++) {
    sprintf(line, "%2d :", c);
    int offset = strlen(line);
    // Leading NET endpoint (multi-node, non-intra-channel graphs only).
    if(system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
      sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2
* c]);
      offset = strlen(line);
    }
    for(int i = 0; i < ngpus; i++) {
      // intraNets entries are stored biased by 'N'; subtract to get a NET index.
      int n = graph->intraNets[(ngpus * c + i) * 2] - 'N';
      if(n >= 0 && n < system->nodes[NET].count) {
        sprintf(line + offset, " NET/%d", n);
        offset = strlen(line);
      }
      sprintf(line + offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus * c + i]);
      offset = strlen(line);
      n = graph->intraNets[(ngpus * c + i) * 2 + 1] - 'N';
      if(n >= 0 && n < system->nodes[NET].count) {
        sprintf(line + offset, " NET/%d", n);
        offset = strlen(line);
      }
    }
    // Trailing NET endpoint, symmetric with the leading one above.
    if(system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
      sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c + 1]);
      offset = strlen(line);
    }
    INFO(SCCL_GRAPH, "%s", line);
  }
  return scclSuccess;
}

// Dump all graphs as XML to the file named by SCCL_GRAPH_DUMP_FILE (if set).
scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs) {
  char* str = getenv("SCCL_GRAPH_DUMP_FILE");
  if(str) {
    INFO(SCCL_ENV, "SCCL_GRAPH_DUMP_FILE set by environment to %s", str);
    struct scclXml* xml;
    SCCLCHECK(scclCalloc(&xml, 1));
    SCCLCHECK(scclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
    SCCLCHECK(scclTopoDumpXmlToFile(str, xml));
    free(xml);
  }
  return scclSuccess;
}

#include "comm.h"

// NVLS channels aren't compute channels.
// Find which NIC corresponds to our rank being the head
scclResult_t getNvlsNetDev(struct scclComm* comm, struct scclTopoGraph* graph, int* dev) {
  int localRanks = comm->topo->nodes[GPU].count;
  for(int c = 0; c < graph->nChannels; c++) {
    // Our rank heads channel c when it is the channel's first intra entry.
    if(graph->intra[c * localRanks] == comm->rank) {
      *dev = graph->inter[c * 2];
      return scclSuccess;
    }
  }
  WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
  return scclInternalError;
}

// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
SCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);

// Pick the NET device (and proxy rank) `rank` should use on `channelId`,
// either from a search graph or, for P2P, from locality/PXN heuristics.
scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
  if(graph) {
    // Honor the net device in the graph
    int channel = channelId % graph->nChannels;
    int ngpus = comm->topo->nodes[GPU].count;
    // Index 0 when we are the channel head, 1 otherwise (tail endpoint).
    int index = graph->intra[channel * ngpus] == rank ? 0 : 1;
    if(graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
      *dev = graph->inter[channel * 2 + index];
    }
    else {
      SCCLCHECK(getNvlsNetDev(comm, graph, dev));
    }
    SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
  }
  else if(peerRank == -1) {
    return scclInternalError;
  }
  else {
    // Start with our local NIC and local Rank
    SCCLCHECK(scclTopoGetLocalNet(comm->topo, rank, channelId, dev));
    *proxyRank = rank;
    int pxnLevel = scclPxnDisable(comm) == 1 ? 0 : scclParamP2pPxnLevel();
    // See whether we can use the remote rank preferred device.
if(scclParamCrossNic() == 0 || (pxnLevel != 0)) {
      // Find local NIC number close to local cudaDev
      int cudaDev = comm->peerInfo[peerRank].cudaDev;
      int localRank;
      // Peer's device may not map to a local rank; keep the default NIC then.
      if(scclTopoDevToRank(comm->topo, cudaDev, &localRank) != scclSuccess) return scclSuccess;
      int netDev;
      SCCLCHECK(scclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
      int n;
      // Check that device exists on our node
      if(scclParamCrossNic() == 0) {
        if(scclTopoIdToIndex(comm->topo, NET, netDev, &n) != scclSuccess) {
          WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
          return scclInvalidUsage;
        }
        *dev = netDev;
      }
      if(pxnLevel == 1) {
        // PXN only if our own GPU can reach that NIC through a PXN-class path.
        int g, n;
        SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g));
        SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
        struct scclTopoNode* gpu = comm->topo->nodes[GPU].nodes + g;
        if(gpu->paths[NET][n].type <= PATH_PXN) {
          *dev = netDev;
          SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
        }
      }
      else if(pxnLevel == 2) {
        // Check which local GPU corresponds to that NIC and see if we can use PXN.
int n, g1, g2;
        SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
        SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g1));
        SCCLCHECK(scclTopoGetLocalGpu(comm->topo, netDev, &g2));
        if(g2 != -1) {
          // Route through the NIC-local GPU when it is NVLink-reachable from us
          // and close (PXB or better) to the NIC.
          struct scclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes + g2;
          if(peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
            *proxyRank = peerGpu->gpu.rank;
            *dev = netDev;
            return scclSuccess;
          }
        }
      }
    }
  }
  return scclSuccess;
}

// Resolve the NET device assigned to `rank` on an intra-node channel; writes
// -1 to *dev when the graph has no intra channels or no match is found.
scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev) {
  *dev = -1;
  if(graph && graph->nIntraChannels) {
    int n1 = -1;
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    int chan = channelId % graph->nIntraChannels;
    for(int i = 0; i < ngpus; i++) {
      if(graph->intra[ngpus * chan + i] == rank) {
        // intraNets entries are stored biased by 'N'; subtract to get a NET index.
        n1 = graph->intraNets[(ngpus * chan + i) * 2 + type] - 'N';
        break;
      }
    }
    if(n1 >= 0 && n1 < nnets) {
      *dev = n1;
    }
  }
  return scclSuccess;
}

// Report whether cudaDev1 and cudaDev2 are XGMI-connected, optionally allowing
// up to maxInter intermediate GPUs (explored recursively via nInter/inter).
scclResult_t scclTopoGetLinkType(struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int* inter) {
  int interGpus[MAX_XGMI_INTER_GPUS + 1];
  int ngpus = system->nodes[GPU].count;
  *isXGMI = false;
  // check for direct XGMI connection
  for(int i = 0; i < ngpus; i++) {
    if(system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
      struct scclTopoNode* node = system->nodes[GPU].nodes + i;
      for(int k = 0; k < system->nodes[GPU].count; k++) {
        // A single-hop path whose one link lands on cudaDev2 decides the type.
        if(node->paths[GPU][k].count == 1) {
          struct scclTopoLink* link = node->paths[GPU][k].list[0];
          struct scclTopoNode* remNode = link->remNode;
          if(remNode->gpu.dev == cudaDev2) {
            *isXGMI = (link->type == LINK_NVL);
            if(*isXGMI) return scclSuccess;
          }
        }
      }
    }
  }
  // try intermediate GPUs
  if(maxInter) {
    // check if there are intermediate GPUs that are connected to both
    bool res1, res2, res3;
    int j;
    // Verify the chain of intermediates passed in is itself XGMI-connected.
    for(j = 0; j < nInter; j++) {
      scclTopoGetLinkType(system, inter[j], inter[j + 1], &res1, 0);
      if(!res1) break;
    }
    if(j < nInter) return
scclSuccess;
    if(nInter > 0 && inter != nullptr) {
      // Does the last intermediate reach the destination directly?
      scclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
      if(res2) {
        *isXGMI = true;
        return scclSuccess;
      }
      memcpy(interGpus + 1, inter + 1, sizeof(int) * nInter);
    }
    interGpus[0] = cudaDev1;
    // add one more intermediate GPU recursively until reaching max depth
    nInter++;
    if(nInter + 2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return scclSuccess;
    for(int i = 0; i < ngpus; i++) {
      int dev = system->nodes[GPU].nodes[i].gpu.dev;
      // skip duplicated GPU
      if(dev == cudaDev2) continue;
      for(j = 0; j < nInter; j++)
        if(dev == interGpus[j]) break;
      if(j < nInter) continue;
      // check connectivity with intermediate GPUs
      interGpus[nInter] = dev;
      scclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
      if(res3) {
        *isXGMI = true;
        return scclSuccess;
      }
    }
  }
  return scclSuccess;
}

} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl