#include "core.h" #include "graph.h" #include "topo.h" #include "comm.h" #include "net.h" #include "channel.h" #include "xml.h" namespace sccl { namespace hardware { namespace topology { namespace graph { // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths struct scclTopoNodeList { struct scclTopoNode* list[SCCL_TOPO_MAX_NODES]; int count; }; static scclResult_t getPath(struct scclTopoSystem* system, struct scclTopoNode* node, int t, int64_t id, struct scclTopoLinkList** path) { for(int i = 0; i < system->nodes[t].count; i++) { if(system->nodes[t].nodes[i].id == id) { *path = node->paths[t] + i; return scclSuccess; } } WARN("Could not find node of type %d id %lx", t, id); return scclInternalError; } static scclResult_t scclTopoSetPaths(struct scclTopoNode* baseNode, struct scclTopoSystem* system) { if(baseNode->paths[baseNode->type] == NULL) { SCCLCHECK(scclCalloc(baseNode->paths + baseNode->type, system->nodes[baseNode->type].count)); } // breadth-first search to set all paths to that node in the system struct scclTopoNodeList nodeList; struct scclTopoNodeList nextNodeList; nodeList.count = 1; nodeList.list[0] = baseNode; nextNodeList.count = 0; struct scclTopoLinkList* basePath; SCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; basePath->bw = LOC_BW; basePath->type = PATH_LOC; while(nodeList.count) { nextNodeList.count = 0; for(int n = 0; n < nodeList.count; n++) { struct scclTopoNode* node = nodeList.list[n]; struct scclTopoLinkList* path; SCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path)); for(int l = 0; l < node->nlinks; l++) { struct scclTopoLink* link = node->links + l; struct scclTopoNode* remNode = link->remNode; if(remNode->paths[baseNode->type] == NULL) { SCCLCHECK(scclCalloc(remNode->paths + baseNode->type, system->nodes[baseNode->type].count)); } struct scclTopoLinkList* remPath; SCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); float bw = std::min(path->bw, link->bw); // allow routing through a GPU only as 1 hop if(node != baseNode && node->type == GPU && (link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue; if((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) { // Find reverse link for(int l = 0; l < remNode->nlinks; l++) { if(remNode->links[l].remNode == node) { remPath->list[0] = remNode->links + l; break; } } if(remPath->list[0] == NULL) { WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx", remNode->type, remNode->id, remNode->nlinks, node->type, node->id); return scclInternalError; } // Copy the rest of the path for(int i = 0; i < path->count; i++) remPath->list[i + 1] = path->list[i]; remPath->count = path->count + 1; remPath->bw = bw; // Start with path type = link type. PATH and LINK types are supposed to match. // Don't consider LINK_NET as we only care about the NIC->GPU path. int type = link->type == LINK_NET ? 
          // Start with path type = link type. PATH and LINK types are supposed to match.
          // Don't consider LINK_NET as we only care about the NIC->GPU path.
          int type = link->type == LINK_NET ? LINK_LOC : link->type;
          // Differentiate between one and multiple PCI switches
          if(node->type == PCI && remNode->type == PCI) type = PATH_PXB;
          // Consider a path going through the CPU as PATH_PHB
          if(link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
          // Set 1 hop NVLink as NVB
          // if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
          remPath->type = std::max(path->type, type);

          // Add to the list for the next iteration if not already in the list
          // Disallow GPUs as intermediate steps for now
          if(remNode->type != GPU) {
            int i;
            for(i = 0; i < nextNodeList.count; i++)
              if(nextNodeList.list[i] == remNode) break;
            if(i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
          }
        }
      }
    }
    memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
  }
  return scclSuccess;
}

/**
 * Print a node's path information.
 *
 * @param system Pointer to the topology system
 * @param node   Pointer to the node whose paths should be printed
 *
 * Outputs the path information of the given node, including the target node
 * type and id, the hop count, the bandwidth and the path type string,
 * formatted as a single line.
 */
static void printNodePaths(struct scclTopoSystem* system, struct scclTopoNode* node) {
  char line[1024];
  sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
  int offset = strlen(line);
  for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
    if(node->paths[t] == NULL) continue;
    for(int n = 0; n < system->nodes[t].count; n++) {
      sprintf(line + offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id,
              node->paths[t][n].count, node->paths[t][n].bw, topoPathTypeStr[node->paths[t][n].type]);
      offset = strlen(line);
    }
  }
  // Emit the assembled path summary as a single line.
  INFO(SCCL_LOG_TOPO, "%s", line);
}

static scclResult_t getLocalCpu(struct scclTopoSystem* system, int gpu, int* retCpu) {
  // Find the closest CPU to a GPU
  int minHops = 0;
  int localCpu = -1;
  struct scclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
  for(int c = 0; c < system->nodes[CPU].count; c++) {
    int hops = paths[c].count;
    if(minHops == 0 || hops < minHops) {
      localCpu = c;
      minHops = hops;
    }
  }
  if(localCpu == -1) {
    WARN("Error : could not find CPU close to GPU %d", gpu);
    return scclInternalError;
  }
  *retCpu = localCpu;
  return scclSuccess;
}

static scclResult_t addInterStep(struct scclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
  struct scclTopoNode* cpuNode = system->nodes[tx].nodes + ix;
  struct scclTopoNode* srcNode = system->nodes[t1].nodes + i1;

  int l = 0;
  // Node 1 -> CPU
  for(int i = 0; i < srcNode->paths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
  // CPU -> Node 2
  for(int i = 0; i < cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];

  // Update path characteristics
  srcNode->paths[t2][i2].count = l;
  srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
  if(tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
  srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
  return scclSuccess;
}

// Remove/free paths for a given type
static void scclTopoRemovePathType(struct scclTopoSystem* system, int nodeType) {
  for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
    // Remove links _to_ the given type
    for(int n = 0; n < system->nodes[t].count; n++) {
      struct scclTopoNode* node = system->nodes[t].nodes + n;
      free(node->paths[nodeType]);
      node->paths[nodeType] = NULL;
    }
    // Remove links _from_ the given type
    for(int n = 0; n < system->nodes[nodeType].count; n++) {
      struct scclTopoNode* node = system->nodes[nodeType].nodes + n;
      free(node->paths[t]);
      node->paths[t] = NULL;
    }
  }
}
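// Legacy numeric levels accepted from the environment (see scclGetLevel below):
// index 0 -> PATH_LOC, 1 -> PATH_PIX, 2 -> PATH_PXB, 3 -> PATH_PHB, 4 and 5 -> PATH_SYS.
// For example, SCCL_NET_GDR_LEVEL=2 is interpreted as PATH_PXB.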
static const int levelsOldToNew[] = {PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS};

scclResult_t scclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
  if(*level == -1) {
    int l = -1;
    if(disableEnv) {
      char* str = getenv(disableEnv);
      if(str) {
        int disable = strtol(str, NULL, 0);
        if(disable == 1) l = 0;
      }
    }
    if(l == -1) {
      char* str = getenv(levelEnv);
      if(str) {
        for(int i = 0; i <= PATH_SYS; i++) {
          if(strcmp(str, topoPathTypeStr[i]) == 0) {
            l = i;
            break;
          }
        }
        // Old style numbering
        // levelsOldToNew is an array with each index corresponding to the
        // "old level" int, and each value mapping to the correct value defined in topo.h
        // maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
        if(l == -1 && str[0] >= '0' && str[0] <= '9') {
          int oldLevel = strtol(str, NULL, 0);
          const int maxOldLevel = sizeof(levelsOldToNew) / sizeof(int) - 1;
          if(oldLevel > maxOldLevel) oldLevel = maxOldLevel;
          l = levelsOldToNew[oldLevel];
        }
      }
    }
    if(l >= 0) INFO(SCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
    *level = l >= 0 ? l : -2;
  }
  return scclSuccess;
}

SCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
int scclTopoUserGdrLevel = -1;

scclResult_t scclTopoCheckGdr(struct scclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
  *useGdr = 0;

  // Get GPU and NET
  int n, g;
  SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
  struct scclTopoNode* net = system->nodes[NET].nodes + n;
  SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
  struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;

  // Check that both the NIC and GPUs support it
  if(net->net.gdrSupport == 0) return scclSuccess;
  if(gpu->gpu.gdrSupport == 0) return scclSuccess;

  if(read) {
    // For reads (sends) only enable under certain conditions
    int gdrReadParam = scclParamNetGdrRead();
    if(gdrReadParam == 0) return scclSuccess;
    if(gdrReadParam < 0) {
      int nvlink = 0;
      // Since we don't know whether there are other communicators,
      // it's better to keep things local if we have a single GPU.
      if(system->nodes[GPU].count == 1) nvlink = 1;
      for(int i = 0; i < system->nodes[GPU].count; i++) {
        if(i == g) continue;
        if(gpu->paths[GPU][i].type == PATH_NVL) {
          nvlink = 1;
          break;
        }
      }
      if(!nvlink) return scclSuccess;
    }
  }
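  // netGdrLevel is the maximum allowed GPU<->NIC distance for enabling GDR. The default
  // below (used when system->netGdrLevel is -2 and no environment override is given) is
  // PATH_PXB, i.e. GPU and NIC may be separated by PCIe switches but not by the CPU.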
  // Check if we are close enough that it makes sense to enable GDR
  int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
  SCCLCHECK(scclGetLevel(&scclTopoUserGdrLevel, NULL, "SCCL_NET_GDR_LEVEL"));
  if(scclTopoUserGdrLevel != -2)
    netGdrLevel = scclTopoUserGdrLevel;
  else {
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
      int i, d1 = -1, d2 = -1;
      for(i = 0; i < system->nodes[CPU].count; i++)
        if(system->nodes[GPU].nodes[g].paths[CPU][i].count == 2) break;
      if(i < system->nodes[CPU].count) d1 = system->nodes[CPU].nodes[i].id;
      for(i = 0; i < system->nodes[CPU].count; i++)
        if(system->nodes[NET].nodes[n].paths[CPU][i].count == 2) break;
      if(i < system->nodes[CPU].count) d2 = system->nodes[CPU].nodes[i].id;
      if(d1 != -1 && d2 != -1 && d1 == d2 &&
         (system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000)) {
        netGdrLevel = PATH_PHB;
      }
    }
  }

  int distance = gpu->paths[NET][n].type;
  if(distance == PATH_PXN) {
    // In case of PXN, use the intermediate GPU distance instead
    int proxyRank, g;
    SCCLCHECK(scclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
    SCCLCHECK(scclTopoRankToIndex(system, proxyRank, &g));
    struct scclTopoNode* proxyGpu = system->nodes[GPU].nodes + g;
    distance = proxyGpu->paths[NET][n].type;
  }
  if(distance > netGdrLevel) {
    INFO(SCCL_NET, "GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
    return scclSuccess;
  }

  *useGdr = 1;
  INFO(SCCL_NET, "GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
  return scclSuccess;
}

// Set to 0 to disable the flush on Hopper when using GDR
SCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);

// Determine whether we need to flush the GDR recv buffers
scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush) {
  int g;
  SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
  struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
  // Flush is required on Ampere and earlier
  *flush = gpu->gpu.cudaCompCap < 90 ? 1 : scclParamNetForceFlush();
  return scclSuccess;
}

SCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);

// Check whether going through the network would be faster than going through P2P/SHM.
scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
  if(scclParamNetDisableIntra() == 1) {
    *net = 0;
    return scclSuccess;
  }
  *net = 1;

  // First check the current GPU-to-GPU speed.
  int g1, g2;
  if(scclTopoIdToIndex(system, GPU, id1, &g1) != scclSuccess ||
     scclTopoIdToIndex(system, GPU, id2, &g2) != scclSuccess) {
    return scclSuccess;
  }
  struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
  struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
  float speed = gpu1->paths[GPU][g2].bw;

  // Now check the speed each GPU can access the network through PXB or better
  float netSpeed1 = 0, netSpeed2 = 0;
  for(int n = 0; n < system->nodes[NET].count; n++) {
    struct scclTopoLinkList* path = gpu1->paths[NET] + n;
    if(path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw;
    path = gpu2->paths[NET] + n;
    if(path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw;
  }

  if(netSpeed1 > speed && netSpeed2 > speed) return scclSuccess;
  *net = 0;
  return scclSuccess;
}

scclResult_t scclTopoGetIntermediateRank(struct scclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
  // Get GPU and NET
  int n, g;
  SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
  SCCLCHECK(scclTopoRankToIndex(system, rank, &g));
  struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
  struct scclTopoLinkList* path = gpu->paths[NET] + n;
  if(path->type == PATH_PXN) {
    struct scclTopoNode* node;
    int type = NVS;
    for(int i = 0; i < path->count && type == NVS; i++) {
      node = path->list[i]->remNode;
      type = node->type;
    }
    if(type != GPU) {
      WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
      return scclInternalError;
    }
    *intermediateRank = node->gpu.rank;
  } else {
    *intermediateRank = rank;
  }
  return scclSuccess;
}

SCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);

// Net v4 plugins don't have non-blocking connect/accept. We therefore can't use
// remote proxies without risking deadlocks.
int scclPxnDisable(struct scclComm* comm) {
  static int pxnDisable = -1;
  if(pxnDisable == -1) {
    if(comm && scclNetVersion(comm) == 4) {
      INFO(SCCL_INIT, "PXN Disabled as plugin is v4");
      pxnDisable = 1;
    } else {
      pxnDisable = scclParamPxnDisable();
    }
  }
  return pxnDisable;
}

scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks) {
  struct scclTopoSystem* system = comm->topo;
  *nranks = 0;
  *intermediateRanks = NULL;
  if(system->nodes[NET].count == 0) return scclSuccess;

  int nr = 0;
  int* ranks = NULL;
  for(int rank = 0; rank < comm->nRanks; rank++) {
    int netDev, proxyRank;
    SCCLCHECK(scclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
    if(proxyRank == comm->rank) continue;
    int useGdr;
    SCCLCHECK(scclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
    if(useGdr == 0) continue;
    int found = 0;
    for(int r = 0; r < nr; r++) {
      if(ranks[r] == proxyRank) found = 1;
    }
    if(!found) {
      SCCLCHECK(scclRealloc(&ranks, nr, nr + 1));
      ranks[nr++] = proxyRank;
    }
  }
  *nranks = nr;
  *intermediateRanks = ranks;
  return scclSuccess;
}

// Check whether every GPU has a NET node whose PCI bus id differs from the GPU id by
// exactly 'distance'. If so, force those GPU<->NIC paths to PATH_PXB and return true;
// otherwise leave the paths untouched and return false.
static bool rcclPathOverride(struct scclTopoSystem* system, uint64_t distance) {
  int i, j;
  for(i = 0; i < system->nodes[GPU].count; i++) {
    for(j = 0; j < system->nodes[NET].count; j++) {
      if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
         (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
        break;
    }
    if(j >= system->nodes[NET].count) break;
  }
  if(i >= system->nodes[GPU].count) {
    for(i = 0; i < system->nodes[GPU].count; i++) {
      for(j = 0; j < system->nodes[NET].count; j++) {
        if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
           (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
          system->nodes[GPU].nodes[i].paths[NET][j].type = PATH_PXB;
      }
    }
    return true;
  } else {
    return false;
  }
}

RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);

scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm) {
  int* domains;
  int64_t* ids;
  SCCLCHECK(scclCalloc(&domains, system->nodes[GPU].count));
  SCCLCHECK(scclCalloc(&ids, system->nodes[GPU].count));
  int myDomain = 0;
  for(int g = 0; g < system->nodes[GPU].count; g++) {
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
    domains[g] = g;
    ids[g] = gpu->id;
    for(int p = 0; p < g; p++) {
      if(gpu->paths[GPU][p].type < PATH_NET) {
        domains[g] = std::min(domains[g], domains[p]);
      }
    }
    if(gpu->gpu.rank == comm->rank) myDomain = domains[g];
  }

  int ngpus = system->nodes[GPU].count;
  for(int i = 0; i < ngpus; i++) {
    if(domains[i] == myDomain) continue;
    struct scclTopoNode* gpu = NULL;
    int g;
    for(g = 0; g < system->nodes[GPU].count /* This one varies over the loops */; g++) {
      gpu = system->nodes[GPU].nodes + g;
      if(gpu->id == ids[i])
        break;
      else
        gpu = NULL;
    }
    if(gpu == NULL) {
      WARN("Could not find id %lx", ids[i]);
      free(domains);
      free(ids);
      return scclInternalError;
    }
    SCCLCHECK(scclTopoRemoveNode(system, GPU, g));
  }

  // trim low speed port on same NIC
  for(int i = 0; i < system->nodes[NET].count; i++) {
    for(int j = 0; j < system->nodes[NET].count; j++) {
      if(i == j) continue;
      if(system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
        if(system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw) system->nodes[NET].nodes[j].net.bw = 0;
      }
    }
  }
  do {
    int n;
    for(n = 0; n < system->nodes[NET].count; n++) {
      if(system->nodes[NET].nodes[n].net.bw == 0) break;
    }
    if(n < system->nodes[NET].count) {
      SCCLCHECK(scclTopoRemoveNode(system, NET, n));
    } else
      break;
  } while(system->nodes[NET].count);

  int remove = 1;
  int gdr = 1;
  bool allXgmi = true;

  // detect if all GPUs are connected by XGMI
  for(int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
    int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
    for(int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
      if(i == j) continue;
      int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
      bool isXGMI;
      SCCLCHECK(scclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
      allXgmi &= isXGMI;
    }
  }
  if(allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;

  for(int g = 0; g < system->nodes[GPU].count; g++) {
    int net;
    SCCLCHECK(scclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
    SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
    if(!gdr) break;
  }
  if(gdr && !allXgmi) {
    remove = 0;
    system->type |= RCCL_TOPO_GDR_ALL;
    INFO(SCCL_LOG_TOPO, "GDR is available on all GPUs");
  }

  // Special handling of gfx94x
  if(rcclParamEnableIntranet() == 1 ||
     (rcclParamEnableIntranet() == -2 && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
      system->nodes[GPU].count == 8 && system->nodes[NET].count == 8)) {
    remove = 0;
    system->type |= RCCL_TOPO_FORCE_INTRA;
  }

  comm->localRanks = system->nodes[GPU].count;
  if(system->nodes[GPU].count == comm->nRanks && remove) {
    for(int n = system->nodes[NET].count - 1; n >= 0; n--) SCCLCHECK(scclTopoRemoveNode(system, NET, n));
  }
  free(domains);
  free(ids);
  return scclSuccess;
}

void scclTopoFree(struct scclTopoSystem* system) {
  for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) scclTopoRemovePathType(system, t);
  free(system);
}

SCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
SCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", 4);
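// Decide how many channels to use toward a given peer rank: -1 for self, a bandwidth-scaled
// count for XGMI/NVLink-connected local GPUs, 2 for other local paths, and the
// NCHANNELS_PER_NET_PEER setting for remote (network) peers.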
static scclResult_t scclTopoGetNchannels(struct scclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
  int peer;
  struct scclTopoLinkList* path = NULL;
  if(scclTopoRankToIndex(system, peerRank, &peer) == scclSuccess) {
    // Same rank
    if(g == peer) {
      *nChannels = -1;
      return scclSuccess;
    }
    // Local rank
    path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
    if(path->type == PATH_NVL) {
      float nvlBw = scclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
      *nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2) * std::max(1, (int)(path->bw / nvlBw));
    } else {
      *nChannels = 2;
    }
  } else {
    // Remote rank, use network
    *nChannels = scclParamNChannelsPerNetPeer();
  }
  return scclSuccess;
}

SCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 4);
SCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);

static int nextPow2(int v) {
  int pow2 = 1;
  while(pow2 < v) pow2 <<= 1;
  return pow2;
}

scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm) {
  /* here we already honor comm->max/minCTAs for p2pnChannels. */
  int MinP2pNchannels = (int)scclParamMinP2pNChannels();
  int MaxP2pNchannels = (int)scclParamMaxP2pNChannels();
  int NchannelsPerPeer = (int)scclParamNChannelsPerPeer();
  if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_P2P_NCHANNELS") == NULL) MinP2pNchannels = 32;
  if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_P2P_NCHANNELS") == NULL) MaxP2pNchannels = 32;
  if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_NCHANNELS_PER_PEER") == NULL) NchannelsPerPeer = 32;
  int scclMinP2pNchannels = MinP2pNchannels;

  if(comm->sharedRes->owner != comm) {
    comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
    comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, scclMinP2pNchannels), comm->sharedRes->tpP2pNChannels);
  } else {
    comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
    comm->p2pnChannels = std::max(comm->p2pnChannels, scclMinP2pNchannels);
  }

  int minChannels = comm->p2pnChannels;
  // We need to loop through all local GPUs to have a global picture
  for(int g = 0; g < comm->topo->nodes[GPU].count; g++) {
    for(int r = 0; r < comm->nRanks; r++) {
      int nChannels;
      SCCLCHECK(scclTopoGetNchannels(comm->topo, g, r, &nChannels));
      if(nChannels >= 0) minChannels = std::min(minChannels, nChannels);
    }
  }

  int arch, vendor, model;
  SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));

  // Round to next pow2 nChannelsPerPeer and nChannels
  if(getNumaMaxGpus() == 1 && !scclTopoPathAllNVLink(comm->topo)) {
    comm->p2pnChannelsPerPeer = nextPow2(comm->p2pnChannels);
  } else {
    comm->p2pnChannelsPerPeer = (NchannelsPerPeer == -2 ? nextPow2(minChannels) : NchannelsPerPeer);
  }
  comm->p2pnChannels = nextPow2(comm->p2pnChannels);

  // Init channels that weren't used so far
  for(int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++) SCCLCHECK(initChannel(comm, c));

  // We want to spread channels used when there aren't many and progressively
  // fill the whole space of nChannels. To do so we mirror the bits in the
  // nChannels space.
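  // For example, with p2pnChannels == 8 the 3-bit channel index is bit-reversed:
  // c=1 (001b) -> 4 (100b), c=2 (010b) -> 2 (010b), c=3 (011b) -> 6 (110b),
  // so the first few channels used are spread evenly across [0, p2pnChannels).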
  for(int c = 0; c < comm->p2pnChannels; c++) {
    int mirror = 0;
    for(int b = 1, mb = (comm->p2pnChannels >> 1); b < comm->p2pnChannels; b <<= 1, mb >>= 1)
      if(c & b) mirror |= mb;
    comm->p2pChannels[c] = mirror;
  }
  return scclSuccess;
}

scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks) {
  int ngpus = system->nodes[GPU].count;
  SCCLCHECK(scclCalloc(ranks, ngpus));
  int nvbGpus = 0;
  for(int g = 0; g < ngpus; g++) {
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
    if(gpu->gpu.rank != rank) continue;
    for(int p = 0; p < ngpus; p++) {
      if(gpu->paths[GPU][p].type == PATH_NVB) {
        (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
      }
    }
  }
  *nranks = nvbGpus;
  return scclSuccess;
}

int scclTopoPathAllNVLink(struct scclTopoSystem* system) {
  int minPath = PATH_DIS;
  for(int i = 0; i < system->nodes[GPU].count; i++) {
    struct scclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
    for(int j = 0; j < system->nodes[GPU].count; j++) {
      if(i == j) continue;
      minPath = std::min(minPath, paths[j].type);
    }
  }
  return minPath >= PATH_PIX ? 0 : 1;
}

} // namespace graph

scclResult_t scclTopoPrintPaths(struct scclTopoSystem* system) {
  for(int i = 0; i < system->nodes[GPU].count; i++) {
    graph::printNodePaths(system, system->nodes[GPU].nodes + i);
  }
  for(int i = 0; i < system->nodes[NET].count; i++) {
    graph::printNodePaths(system, system->nodes[NET].nodes + i);
  }
  return scclSuccess;
}

int scclTopoUserP2pLevel = -1;

scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank) {
  *p2p = 0;
  if(read) *read = 0;
  if(intermediateRank) *intermediateRank = -1;

  // Get GPUs from topology
  int g1, g2;
  SCCLCHECK(scclTopoIdToIndex(system, GPU, id1, &g1));
  struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
  if(scclTopoIdToIndex(system, GPU, id2, &g2) == scclInternalError) {
    // GPU not found, we can't use p2p.
    return scclSuccess;
  }

  int intermediateIndex = -1;
  // Set intermediate GPU rank, if routing through an intermediate GPU.
  struct scclTopoLinkList* path = gpu1->paths[GPU] + g2;
  if(path->count == 2) {
    struct scclTopoNode* intermediateNode = path->list[0]->remNode;
    if(intermediateNode->type == GPU) {
      intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
      if(intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
    }
  }

  // In general, use P2P whenever we can.
  int p2pLevel = PATH_SYS;

  // User override
  if(scclTopoUserP2pLevel == -1)
    SCCLCHECK(graph::scclGetLevel(&scclTopoUserP2pLevel, "SCCL_P2P_DISABLE", "SCCL_P2P_LEVEL"));
  if(scclTopoUserP2pLevel != -2) {
    p2pLevel = scclTopoUserP2pLevel;
    goto compare;
  }

  // Don't use P2P through ARM CPUs
  int arch, vendor, model;
  SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
  if(arch == SCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
  if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
    p2pLevel = PATH_PXB;
  }
  if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    p2pLevel = PATH_PXB;
  }

compare:
  // Compute the PCI distance and compare with the p2pLevel.
  if(path->type <= p2pLevel) *p2p = 1;

  if(path->type == PATH_NVL) {
    struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
    // Enable P2P Read for Ampere/NVLink only
    if(read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
  }

  return scclSuccess;
}
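// scclTopoComputePaths rebuilds all precomputed paths: it clears previous results, runs
// scclTopoSetPaths from every CPU, GPU, NIC and NVSwitch node, then applies the overrides
// below (reroute through the CPU when P2P is unusable, PXN relays, and reroute through the
// CPU when GPU Direct RDMA is unavailable).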
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm) {
  // Precompute paths between GPUs/NICs.

  // Remove everything in case we're re-computing
  for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) graph::scclTopoRemovePathType(system, t);

  // Set direct paths to CPUs. We need them in many cases.
  for(int c = 0; c < system->nodes[CPU].count; c++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[CPU].nodes + c, system));
  }

  // Set direct paths to GPUs.
  for(int g = 0; g < system->nodes[GPU].count; g++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[GPU].nodes + g, system));
  }

  // Set direct paths to NICs.
  for(int n = 0; n < system->nodes[NET].count; n++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NET].nodes + n, system));
  }

  // Set direct paths to NVSwitches.
  for(int n = 0; n < system->nodes[NVS].count; n++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NVS].nodes + n, system));
  }

  // Update path for GPUs when we don't want to / can't use GPU Direct P2P
  for(int g = 0; g < system->nodes[GPU].count; g++) {
    for(int p = 0; p < system->nodes[GPU].count; p++) {
      int p2p;
      SCCLCHECK(scclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
      if(p2p == 0) {
        // Divert all traffic through the CPU
        int cpu;
        SCCLCHECK(graph::getLocalCpu(system, g, &cpu));
        SCCLCHECK(graph::addInterStep(system, CPU, cpu, GPU, p, GPU, g));
      }
    }

    if(comm == NULL) continue;

    // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
    struct scclPeerInfo* dstInfo = comm->peerInfo + system->nodes[GPU].nodes[g].gpu.rank;
    for(int p = 0; p < system->nodes[GPU].count; p++) {
      if(p == g) continue;
      struct scclPeerInfo* srcInfo = comm->peerInfo + system->nodes[GPU].nodes[p].gpu.rank;
      int p2p;
      SCCLCHECK(scclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
      if(p2p == 0) {
        int shm;
        SCCLCHECK(scclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
        if(shm == 0) {
          // Mark this peer as inaccessible. We'll trim it later.
          system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
        }
      }
    }
  }

  // Special handling of gfx94x
#if !defined(TOPO_EXPL)
  char strValue[1024];
  SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
  if(strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL &&
       IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
       ((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
        (system->nodes[GPU].count != system->nRanks))) {
      if(!graph::rcclPathOverride(system, 0x100000)) graph::rcclPathOverride(system, 0x1000);
    }
#if !defined(TOPO_EXPL)
  }
#endif

  // Update paths for NICs (no GPU Direct, PXN, ...)
  for(int n = 0; n < system->nodes[NET].count; n++) {
    struct scclTopoNode* netNode = system->nodes[NET].nodes + n;

    for(int g = 0; g < system->nodes[GPU].count; g++) {
      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
      struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
      if(graph::scclPxnDisable(comm) != 1) {
        int localGpuIndex;
        SCCLCHECK(scclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
        if(localGpuIndex != g && localGpuIndex != -1) {
          // PXN = PCI + NVLink.
          struct scclTopoNode* peerNode = system->nodes[GPU].nodes + localGpuIndex;
          // Only use PXN for NIC n if remote GPU p ...
          if(peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
             peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
             (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
              gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
            // We can use that GPU as relay to communicate with that NIC.
            // Only enabling it in the GPU->NIC direction for now to favor
            // receiving locally and sending remotely (consistent with net.cc)
            SCCLCHECK(graph::addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
        }
      }

      // Update path when we don't want to / can't use GPU Direct RDMA.
      int gdr;
      SCCLCHECK(graph::scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
      if(gdr == 0) {
        // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
        int localCpu;
        SCCLCHECK(graph::getLocalCpu(system, g, &localCpu));
        SCCLCHECK(graph::addInterStep(system, CPU, localCpu, NET, n, GPU, g));
        SCCLCHECK(graph::addInterStep(system, CPU, localCpu, GPU, g, NET, n));
      }
    }
  }
  return scclSuccess;
}

} // namespace topology
} // namespace hardware
} // namespace sccl