/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "graph.h" #include "trees.h" #include "rings.h" #include "topo.h" /******************************************************************/ /********************* Internode connection ***********************/ /******************************************************************/ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->topo->nodes[GPU].count; int nChannels = comm->nChannels; for (int c=0; cchannels+c; channel->ring.prev = channel->ring.next = -1; channel->tree.up = -1; channel->collnetChain.up = -1; for (int i=0; itree.down[i] = -1; for (int i=0; icollnetChain.down[i] = -1; channel->collnetDirect.out = -1; channel->collnetDirect.headRank = -1; channel->collnetDirect.nHeads = 0; channel->collnetDirect.shift = 0; for (int i=0; icollnetDirect.up[i] = -1; for (int i=0; icollnetDirect.down[i] = -1; int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks; int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks; int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks; int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks; for (int i=0; iringRecv[c] = ringIntra[0]; topoRanks->ringSend[c] = ringIntra[localRanks-1]; channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1]; channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1]; } if (treeIntra[i] == rank) { int parentIndex = 0; int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0; topoRanks->treeToParent[c] = treeIntra[parentIndex]; topoRanks->treeToChild0[c] = treeIntra[child0Index]; topoRanks->treeToChild1[c] = treeIntra[child1Index]; channel->tree.up = i == 0 ? -1 : treeIntra[i-1]; channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; } if (collNetIntra[i] == rank) { channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1]; channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1]; } } topoRanks->ringPrev[c] = channel->ring.prev; topoRanks->ringNext[c] = channel->ring.next; topoRanks->nvlsHeads[c] = nvlsIntra[0]; } // Duplicate channels rings/trees struct ncclChannel* channel0 = comm->channels; struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels; if (channel1) memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); return ncclSuccess; } bool isRankHere(const char* s, int start, int end, int rank) { if (end <= start || start < 0 || end < 0) return false; int num = 0; while (start < end) { char currChar = s[start]; if (isdigit(currChar)) { num = num * 10 + (currChar - '0'); if (isdigit(s[start+1])) { start++; continue; } } else if (currChar == '(' || currChar == ')') { start++; num = 0; continue; } if (num == rank) return true; start++; } return false; } ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph) { int x=0, y=0; for (int i=0; treeGraph->treeBase[i][0]!=0; i++) { x=i+1; } if( treeGraph->treeBase[0][0] == 0) return ncclSuccess; int nChannels = comm->nChannels; int localRanks = comm->topo->nodes[GPU].count; //new tree for (int c=0; ctreeBase[buff][ko] != 0) { tempString[ko] = treeGraph->treeBase[buff][ko]; ko++; } tempString[ko]=0; int start = 0; int curRank = comm->rank; struct ncclChannel* channel = comm->channels+c; int end = 0; while (tempString[end] != 0) end++; int parent = -1; // constructing a number from the continuous digits while (start < end) { int num = 0, num_found = 0; start++; while (start < end && tempString[start] != '(' && tempString[start] != ')') { int num_here = (int)(tempString[start] - '0'); num = num * 10 + num_here; start = start + 1; if (tempString[start] == '(' || tempString[start] == ')' || start == end) num_found = 1; } if (num_found != 0 && num == curRank) { channel->tree.up = parent; int depth = 0; for (int childId = 0; childId < NCCL_MAX_TREE_ARITY; childId++) { int or_start = start; int child = -1; channel->tree.down[childId] = -1; if (or_start >= end -1) continue; num=0; or_start++; while (tempString[or_start] != 0 && tempString[or_start] != '(' && tempString[or_start] != ')') { int num_here = (int)(tempString[or_start] - '0'); num = num * 10 + num_here; or_start++; } child = num; // find next child start while (start < end) { if (tempString[start] == '(' ) depth++; else if(tempString[start] == ')') depth--; if (depth == 0) break; // next child start++; } start++; channel->tree.down[childId] = child; // get kids, update numbers, get out of this string } break; } else { //go to the next one parent = num; int start_c = start; int end_c = start_c; while (end_c < end) { int depth = 0; while (end_c < end) { if (tempString[end_c] == '(' ) depth++; else if(tempString[end_c] == ')') depth--; if (depth == 0) break; // next child end_c++; } if (isRankHere(tempString, start_c, end_c, curRank)) { start = start_c; end = end_c; break; } else { end_c++; start_c = end_c; } } } } } return ncclSuccess; } static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) { int nChannels = comm->nChannels; int nNodes = comm->nNodes; for (int c=0; cnNodes; int* send = ringSend+c*comm->nNodes; int* prev = ringPrev+c*comm->nRanks; int* next = ringNext+c*comm->nRanks; struct ncclChannel* channel0 = comm->channels+c; struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels; for (int n=0; nrank == recvRank) { channel0->ring.prev = prevSendRank; if (channel1) channel1->ring.prev = prevSendRank; } int sendRank = send[n]; int nextRecvRank = recv[(n+1)%nNodes]; next[sendRank] = nextRecvRank; if (comm->rank == sendRank) { channel0->ring.next = nextRecvRank; if (channel1) channel1->ring.next = nextRecvRank; } } TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next); if (channel1) TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next); } return ncclSuccess; } static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) { for (int n=0; nup = indexes[u]; return ncclSuccess; } static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { if (d == -1) return ncclSuccess; int x = 0; while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++; if (x == NCCL_MAX_TREE_ARITY) { WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]); return ncclInternalError; } tree->down[x] = indexes[d]; return ncclSuccess; } static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { const int nChannels = (comm->nChannels > MAXCHANNELS/2) ? comm->nChannels/2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node; // Compute tree depth. Not an exact value but a good approximation in most // cases int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType; int* ttp, *ttc0, *ttc1; NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType)); if (comm->nChannels <= MAXCHANNELS/2) { for (int c=0; cchannels+c; struct ncclChannel* channel1 = channel0+nChannels; ttp = treeToParent+c*comm->nNodes; ttc0 = treeToChild0+c*comm->nNodes; ttc1 = treeToChild1+c*comm->nNodes; if (comm->rank == ttp[node]) { NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u)); NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u)); } if (comm->rank == ttc0[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0)); NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0)); } if (comm->rank == ttc1[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1)); NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1)); } if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) { INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); } channel0->tree.depth = channel1->tree.depth = depth; } } else { for (int c=0; cchannels+c; ttp = treeToParent+c*comm->nNodes; ttc0 = treeToChild0+c*comm->nNodes; ttc1 = treeToChild1+c*comm->nNodes; if (comm->rank == ttp[node]) { NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u)); } if (comm->rank == ttc0[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0)); } if (comm->rank == ttc1[node]) { NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1)); } if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) { INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); } channel0->tree.depth = depth; } for (int c=nChannels; cchannels+c; ttp = treeToParent+c*comm->nNodes; ttc0 = treeToChild0+c*comm->nNodes; ttc1 = treeToChild1+c*comm->nNodes; if (comm->rank == ttp[node]) { NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u)); } if (comm->rank == ttc0[node]) { NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0)); } if (comm->rank == ttc1[node]) { NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1)); } if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) { INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); } channel1->tree.depth = depth; } } return ncclSuccess; } static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) { int rank = comm->rank; int localRanks = comm->localRanks; int nHeads = 0; int *heads; NCCLCHECK(ncclCalloc(&heads, localRanks)); // Find all head ranks // Head index is always 0 for (int c=0; cnChannels; c++) { int* collNetIntra = collNetGraph->intra+c*localRanks; int head = collNetIntra[0]; for (int h=0; hnChannels; c++) { struct ncclChannel* channel = comm->channels+c; char line[1024]; sprintf(line, "CollNet channel %d rank %d ", c, rank); int nDown = 0; for (int i=0; icollnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks int* collNetIntra = collNetGraph->intra+i*localRanks; sprintf(line+strlen(line), "down "); for (int r=0; rcollnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers sprintf(line+strlen(line), " %d ", collNetIntra[r]); } sprintf(line+strlen(line), "nDown %d ", nDown); break; } } // Connect to all heads int nUp = 0; sprintf(line+strlen(line), "up "); for (int h=0; hcollnetDirect.up[nUp++] = heads[h]; sprintf(line+strlen(line), " %d ", heads[h]); } channel->collnetDirect.nHeads = nHeads; channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2; sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift); INFO(NCCL_GRAPH, "%s", line); channel->collnetChain.depth = comm->nRanks/comm->nNodes; } for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } free(heads); return ncclSuccess; } static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) { int nHeads = nvlsGraph->nChannels; int headRank = -1; for (int h=0; hintra[h*comm->localRanks] == comm->rank) headRank = h; } if (nHeads == 0) { comm->nvlsChannels = 0; return ncclSuccess; } for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.nHeads = nHeads; for (int h=0; hnvls.up[h] = comm->nRanks+1+h; for (int h=nHeads; hnvls.up[h] = -1; channel->nvls.down = comm->nRanks+1+headRank; channel->nvls.out = -1; // NVLS+SHARP not yet implemented. channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; channel->nvls.node = comm->node; channel->nvls.nNodes = comm->nNodes; } if (comm->nNodes == 1) return ncclSuccess; // Connect Trees int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1; int pc0, pc1; // ignored NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0, &tree1Parent, &tree1Child0, &tree1Child1, &pc1)); int* heads = NULL; int treeUp[2] = { -1, -1 }; int treeDown0[2] = { -1, -1 }; int treeDown1[2] = { -1, -1 }; if (comm->node == 0) { for (int h=0; hnNodes; for (int n=0; nnNodes && n<20; n++) { sprintf(line+strlen(line), " %2d", heads[n]); } INFO(NCCL_INIT, "%s", line); } } // Find the heads where I'm the head rank and retain tree up/down for (int h=0; hnNodes; if (heads[comm->node] == comm->rank) { treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent]; treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0]; treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1]; treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent]; treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0]; treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1]; break; } } // Set prev/next in all channels (NVLS compute channels work // orthogonally to NVLS search channels). for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.treeUp = treeUp[c%2]; channel->nvls.treeDown[0] = channel->nvls.down; int ix = 1; if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2]; if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2]; } struct ncclNvls* nvls0 = &comm->channels[0].nvls; struct ncclNvls* nvls1 = &comm->channels[1].nvls; INFO(NCCL_GRAPH, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d", nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp, nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp); return ncclSuccess; } // Legacy naming NCCL_PARAM(MinNrings, "MIN_NRINGS", -2); NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2); // New naming NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", 4); NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2); int ncclMinNchannels() { int minNchannels = 2; if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); if (ncclParamMinNchannels() != -2 && !getVaspOptimizeFlag()) minNchannels = ncclParamMinNchannels(); if (minNchannels > MAXCHANNELS) { WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); minNchannels = MAXCHANNELS; } if (minNchannels < 0) minNchannels = 0; return minNchannels; } int ncclMaxNchannels() { int maxNchannels = MAXCHANNELS; if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings(); if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); maxNchannels = 1; } return maxNchannels; } static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) { int nranks = comm->nRanks; int c; for (c=start; cchannels+c, comm->channels+c-start, sizeof(struct ncclChannel)); } return c; } static int copyMixedChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) { int nranks = comm->nRanks; int c; for (c=start; cchannels+c, comm->channels+c-start, sizeof(struct ncclChannel)); comm->channels[c].transportType = comm->mixedTransportType; } return c; } RCCL_PARAM(MaxMixedHylinkNChannels, "MAX_MIXED_HYLINK_NCHANNELS", 0); RCCL_PARAM(MixedTransportType, "MIXED_TRANSPORT_TYPE", TRANSPORT_SHM); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc) { // Gather data from all ranks int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads; int nranks = comm->nRanks; int nNodes = comm->nNodes; int nChannels = comm->nChannels; NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS)); for (int c=0; cringRecv[c]; ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c]; treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c]; treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c]; treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c]; nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c]; } for (int r=0; rringPrev[c]; ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c]; } } // Connect rings and trees. This should also duplicate the channels. NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext)); NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns)); NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS])); // Duplicate ringPrev/ringNext for ncclBuildRing if (nChannels <= MAXCHANNELS/2) memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); if (nChannels <= MAXCHANNELS/2) memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); // Get number of channels after duplication nc = std::min((int)ncclMaxNchannels()/comm->nChannels, nc); nc *= comm->nChannels; // Duplication should be complete now nChannels = comm->nChannels = std::min(MAXCHANNELS, (nChannels <= MAXCHANNELS/2) ? nChannels*2 : nChannels); // Setup CollNet if (comm->collNetSupport == 1) { struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) { int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } NCCLCHECK(connectCollNet(comm, collNetGraph)); } // Use 4 compute channels per search channel to reach peak BW on <8 PPN if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= MAXCHANNELS) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } // Add Hylink + PCIE double channel path if(graphs[NCCL_ALGO_RING]->typeIntra == PATH_NVL) { comm->nMixedHylinkChannels = std::min(MAXCHANNELS - comm->nChannels, (int)rcclParamMaxMixedHylinkNChannels()); if(comm->nMixedHylinkChannels > 0){ INFO(NCCL_GRAPH,"<%s:%d> -----> comm->nMixedHylinkShmChannels: %d, comm->nChannels: %d\n", __func__, __LINE__, comm->nMixedHylinkChannels, comm->nChannels); comm->mixedTransportType = std::max((int)rcclParamMixedTransportType(), TRANSPORT_SHM); nChannels = comm->nChannels = copyMixedChannels(comm, nChannels, nChannels + comm->nMixedHylinkChannels, ringPrev, ringNext); } } // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // We permit combining max, then min, to only use the first channels, then duplicate them. if (comm->sharedRes->owner != comm) { /* child comm #channels cannot exceed top parent #channels. */ nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext); } else { nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs); nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), std::max(nc, comm->config.minCTAs)), ringPrev, ringNext); } // Create rings array and check all is fine NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); free(ringRecv); free(ringSend); free(ringPrev); free(ringNext); free(treeToParent); free(treeToChild0); free(treeToChild1); free(nvlsHeads); return ncclSuccess; }