Commit a4ac3320 authored by lishen

Implement ipcsocket via a thread pool to support intra-node communication

parent d9d23f34
#include "comm.h"
#include "graph.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
channel->ring.prev = channel->ring.next = -1;
channel->tree.up = -1;
channel->collnetChain.up = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->tree.down[i] = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->collnetChain.down[i] = -1;
channel->collnetDirect.out = -1;
channel->collnetDirect.headRank = -1;
channel->collnetDirect.nHeads = 0;
channel->collnetDirect.shift = 0;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.up[i] = -1;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.down[i] = -1;
int* ringIntra = graphs[SCCL_ALGO_RING]->intra + c * localRanks;
int* treeIntra = graphs[SCCL_ALGO_TREE]->intra + c * localRanks;
int* collNetIntra = graphs[SCCL_ALGO_COLLNET_CHAIN]->intra + c * localRanks;
int* nvlsIntra = graphs[SCCL_ALGO_NVLS]->intra + c * localRanks;
for(int i = 0; i < localRanks; i++) {
if(ringIntra[i] == rank) {
topoRanks->ringRecv[c] = ringIntra[0];
topoRanks->ringSend[c] = ringIntra[localRanks - 1];
channel->ring.prev = (i == 0) ? -1 : ringIntra[i - 1];
channel->ring.next = (i == localRanks - 1) ? -1 : ringIntra[i + 1];
}
if(treeIntra[i] == rank) {
int parentIndex = 0;
int child0Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
topoRanks->treeToChild1[c] = treeIntra[child1Index];
channel->tree.up = i == 0 ? -1 : treeIntra[i - 1];
channel->tree.down[0] = i == localRanks - 1 ? -1 : treeIntra[i + 1];
}
if(collNetIntra[i] == rank) {
channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i - 1];
channel->collnetChain.down[0] = i == localRanks - 1 ? -1 : collNetIntra[i + 1];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
topoRanks->nvlsHeads[c] = nvlsIntra[0];
}
// Duplicate channels rings/trees
struct scclChannel* channel0 = comm->channels;
struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
if(channel1)
memcpy(channel1, channel0, nChannels * sizeof(struct scclChannel));
return scclSuccess;
}
// Return true if `rank` appears as a complete number anywhere in s[start, end).
bool isRankHere(const char* s, int start, int end, int rank) {
if(end <= start || start < 0 || end < 0)
return false;
int num = 0;
while(start < end) {
char currChar = s[start];
if(isdigit(currChar)) {
num = num * 10 + (currChar - '0');
if(isdigit(s[start + 1])) {
start++;
continue;
}
} else if(currChar == '(' || currChar == ')') {
start++;
num = 0;
continue;
}
if(num == rank)
return true;
start++;
}
return false;
}
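// Usage sketch for isRankHere (illustration only; it assumes the treeBase
// string encoding "parent(subtree)(subtree)...", e.g. "0(1(3))(2)" for rank 0
// with children 1 and 2, and rank 3 below rank 1 -- the exact format is
// whatever the search code writes into treeGraph->treeBase):
//   const char* s = "0(1(3))(2)";
//   isRankHere(s, 0, (int)strlen(s), 3); // -> true, rank 3 appears in the span
//   isRankHere(s, 0, (int)strlen(s), 7); // -> false
// Build this rank's per-channel tree (up/down links) by parsing the base-tree
// strings stored in treeGraph->treeBase.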
scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph) {
int x = 0;
// Count how many base-tree strings are defined in treeGraph->treeBase
for(int i = 0; treeGraph->treeBase[i][0] != 0; i++) {
x = i + 1;
}
if(treeGraph->treeBase[0][0] == 0)
return scclSuccess;
int nChannels = comm->nChannels;
int localRanks = comm->topo->nodes[GPU].count;
// Build each channel's tree from one of the x base trees (round-robin)
for(int c = 0; c < nChannels; c++) {
int buff = c % x;
char tempString[SCCL_TOPO_MAX_NODES * 4];
int ko = 0;
while(treeGraph->treeBase[buff][ko] != 0) {
tempString[ko] = treeGraph->treeBase[buff][ko];
ko++;
}
tempString[ko] = 0;
int start = 0;
int curRank = comm->rank;
struct scclChannel* channel = comm->channels + c;
int end = 0;
while(tempString[end] != 0)
end++;
int parent = -1;
// construct the rank number from consecutive digits
while(start < end) {
int num = 0, num_found = 0;
start++;
while(start < end && tempString[start] != '(' && tempString[start] != ')') {
int num_here = (int)(tempString[start] - '0');
num = num * 10 + num_here;
start = start + 1;
if(tempString[start] == '(' || tempString[start] == ')' || start == end)
num_found = 1;
}
if(num_found != 0 && num == curRank) {
channel->tree.up = parent;
int depth = 0;
for(int childId = 0; childId < SCCL_MAX_TREE_ARITY; childId++) {
int or_start = start;
int child = -1;
channel->tree.down[childId] = -1;
if(or_start >= end - 1)
continue;
num = 0;
or_start++;
while(tempString[or_start] != 0 && tempString[or_start] != '(' && tempString[or_start] != ')') {
int num_here = (int)(tempString[or_start] - '0');
num = num * 10 + num_here;
or_start++;
}
child = num;
// find next child start
while(start < end) {
if(tempString[start] == '(')
depth++;
else if(tempString[start] == ')')
depth--;
if(depth == 0)
break; // next child
start++;
}
start++;
channel->tree.down[childId] = child;
// child recorded; advance to the next child subtree
}
break;
} else { // curRank not found yet: descend into the subtree that contains it
parent = num;
int start_c = start;
int end_c = start_c;
while(end_c < end) {
int depth = 0;
while(end_c < end) {
if(tempString[end_c] == '(')
depth++;
else if(tempString[end_c] == ')')
depth--;
if(depth == 0)
break; // next child
end_c++;
}
if(isRankHere(tempString, start_c, end_c, curRank)) {
start = start_c;
end = end_c;
break;
} else {
end_c++;
start_c = end_c;
}
}
}
}
}
return scclSuccess;
}
static scclResult_t connectRings(struct scclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
int nChannels = comm->nChannels;
int nNodes = comm->nNodes;
for(int c = 0; c < nChannels; c++) {
int* recv = ringRecv + c * comm->nNodes;
int* send = ringSend + c * comm->nNodes;
int* prev = ringPrev + c * comm->nRanks;
int* next = ringNext + c * comm->nRanks;
struct scclChannel* channel0 = comm->channels + c;
struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
for(int n = 0; n < nNodes; n++) {
int recvRank = recv[n];
int prevSendRank = send[(n - 1 + nNodes) % nNodes];
prev[recvRank] = prevSendRank;
if(comm->rank == recvRank) {
channel0->ring.prev = prevSendRank;
if(channel1)
channel1->ring.prev = prevSendRank;
}
int sendRank = send[n];
int nextRecvRank = recv[(n + 1) % nNodes];
next[sendRank] = nextRecvRank;
if(comm->rank == sendRank) {
channel0->ring.next = nextRecvRank;
if(channel1)
channel1->ring.next = nextRecvRank;
}
}
}
return scclSuccess;
}
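// Worked example for connectRings (illustrative): two nodes whose intra-node
// ring segments are [0,1] (node 0) and [2,3] (node 1) give recv = {0,2} and
// send = {1,3} for a channel. Stitching the segments yields prev[0] = 3,
// next[1] = 2, prev[2] = 1 and next[3] = 0, i.e. the global ring
// 0 -> 1 -> 2 -> 3 -> 0.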
static scclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
for(int n = 0; n < nNodes; n++)
indexes[n] = ranks[n];
return scclSuccess;
}
static scclResult_t setTreeUp(struct scclTree* tree, int* indexes, int u) {
if(u == -1)
return scclSuccess;
tree->up = indexes[u];
return scclSuccess;
}
static scclResult_t setTreeDown(struct scclTree* tree, int* indexes, int d) {
if(d == -1)
return scclSuccess;
int x = 0;
while(x < SCCL_MAX_TREE_ARITY && tree->down[x] >= 0)
x++;
if(x == SCCL_MAX_TREE_ARITY) {
WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
return scclInternalError;
}
tree->down[x] = indexes[d];
return scclSuccess;
}
static scclResult_t connectTrees(struct scclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int nChannels = (comm->nChannels > MAXCHANNELS / 2) ? comm->nChannels / 2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node;
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks / nNodes - 1 + log2i(nNodes);
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
int *ttp, *ttc0, *ttc1;
SCCLCHECK(scclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
if(comm->nChannels <= MAXCHANNELS / 2) {
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
struct scclChannel* channel1 = channel0 + nChannels;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
} else {
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
}
channel0->tree.depth = depth;
}
for(int c = nChannels; c < nChannels * 2; c++) {
struct scclChannel* channel1 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel1->tree.depth = depth;
}
}
return scclSuccess;
}
static scclResult_t connectCollNet(struct scclComm* comm, struct scclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nHeads = 0;
int* heads;
SCCLCHECK(scclCalloc(&heads, localRanks));
// Find all head ranks
// Head index is always 0
for(int c = 0; c < collNetGraph->nChannels; c++) {
int* collNetIntra = collNetGraph->intra + c * localRanks;
int head = collNetIntra[0];
for(int h = 0; h < nHeads; h++)
if(heads[h] == head)
head = -1;
if(head != -1)
heads[nHeads++] = collNetIntra[0];
}
// For all channels
for(int c = 0; c < comm->nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
char line[1024];
sprintf(line, "CollNet channel %d rank %d ", c, rank);
int nDown = 0;
for(int i = 0; i < nHeads; i++) {
if(rank == heads[i]) { // is head
channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
int* collNetIntra = collNetGraph->intra + i * localRanks;
sprintf(line + strlen(line), "down ");
for(int r = 0; r < localRanks; r++) {
if(collNetIntra[r] == rank)
continue;
channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
sprintf(line + strlen(line), " %d ", collNetIntra[r]);
}
sprintf(line + strlen(line), "nDown %d ", nDown);
break;
}
}
// Connect to all heads
int nUp = 0;
sprintf(line + strlen(line), "up ");
for(int h = 0; h < nHeads; h++) {
if(rank == heads[h])
continue;
channel->collnetDirect.up[nUp++] = heads[h];
sprintf(line + strlen(line), " %d ", heads[h]);
}
channel->collnetDirect.nHeads = nHeads;
channel->collnetDirect.shift = (rank % localRanks) % nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line + strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
sprintf(line + strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
INFO(SCCL_LOG_TOPO, "%s", line);
channel->collnetChain.depth = comm->nRanks / comm->nNodes;
}
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
if(channel->nvls.headRank != -1)
channel->nvls.out = comm->nRanks;
}
free(heads);
return scclSuccess;
}
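// Worked example for connectCollNet (illustrative): with localRanks = 4 and
// two collnet channels whose intra lists are [0,1,2,3] and [2,3,0,1], the
// deduplicated heads are {0,2} (nHeads = 2). Rank 0 becomes a head with
// headRank = 0, down = {1,2,3}, up = {2} and shift = (0 % 4) % 2 = 0, while
// non-head rank 1 keeps headRank = -1 and gets up = {0,2}, shift = 1.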
static scclResult_t connectNvls(struct scclComm* comm, int* nvlsHeads, struct scclTopoGraph* nvlsGraph) {
int nHeads = nvlsGraph->nChannels;
int headRank = -1;
for(int h = 0; h < nHeads; h++) {
if(nvlsGraph->intra[h * comm->localRanks] == comm->rank)
headRank = h;
}
if(nHeads == 0) {
comm->nvlsChannels = 0;
return scclSuccess;
}
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
channel->nvls.nHeads = nHeads;
for(int h = 0; h < nHeads; h++)
channel->nvls.up[h] = comm->nRanks + 1 + h;
for(int h = nHeads; h < SCCL_MAX_NVLS_ARITY; h++)
channel->nvls.up[h] = -1;
channel->nvls.down = comm->nRanks + 1 + headRank;
channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
channel->nvls.headRank = headRank;
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
channel->nvls.node = comm->node;
channel->nvls.nNodes = comm->nNodes;
}
if(comm->nNodes == 1)
return scclSuccess;
// Connect Trees
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
int pc0, pc1; // ignored
SCCLCHECK(scclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0, &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
int* heads = NULL;
int treeUp[2] = {-1, -1};
int treeDown0[2] = {-1, -1};
int treeDown1[2] = {-1, -1};
if(comm->node == 0) {
for(int h = 0; h < nHeads; h++) {
char line[1024];
sprintf(line, "NVLS Head %2d:", h);
heads = nvlsHeads + h * comm->nNodes;
for(int n = 0; n < comm->nNodes && n < 20; n++) {
sprintf(line + strlen(line), " %2d", heads[n]);
}
INFO(SCCL_INIT, "%s", line);
}
}
// Find the heads where I'm the head rank and retain tree up/down
for(int h = 0; h < nHeads; h++) {
heads = nvlsHeads + h * comm->nNodes;
if(heads[comm->node] == comm->rank) {
treeUp[0] = tree0Parent == -1 ? -1 : heads[tree0Parent];
treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
break;
}
}
// Set prev/next in all channels (NVLS compute channels work
// orthogonally to NVLS search channels).
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
channel->nvls.treeUp = treeUp[c % 2];
channel->nvls.treeDown[0] = channel->nvls.down;
int ix = 1;
if(treeDown0[c % 2] != -1)
channel->nvls.treeDown[ix++] = treeDown0[c % 2];
if(treeDown1[c % 2] != -1)
channel->nvls.treeDown[ix] = treeDown1[c % 2];
}
struct scclNvls* nvls0 = &comm->channels[0].nvls;
struct scclNvls* nvls1 = &comm->channels[1].nvls;
INFO(SCCL_LOG_TOPO,
"NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
nvls0->treeDown[0],
nvls0->treeDown[1],
comm->rank,
nvls0->treeUp,
nvls1->treeDown[0],
nvls1->treeDown[1],
comm->rank,
nvls1->treeUp);
return scclSuccess;
}
// Legacy naming
SCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
SCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
// New naming
SCCL_PARAM(MinNchannels, "MIN_NCHANNELS", 4);
SCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
int scclMinNchannels() {
int minNchannels = 2;
if(scclParamMinNrings() != -2)
minNchannels = scclParamMinNrings();
if(scclParamMinNchannels() != -2)
minNchannels = scclParamMinNchannels();
if(minNchannels > MAXCHANNELS) {
WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
minNchannels = MAXCHANNELS;
}
if(minNchannels < 0)
minNchannels = 0;
return minNchannels;
}
int scclMaxNchannels() {
int maxNchannels = MAXCHANNELS;
if(scclParamMaxNrings() != -2)
maxNchannels = scclParamMaxNrings();
if(scclParamMaxNchannels() != -2)
maxNchannels = scclParamMaxNchannels();
if(maxNchannels > MAXCHANNELS)
maxNchannels = MAXCHANNELS;
if(maxNchannels < 1) {
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
maxNchannels = 1;
}
return maxNchannels;
}
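// Precedence sketch (illustrative, assuming SCCL_PARAM reads the identically
// named SCCL_* environment variables): the legacy *_NRINGS value is applied
// first and the new *_NCHANNELS value overrides it when set. Note that
// MinNchannels defaults to 4 rather than the -2 "unset" sentinel, so the
// new-style minimum always takes effect in scclMinNchannels().
//   SCCL_MIN_NRINGS=2 SCCL_MIN_NCHANNELS=6 ./app -> scclMinNchannels() == 6
//   SCCL_MAX_NRINGS=64 ./app                     -> scclMaxNchannels() == MAXCHANNELS (32)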
static int copyChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
int nranks = comm->nRanks;
int c;
for(c = start; c < end; c++) {
memcpy(ringPrev + c * nranks, ringPrev + (c - start) * nranks, nranks * sizeof(int));
memcpy(ringNext + c * nranks, ringNext + (c - start) * nranks, nranks * sizeof(int));
memcpy(comm->channels + c, comm->channels + c - start, sizeof(struct scclChannel));
}
return c;
}
static int copyMixedChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
int nranks = comm->nRanks;
int c;
for(c = start; c < end; c++) {
memcpy(ringPrev + c * nranks, ringPrev + (c - start) * nranks, nranks * sizeof(int));
memcpy(ringNext + c * nranks, ringNext + (c - start) * nranks, nranks * sizeof(int));
memcpy(comm->channels + c, comm->channels + c - start, sizeof(struct scclChannel));
comm->channels[c].transportType = comm->mixedTransportType;
}
return c;
}
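// Usage sketch (illustrative): copyChannels(comm, 4, 8, ringPrev, ringNext)
// clones channels 0..3 and their ring prev/next tables into channels 4..7 and
// returns the new channel count, 8. copyMixedChannels does the same but also
// stamps each copy with comm->mixedTransportType.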
RCCL_PARAM(MaxMixedHylinkNChannels, "MAX_MIXED_HYLINK_NCHANNELS", 0);
RCCL_PARAM(MixedTransportType, "MIXED_TRANSPORT_TYPE", TRANSPORT_SHM);
scclResult_t scclTopoPostset(
struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
int nranks = comm->nRanks;
int nNodes = comm->nNodes;
int nChannels = comm->nChannels;
int MinNChannels = scclMinNchannels();
int MaxNChannels = scclMaxNchannels();
SCCLCHECK(scclCalloc(&ringRecv, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&ringSend, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&ringPrev, nranks * MAXCHANNELS));
SCCLCHECK(scclCalloc(&ringNext, nranks * MAXCHANNELS));
SCCLCHECK(scclCalloc(&treeToParent, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&treeToChild0, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&treeToChild1, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&nvlsHeads, nNodes * MAXCHANNELS));
for(int c = 0; c < nChannels; c++) {
for(int n = 0; n < nNodes; n++) {
int r = firstRanks[n];
ringRecv[c * nNodes + n] = allTopoRanks[r]->ringRecv[c];
ringSend[c * nNodes + n] = allTopoRanks[r]->ringSend[c];
treeToParent[c * nNodes + n] = allTopoRanks[r]->treeToParent[c];
treeToChild0[c * nNodes + n] = allTopoRanks[r]->treeToChild0[c];
treeToChild1[c * nNodes + n] = allTopoRanks[r]->treeToChild1[c];
nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
}
for(int r = 0; r < nranks; r++) {
ringPrev[c * nranks + r] = allTopoRanks[r]->ringPrev[c];
ringNext[c * nranks + r] = allTopoRanks[r]->ringNext[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
SCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
SCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
SCCLCHECK(connectNvls(comm, nvlsHeads, graphs[SCCL_ALGO_NVLS]));
// Duplicate ringPrev/ringNext for scclBuildRing
if(nChannels <= MAXCHANNELS / 2)
memcpy(ringPrev + nChannels * nranks, ringPrev, nChannels * nranks * sizeof(int));
if(nChannels <= MAXCHANNELS / 2)
memcpy(ringNext + nChannels * nranks, ringNext, nChannels * nranks * sizeof(int));
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_NCHANNELS") == NULL)
MinNChannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_NCHANNELS") == NULL)
MaxNChannels = 32;
#ifdef HCU_SDMA_FEATURE
// Channel-count target for the SDMA path, based on the raw channel params
int ncSdma = std::min((int)scclMaxNchannels() / comm->nChannels, nc);
ncSdma *= comm->nChannels;
#endif
// Get number of channels after duplication
nc = std::min((int)MaxNChannels / comm->nChannels, nc);
nc *= comm->nChannels;
// Duplication should be complete now
nChannels = comm->nChannels = std::min(MAXCHANNELS, (nChannels <= MAXCHANNELS / 2) ? nChannels * 2 : nChannels);
// Setup CollNet
if(comm->collNetSupport == 1) {
struct scclTopoGraph* collNetGraph = graphs[SCCL_ALGO_COLLNET_DIRECT];
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
if(collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(MAXCHANNELS, nChannels + nChannels / 2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
SCCLCHECK(connectCollNet(comm, collNetGraph));
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
if(comm->minCompCap == 90 && comm->nNodes > 1 && graphs[SCCL_ALGO_RING]->bwIntra > 45.0 && 2 * nChannels <= MAXCHANNELS) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2 * nChannels, ringPrev, ringNext);
}
// Add Hylink + PCIE double channel path
if(graphs[SCCL_ALGO_RING]->typeIntra == PATH_NVL) {
comm->nMixedHylinkChannels = std::min(MAXCHANNELS - comm->nChannels, (int)rcclParamMaxMixedHylinkNChannels());
if(comm->nMixedHylinkChannels > 0) {
INFO(SCCL_LOG_TOPO,
"<%s:%d> -----> comm->nMixedHylinkShmChannels: %d, comm->nChannels: %d\n",
__func__,
__LINE__,
comm->nMixedHylinkChannels,
comm->nChannels);
comm->mixedTransportType = std::max((int)rcclParamMixedTransportType(), TRANSPORT_SHM);
nChannels = comm->nChannels = copyMixedChannels(comm, nChannels, nChannels + comm->nMixedHylinkChannels, ringPrev, ringNext);
}
}
// Honor SCCL_MIN_NRINGS/SCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
if(checkSdmaCopyEnable(comm)) {
uint32_t sdmaChannelNum;
uint32_t maxChannels;
sdmaChannelNum = getSdmaChannelNum(comm);
if(comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
maxChannels =
sdmaChannelNum ? sdmaChannelNum : std::min(std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs)), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
} else {
nChannels = comm->nChannels = std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs);
maxChannels = sdmaChannelNum ? sdmaChannelNum : std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs));
nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
}
INFO(SCCL_INIT, "-hcugon- scclTopoPostset rank %d sdmaChannelNum %d nChannels %d", comm->rank, sdmaChannelNum, comm->nChannels);
} else {
if(comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(
comm, nChannels, std::min(std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
} else {
nChannels = comm->nChannels = std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
}
}
// Create rings array and check all is fine
SCCLCHECK(scclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
free(ringRecv);
free(ringSend);
free(ringPrev);
free(ringNext);
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
free(nvlsHeads);
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_DEVICE_H_
#define SCCL_DEVICE_H_
#include "check.h"
#include "sccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined(ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HCU_SDMA_FEATURE
#include "hsa/hsa_ext_amd.h"
#include "hsa_extra.h"
// #define HCU_PRINT_DEBUG
#endif
namespace sccl {
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#if defined(ENABLE_NPKIT) && defined(HCU_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) NpKit::CollectGpuEvent(event, size, cost, tm, scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HCU_SDMA_FEATURE
#define INIT_PRIMS_SDMA(prims, args) \
{ \
prims.rank = scclShmem.comm.rank; \
prims.useSdmaConfig = args->useSdma; \
prims.useSdmaCopy = args->useSdma && prims.sdmaQueueCtx; \
prims.preFnOps = args->preFnOps; \
prims.sdmaMinCopySize = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->minCopySize : 0; \
prims.sdmaCountEnable = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->copyCountEnable : 0; \
prims.sdmaCopyCount = 0; \
prims.allCopyCount = 0; \
}
#endif
#define SCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
typedef enum {
scclFuncBroadcast,
scclFuncReduce,
scclFuncAllGather,
scclFuncReduceScatter,
scclFuncAllReduce,
scclFuncSendRecv,
scclFuncSend,
scclFuncRecv,
scclFuncAllToAllPivot,
scclNumFuncs
} scclFunc_t;
extern const char* scclFuncStr[SCCL_NUM_FUNCTIONS + 2];
#define SCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
enum scclAlgo {
SCCL_ALGO_TREE = 0, // tree algorithm
SCCL_ALGO_RING = 1, // ring algorithm
SCCL_ALGO_COLLNET_DIRECT = 2, // direct collective-network algorithm
SCCL_ALGO_COLLNET_CHAIN = 3, // chained collective-network algorithm
SCCL_ALGO_NVLS = 4, // NVLink (NVLS) algorithm
SCCL_ALGO_NVLS_TREE = 5, // NVLink tree algorithm
};
extern const char* scclAlgoStr[SCCL_NUM_ALGORITHMS];
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define SCCL_PROTO_LL 0
#define SCCL_PROTO_LL128 1
#define SCCL_PROTO_SIMPLE 2
extern const char* scclProtoStr[SCCL_NUM_PROTOCOLS];
#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8
union scclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_SIMPLE_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define SCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define SCCL_LL_FLAG_MAX 0x100
#define SCCL_LL_FLAG(a) ((uint32_t)((a) % SCCL_LL_FLAG_MAX))
#else
#define SCCL_LL_CLEAN_MASK 0x7ffffff8
#define SCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least SCCL_STEPS
static_assert(SCCL_LL_CLEAN_MASK % SCCL_STEPS == 0, "Invalid SCCL_LL_CLEAN_MASK value");
#define SCCL_LL128_LINESIZE 64
#define SCCL_LL128_LINEELEMS (SCCL_LL128_LINESIZE / sizeof(uint64_t))
#define SCCL_LL128_DATAELEMS (SCCL_LL128_LINEELEMS - 1)
#define SCCL_LL128_MAX_NTHREADS 256
#define SCCL_LL128_ELEMS_PER_THREAD 28
#define SCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define SCCL_LL128_SHMEM_SIZE (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * SCCL_LL128_MAX_NTHREADS)
#define SCCL_DIRECT_WRITE 0x01
#define SCCL_DIRECT_READ 0x02
#define SCCL_DIRECT_NIC 0x04
#define SCCL_IPC_WRITE 0x08
#define SCCL_IPC_READ 0x10
#define SCCL_NVLS_MIN_POLL 0x20
#ifdef HCU_SDMA_FEATURE
#define SDMA_CTX_VALID_MAGIC 0xD65A
#endif
struct scclConnInfo {
// Regular comm mechanism
char* buffs[SCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
uint64_t* tail; // Local for recv, remote for send
uint64_t* head; // Local for send, remote for recv
int flags; // Direct communication / other flags
int shared; // Buffers are shared
void** ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
int* sizesFifo; // Sizes fifo from GPU to proxy
int* offsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Current GPU's HDP register
#ifdef HCU_SDMA_FEATURE
struct sdmaQueueContext* sdmaQueueCtx;
uint32_t sdmaCtxValidMagic;
#endif
};
struct scclProxyConnector {
int tpRank;
int tpLocalRank;
int sameProcess;
struct scclProxyConnection* connection;
};
struct scclConnector {
int connected;
struct scclProxyConnector proxyConn;
struct scclTransportComm* transportComm;
void* transportResources;
struct scclConnInfo conn;
};
struct scclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal sccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int index; // This rank's index in the ring
};
// The root of each tree only has one node down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY 3
struct scclTree {
int depth;
int up;
int down[SCCL_MAX_TREE_ARITY];
};
#define SCCL_MAX_DIRECT_ARITY 7
struct scclDirect {
int depth;
int out;
int nHeads; // Number of parallel N<->1<->net operations; size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[SCCL_MAX_DIRECT_ARITY];
int down[SCCL_MAX_DIRECT_ARITY];
};
#define SCCL_CONN_IDX_P2P_NET 2
#define SCCL_MAX_NVLS_ARITY 8
#define SCCL_MAX_NVLS_TREE_ARITY 3
struct scclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations; size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int up[SCCL_MAX_NVLS_ARITY];
int down;
int treeUp;
int treeDown[SCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#define SCCL_MAX_CONNS 3
struct scclChannelPeer {
struct scclConnector send[SCCL_MAX_CONNS];
struct scclConnector recv[SCCL_MAX_CONNS];
int refCount;
};
struct scclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8) /* set alignment to 8 bytes boundary */
/* scclWork is to be a power of two, currently 4x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of scclWorkElem. */
#define SCCL_WORK_SIZE 256
enum scclWorkType : uint8_t {
scclWorkTypeUnused = 0,
scclWorkTypeColl = 1,
scclWorkTypeP2p = 2,
scclWorkTypeRegColl = 3
};
enum scclWorkP2PType : uint8_t {
scclWorkP2pTypeUnused = 0,
scclWorkP2pTypeSend,
scclWorkP2pTypeRecv
};
struct scclWorkHeader {
union {
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
};
uint16_t funcIndex;
uint8_t isLast : 1; // last work for this kernel
uint8_t inFifo : 1; // is this work in the fifo
enum scclWorkType type;
};
struct scclWorkElem {
union {
uint8_t flagBits;
struct {
uint8_t isUsed : 1, redOpArgIsPtr : 1, regUsed : 1, nWarps : 5;
};
};
uint8_t direct;
uint8_t bid;
uint8_t nChannels;
struct {
uint32_t root : 28;
uint32_t preFnOps : 1;
uint32_t useSdma : 1;
uint32_t connIndex : 2;
};
const void* sendbuff;
void* recvbuff;
size_t count;
union {
size_t lastChunkSize;
// Pivot A2A kernel computes chunk size itself.
// Instead, it needs the number of bidirectional rings.
size_t pivotA2ANumBiRings;
};
uint64_t redOpArg;
uint64_t opCount;
};
static_assert((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElem))) / sizeof(scclWorkElem) == 4,
"Sanity check: SCCL_MAX_WORK_ELEMENTS == 4");
#define SCCL_MAX_WORK_ELEMENTS 1
struct scclWorkElemP2p {
struct {
int32_t peer : 26;
uint32_t preFnOps : 1;
uint32_t useSdma : 1;
uint32_t connIndex : 2;
int32_t proto : 2;
};
union {
uint16_t flagBits;
struct {
enum scclWorkP2PType p2pType : 4;
uint16_t nWarps : 4;
uint16_t warpStart : 4;
uint16_t ngroups : 4;
};
};
uint16_t opCount;
// Important not to use any fields with greater than 4-byte alignment since
// we need sizeof(scclWorkElemP2p)==28, but that would be padded up to 32 if
// there were 8-byte fields.
// void* buff;
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
// size_t count;
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
int chunkSize;
};
static_assert(((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemP2p))) / sizeof(scclWorkElemP2p)) == 8,
"Sanity check: SCCL_MAX_WORK_ELEMENTS_P2P == 8");
#define SCCL_MAX_WORK_ELEMENTS_P2P 2
struct scclWorkElemReg {
struct scclWorkElem elem;
void* dnInputs[SCCL_MAX_DIRECT_ARITY + 1];
void* dnOutputs[SCCL_MAX_DIRECT_ARITY + 1];
void* upOutputs[SCCL_MAX_DIRECT_ARITY + 1];
};
#define SCCL_MAX_WORK_ELEMENTS_REG ((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemReg))) / sizeof(scclWorkElemReg))
static_assert(SCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: SCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define SCCL_MAX_GROUPS (SCCL_MAX_NTHREADS / WARP_SIZE)
struct scclWork {
struct scclWorkHeader header;
union {
char pad[SCCL_WORK_SIZE - sizeof(struct scclWorkHeader)];
struct scclWorkElem elems[SCCL_MAX_WORK_ELEMENTS];
struct scclWorkElemP2p p2pElems[SCCL_MAX_WORK_ELEMENTS_P2P];
struct scclWorkElemReg regElems[SCCL_MAX_WORK_ELEMENTS_REG];
};
};
static_assert(sizeof(struct scclWork) == SCCL_WORK_SIZE, "Sanity check: sizeof(struct scclWork) == SCCL_WORK_SIZE");
static_assert(sizeof(struct scclWork) % 16 == 0, "Sanity check: sizeof(struct scclWork)%16 == 0");
struct scclDevChannelPeer {
// Stripped version of scclChannelPeer where we only keep the scclConnInfo
// instead of the full scclConnector.
struct scclConnInfo send[SCCL_MAX_CONNS];
struct scclConnInfo recv[SCCL_MAX_CONNS];
};
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct scclProf {
uint32_t count;
uint32_t seq; // only entry from first launch is used
struct {
uint64_t line : 16;
uint64_t timeStamp : 48;
} elem[PROFILE_NUM_ITEMS];
};
static_assert(sizeof(struct scclProf) == 256, "scclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
typedef enum {
scclCollTraceNotReady = 0,
scclCollTraceKernelLaunchType = 1,
scclCollTraceKernelEndType = 2,
scclCollTraceCollLaunchType = 3,
scclCollTraceAbortType = 4,
scclCollTraceDataType = 5,
scclCollTraceCollElemType = (1 << 4),
scclCollTraceP2pElemType = (1 << 5),
} scclCollTraceDataType_t;
struct scclCollTrace {
uint8_t type;
uint8_t bid;
int16_t funcIndex;
uint32_t data_0;
uint64_t timeStamp;
union {
uint64_t opCount;
uint32_t p2pOpCount[2];
};
union {
uint64_t data_1;
struct {
uint8_t nWarps;
uint8_t bid;
uint8_t nChannels;
} coll;
struct {
int16_t peer;
uint8_t ngroups : 4;
uint8_t connIndex : 4;
uint8_t warpStart : 4;
uint8_t nWarps : 4;
} p2p[2];
};
};
static_assert(sizeof(struct scclCollTrace) == 8 * sizeof(int), "scclCollTrace must have a pow2 size");
union scclCollTraceTail {
uint32_t tail;
char padding[4096];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HCU_SDMA_FEATURE
struct sdmaQueueContext {
hsa_sdma_info_t* sdmaInfo;
uint64_t pkgIndex;
uint32_t queueId;
uint32_t sumSdmaCopyCount;
uint32_t sumAllCopyCount;
uint32_t queueLock;
uint32_t minCopySize;
uint32_t copyCountEnable;
uint32_t sdmaQueueDepth;
uint32_t sdmaPkgLen;
uint32_t sdmaQueueLen;
};
#endif
struct alignas(16) scclDevChannel {
struct scclDevChannelPeer** peers;
struct scclRing ring;
struct scclTree tree;
struct scclTree collnetChain;
struct scclDirect collnetDirect;
struct scclTree binTree;
struct scclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};
struct scclDevComm {
int rank;
int nRanks;
int buffSizes[SCCL_NUM_PROTOCOLS];
// Operation list for aggregation
int workFifoDepth;
struct scclWork* workFifoHeap; // may be cudaHost or GDR memory
// Flag to ask SCCL kernels to abort
volatile uint32_t* abortFlag;
// Channels, device side
struct scclDevChannel* channels /*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
struct scclCollTrace* collTrace;
union scclCollTraceTail* collTraceTail;
pthread_t collTraceThread;
#endif
#ifdef ENABLE_PROFILING
struct scclProf* devProf;
#endif
#if defined(ENABLE_TIMELINE)
TimelineGpuEventContext* gpuEventContext;
#endif
#if defined(ENABLE_NPKIT) || defined(ENABLE_TIMELINE)
uint64_t* cpuTimestamp;
#endif
};
struct alignas(16) scclDevCommAndChannels {
struct scclDevComm comm;
struct scclDevChannel channels[MAXCHANNELS];
};
#ifdef __CUDA_ARCH__
#define SCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define SCCL_CUDA_ARCH 0
#endif
template <typename T>
__host__ __device__ constexpr T min_constexpr(T a) {
return a;
}
template <typename T, typename... Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts... c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template <typename T>
__host__ __device__ constexpr T max_constexpr(T a) {
return a;
}
template <typename T, typename... Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts... c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int scclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack - 1) / bytePerPack);
}
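// Worked examples (compile-time checks; the values are illustrative only):
// a 64-byte in-flight target with 16-byte packs needs ceil(64/16) = 4 unrolled
// instructions, and the result is always capped by the insns limit.
static_assert(scclCalcUnroll(16, 16, 64) == 4, "unroll = bytes/bytePerPack");
static_assert(scclCalcUnroll(8, 4, 64) == 4, "unroll capped by insns");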
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to SCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int scclCollUnroll(int cudaArch = SCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int scclNvlsUnrollBytes(int cudaArch = SCCL_CUDA_ARCH) { return 4 * 16; }
__host__ __device__ constexpr int scclNvlsUnrollInsns(int cudaArch = SCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int scclNvlsUnroll(int bytePerPack, int cudaArch = SCCL_CUDA_ARCH) {
return scclCalcUnroll(bytePerPack, scclNvlsUnrollInsns(cudaArch), scclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int scclShmemScratchWarpSize(int cudaArch = SCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */ 0,
/*LL128 */ (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * WARP_SIZE) * sizeof(uint64_t),
/*SIMPLE*/ (scclCollUnroll(cudaArch) * WARP_SIZE + 1) * 16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */ WARP_SIZE * (cudaArch >= 900 ? scclNvlsUnrollBytes(cudaArch) : 0) + 16) +
15) &
-16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int scclShmemDynamicSize(int cudaArch = SCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : scclShmemScratchWarpSize(cudaArch) * (SCCL_MAX_NTHREADS / WARP_SIZE);
}
} // namespace sccl
#endif
#ifndef SCCL_GRAPH_H_
#define SCCL_GRAPH_H_
// #include "topo_utils.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
namespace sccl {
namespace hardware {
namespace topology {
#define MAX_XGMI_INTER_GPUS 4
struct scclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
int pattern;
int crossNic;
int collNet;
int minChannels;
int maxChannels;
// Output
int nChannels;
float bwIntra;
float bwInter;
float latencyInter;
int typeIntra;
int typeInter;
int sameChannels;
int nHops;
int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES];
int inter[MAXCHANNELS * 2];
int nIntraChannels;
int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
};
struct scclTopoRanks {
int ringRecv[MAXCHANNELS];
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
// struct sccl::hardware::topology::topo::scclTopoSystem;
// Sort the system topology
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
// Print the system topology
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Compute the paths in the system
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
// // Free the system topology
// void scclTopoFree(struct scclTopoSystem* system);
// // Trim the system topology
// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
// // Compute the point-to-point channels
// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
// // Get the NVB GPU info for the given rank
// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
// // Check whether all paths in the system go through NVLink
// int scclTopoPathAllNVLink(struct scclTopoSystem* system);
// // Get network device info
// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
// // Check whether a point-to-point connection exists between two devices
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
// // Check whether GDR is used
// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// // Get intra-node network device info
// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
// // Get the link type between two CUDA devices
// scclResult_t scclTopoGetLinkType(
// struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
// // Check whether a flush is needed
// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
// // Check whether two devices are on the same network
// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
// // Check whether PXN is disabled
// int scclPxnDisable(struct scclComm* comm);
// // Get the intermediate ranks for PXN
// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
// // Get the local rank within the node
// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // Get the CPU affinity
// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// // Get CPU type info
// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// // Get the GPU count
// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// // Get the NVSwitch count
// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// // Get local network device info
// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// // Get the local GPU index
// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// // Initialize the search; must run before calling scclTopoCompute
// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
// // Compute the topology graph
// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Print the topology graph
// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Dump the topology graphs
// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
// // Preset the topology graphs
// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
// // Postset the topology graphs
// scclResult_t scclTopoPostset(
// struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
// // Tree-base postset of the topology graph
// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
// // Tune the model to the compute capability
// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);
// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
// #include "info.h"
// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
#include "channel.h"
#include "xml.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
struct scclTopoNodeList {
struct scclTopoNode* list[SCCL_TOPO_MAX_NODES];
int count;
};
static scclResult_t getPath(struct scclTopoSystem* system, struct scclTopoNode* node, int t, int64_t id, struct scclTopoLinkList** path) {
for(int i = 0; i < system->nodes[t].count; i++) {
if(system->nodes[t].nodes[i].id == id) {
*path = node->paths[t] + i;
return scclSuccess;
}
}
WARN("Could not find node of type %d id %lx", t, id);
return scclInternalError;
}
static scclResult_t scclTopoSetPaths(struct scclTopoNode* baseNode, struct scclTopoSystem* system) {
if(baseNode->paths[baseNode->type] == NULL) {
SCCLCHECK(scclCalloc(baseNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
// breadth-first search to set all paths to that node in the system
struct scclTopoNodeList nodeList;
struct scclTopoNodeList nextNodeList;
nodeList.count = 1;
nodeList.list[0] = baseNode;
nextNodeList.count = 0;
struct scclTopoLinkList* basePath;
SCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->bw = LOC_BW;
basePath->type = PATH_LOC;
while(nodeList.count) {
nextNodeList.count = 0;
for(int n = 0; n < nodeList.count; n++) {
struct scclTopoNode* node = nodeList.list[n];
struct scclTopoLinkList* path;
SCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
for(int l = 0; l < node->nlinks; l++) {
struct scclTopoLink* link = node->links + l;
struct scclTopoNode* remNode = link->remNode;
if(remNode->paths[baseNode->type] == NULL) {
SCCLCHECK(scclCalloc(remNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
struct scclTopoLinkList* remPath;
SCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
float bw = std::min(path->bw, link->bw);
// allow routing through a GPU only as 1 hop
if(node != baseNode && node->type == GPU && (link->type != LINK_NVL || remNode->type != GPU || path->count > 1))
continue;
if((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
// Find reverse link
for(int l = 0; l < remNode->nlinks; l++) {
if(remNode->links[l].remNode == node) {
remPath->list[0] = remNode->links + l;
break;
}
}
if(remPath->list[0] == NULL) {
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
remNode->type,
remNode->id,
remNode->nlinks,
node->type,
node->id);
return scclInternalError;
}
// Copy the rest of the path
for(int i = 0; i < path->count; i++)
remPath->list[i + 1] = path->list[i];
remPath->count = path->count + 1;
remPath->bw = bw;
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Differentiate between one and multiple PCI switches
if(node->type == PCI && remNode->type == PCI)
type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if(link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU))
type = PATH_PHB;
// Set 1 hop NVLink as NVB
// if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
// Disallow GPUs as intermediate steps for now
if(remNode->type != GPU) {
int i;
for(i = 0; i < nextNodeList.count; i++)
if(nextNodeList.list[i] == remNode)
break;
if(i == nextNodeList.count)
nextNodeList.list[nextNodeList.count++] = remNode;
}
}
}
}
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
}
return scclSuccess;
}
/**
 * Print a node's path information.
 *
 * @param system pointer to the topology system
 * @param node   pointer to the node whose paths are printed
 *
 * Outputs the given node's path information, including the path type,
 * target node id, hop count, bandwidth and path-type string, formatted
 * as a single line.
 */
static void printNodePaths(struct scclTopoSystem* system, struct scclTopoNode* node) {
char line[1024];
sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
int offset = strlen(line);
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
if(node->paths[t] == NULL)
continue;
for(int n = 0; n < system->nodes[t].count; n++) {
sprintf(line + offset,
"%s/%lX (%d/%f/%s) ",
topoNodeTypeStr[t],
system->nodes[t].nodes[n].id,
node->paths[t][n].count,
node->paths[t][n].bw,
topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
}
}
INFO(SCCL_LOG_TOPO, "%s", line);
}
static scclResult_t getLocalCpu(struct scclTopoSystem* system, int gpu, int* retCpu) {
// Find the closest CPU to a GPU
int minHops = 0;
int localCpu = -1;
struct scclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
for(int c = 0; c < system->nodes[CPU].count; c++) {
int hops = paths[c].count;
if(minHops == 0 || hops < minHops) {
localCpu = c;
minHops = hops;
}
}
if(localCpu == -1) {
WARN("Error : could not find CPU close to GPU %d", gpu);
return scclInternalError;
}
*retCpu = localCpu;
return scclSuccess;
}
static scclResult_t addInterStep(struct scclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
struct scclTopoNode* cpuNode = system->nodes[tx].nodes + ix;
struct scclTopoNode* srcNode = system->nodes[t1].nodes + i1;
int l = 0;
// Node 1 -> CPU
for(int i = 0; i < srcNode->paths[tx][ix].count; i++)
srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
// CPU -> Node 2
for(int i = 0; i < cpuNode->paths[t2][i2].count; i++)
srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
// Update path characteristics
srcNode->paths[t2][i2].count = l;
srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
if(tx == GPU)
srcNode->paths[t2][i2].type = PATH_PXN;
srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
return scclSuccess;
}
// Remove/free paths for a given type
static void scclTopoRemovePathType(struct scclTopoSystem* system, int nodeType) {
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
// Remove links _to_ the given type
for(int n = 0; n < system->nodes[t].count; n++) {
struct scclTopoNode* node = system->nodes[t].nodes + n;
free(node->paths[nodeType]);
node->paths[nodeType] = NULL;
}
// Remove links _from_ the given type
for(int n = 0; n < system->nodes[nodeType].count; n++) {
struct scclTopoNode* node = system->nodes[nodeType].nodes + n;
free(node->paths[t]);
node->paths[t] = NULL;
}
}
}
static const int levelsOldToNew[] = {PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS};
scclResult_t scclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
if(*level == -1) {
int l = -1;
if(disableEnv) {
char* str = getenv(disableEnv);
if(str) {
int disable = strtol(str, NULL, 0);
if(disable == 1)
l = 0;
}
}
if(l == -1) {
char* str = getenv(levelEnv);
if(str) {
for(int i = 0; i <= PATH_SYS; i++) {
if(strcmp(str, topoPathTypeStr[i]) == 0) {
l = i;
break;
}
}
// Old style numbering
// levelsOldToNew is an array with each index corresponding to the
// "old level" int, and each value mapping to the correct value defined in topo.h
// maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
if(l == -1 && str[0] >= '0' && str[0] <= '9') {
int oldLevel = strtol(str, NULL, 0);
const int maxOldLevel = sizeof(levelsOldToNew) / sizeof(int) - 1;
if(oldLevel > maxOldLevel)
oldLevel = maxOldLevel;
l = levelsOldToNew[oldLevel];
}
}
}
if(l >= 0)
INFO(SCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
*level = l >= 0 ? l : -2;
}
return scclSuccess;
}
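// Usage sketch (illustrative), e.g. for the SCCL_NET_GDR_LEVEL lookup below:
//   SCCL_NET_GDR_LEVEL=PHB -> *level = PATH_PHB (matched against topoPathTypeStr)
//   SCCL_NET_GDR_LEVEL=3   -> *level = levelsOldToNew[3] = PATH_PHB (old-style numbering)
// With neither a disable flag nor a recognized value, *level is left at -2,
// which callers treat as "no user override".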
SCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
int scclTopoUserGdrLevel = -1;
scclResult_t scclTopoCheckGdr(struct scclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
*useGdr = 0;
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
struct scclTopoNode* net = system->nodes[NET].nodes + n;
SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
// Check that both the NIC and GPUs support it
if(net->net.gdrSupport == 0)
return scclSuccess;
if(gpu->gpu.gdrSupport == 0)
return scclSuccess;
if(read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = scclParamNetGdrRead();
if(gdrReadParam == 0)
return scclSuccess;
if(gdrReadParam < 0) {
int nvlink = 0;
// Since we don't know whether there are other communicators,
// it's better to keep things local if we have a single GPU.
if(system->nodes[GPU].count == 1)
nvlink = 1;
for(int i = 0; i < system->nodes[GPU].count; i++) {
if(i == g)
continue;
if(gpu->paths[GPU][i].type == PATH_NVL) {
nvlink = 1;
break;
}
}
if(!nvlink)
return scclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
SCCLCHECK(scclGetLevel(&scclTopoUserGdrLevel, NULL, "SCCL_NET_GDR_LEVEL"));
if(scclTopoUserGdrLevel != -2)
netGdrLevel = scclTopoUserGdrLevel;
else {
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
int i, d1 = -1, d2 = -1;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[GPU].nodes[g].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d1 = system->nodes[CPU].nodes[i].id;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[NET].nodes[n].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d2 = system->nodes[CPU].nodes[i].id;
if(d1 != -1 && d2 != -1 && d1 == d2 && (system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000)) {
netGdrLevel = PATH_PHB;
}
}
}
int distance = gpu->paths[NET][n].type;
if(distance == PATH_PXN) {
// In case of PXN, use the intermediate GPU distance instead
int proxyRank, g;
SCCLCHECK(scclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
SCCLCHECK(scclTopoRankToIndex(system, proxyRank, &g));
struct scclTopoNode* proxyGpu = system->nodes[GPU].nodes + g;
distance = proxyGpu->paths[NET][n].type;
}
if(distance > netGdrLevel) {
INFO(SCCL_NET, "GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
return scclSuccess;
}
*useGdr = 1;
INFO(SCCL_NET, "GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
return scclSuccess;
}
// Set to 0 to disable the flush on Hopper when using GDR
SCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
// Determine whether we need to flush the GDR recv buffers
scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush) {
int g;
SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
// Flush is required on Ampere and earlier
*flush = gpu->gpu.cudaCompCap < 90 ? 1 : scclParamNetForceFlush();
return scclSuccess;
}
SCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
// Check whether going through the network would be faster than going through P2P/SHM.
scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
if(scclParamNetDisableIntra() == 1) {
*net = 0;
return scclSuccess;
}
*net = 1;
// First check the current GPU-to-GPU speed.
int g1, g2;
if(scclTopoIdToIndex(system, GPU, id1, &g1) != scclSuccess || scclTopoIdToIndex(system, GPU, id2, &g2) != scclSuccess) {
return scclSuccess;
}
struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
float speed = gpu1->paths[GPU][g2].bw;
// Now check the speed at which each GPU can reach the network through PXB or better
float netSpeed1 = 0, netSpeed2 = 0;
for(int n = 0; n < system->nodes[NET].count; n++) {
struct scclTopoLinkList* path = gpu1->paths[NET] + n;
if(path->type <= PATH_PXB && path->bw > netSpeed1)
netSpeed1 = path->bw;
path = gpu2->paths[NET] + n;
if(path->type <= PATH_PXB && path->bw > netSpeed2)
netSpeed2 = path->bw;
}
if(netSpeed1 > speed && netSpeed2 > speed)
return scclSuccess;
*net = 0;
return scclSuccess;
}
scclResult_t scclTopoGetIntermediateRank(struct scclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
SCCLCHECK(scclTopoRankToIndex(system, rank, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
struct scclTopoLinkList* path = gpu->paths[NET] + n;
if(path->type == PATH_PXN) {
struct scclTopoNode* node;
int type = NVS;
for(int i = 0; i < path->count && type == NVS; i++) {
node = path->list[i]->remNode;
type = node->type;
}
if(type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return scclInternalError;
}
*intermediateRank = node->gpu.rank;
} else {
*intermediateRank = rank;
}
return scclSuccess;
}
SCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
// Net v4 plugins don't have non-blocking connect/accept, so we can't use
// remote proxies without risking deadlocks.
int scclPxnDisable(struct scclComm* comm) {
static int pxnDisable = -1;
if(pxnDisable == -1) {
if(comm && scclNetVersion(comm) == 4) {
INFO(SCCL_INIT, "PXN Disabled as plugin is v4");
pxnDisable = 1;
} else {
pxnDisable = scclParamPxnDisable();
}
}
return pxnDisable;
}
scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks) {
struct scclTopoSystem* system = comm->topo;
*nranks = 0;
*intermediateRanks = NULL;
if(system->nodes[NET].count == 0)
return scclSuccess;
int nr = 0;
int* ranks = NULL;
for(int rank = 0; rank < comm->nRanks; rank++) {
int netDev, proxyRank;
SCCLCHECK(scclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
if(proxyRank == comm->rank)
continue;
int useGdr;
SCCLCHECK(scclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
if(useGdr == 0)
continue;
int found = 0;
for(int r = 0; r < nr; r++) {
if(ranks[r] == proxyRank)
found = 1;
}
if(!found) {
SCCLCHECK(scclRealloc(&ranks, nr, nr + 1));
ranks[nr++] = proxyRank;
}
}
*nranks = nr;
*intermediateRanks = ranks;
return scclSuccess;
}
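// Check whether every GPU has a NIC at the given bus-id offset (in either
// direction); only if that holds for all GPUs, force those GPU<->NIC paths
// to PATH_PXB and return true.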
static bool rcclPathOverride(struct scclTopoSystem* system, uint64_t distance) {
int i, j;
for(i = 0; i < system->nodes[GPU].count; i++) {
for(j = 0; j < system->nodes[NET].count; j++) {
if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
(system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
break;
}
if(j >= system->nodes[NET].count)
break;
}
if(i >= system->nodes[GPU].count) {
for(i = 0; i < system->nodes[GPU].count; i++) {
for(j = 0; j < system->nodes[NET].count; j++) {
if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
(system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
system->nodes[GPU].nodes[i].paths[NET][j].type = PATH_PXB;
}
}
return true;
} else {
return false;
}
}
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);
scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm) {
int* domains;
int64_t* ids;
SCCLCHECK(scclCalloc(&domains, system->nodes[GPU].count));
SCCLCHECK(scclCalloc(&ids, system->nodes[GPU].count));
int myDomain = 0;
for(int g = 0; g < system->nodes[GPU].count; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
domains[g] = g;
ids[g] = gpu->id;
for(int p = 0; p < g; p++) {
if(gpu->paths[GPU][p].type < PATH_NET) {
domains[g] = std::min(domains[g], domains[p]);
}
}
if(gpu->gpu.rank == comm->rank)
myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
for(int i = 0; i < ngpus; i++) {
if(domains[i] == myDomain)
continue;
struct scclTopoNode* gpu = NULL;
int g;
for(g = 0; g < system->nodes[GPU].count /* This one varies over the loops */; g++) {
gpu = system->nodes[GPU].nodes + g;
if(gpu->id == ids[i])
break;
else
gpu = NULL;
}
if(gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return scclInternalError;
}
SCCLCHECK(scclTopoRemoveNode(system, GPU, g));
}
// Trim lower-speed ports on the same NIC (same ASIC): zero out their
// bandwidth so they are removed below.
for(int i = 0; i < system->nodes[NET].count; i++) {
for(int j = 0; j < system->nodes[NET].count; j++) {
if(i == j)
continue;
if(system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
if(system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw)
system->nodes[NET].nodes[j].net.bw = 0;
}
}
}
do {
int n;
for(n = 0; n < system->nodes[NET].count; n++) {
if(system->nodes[NET].nodes[n].net.bw == 0)
break;
}
if(n < system->nodes[NET].count) {
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
} else
break;
} while(system->nodes[NET].count);
int remove = 1;
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for(int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for(int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if(i == j)
continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
SCCLCHECK(scclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if(allXgmi)
system->type |= RCCL_TOPO_XGMI_ALL;
for(int g = 0; g < system->nodes[GPU].count; g++) {
int net;
SCCLCHECK(scclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if(!gdr)
break;
}
if(gdr && !allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(SCCL_LOG_TOPO, "GDR is available on all GPUs");
}
// Special handling of gfx94x
if(rcclParamEnableIntranet() == 1 || (rcclParamEnableIntranet() == -2 && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
system->nodes[GPU].count == 8 && system->nodes[NET].count == 8)) {
remove = 0;
system->type |= RCCL_TOPO_FORCE_INTRA;
}
comm->localRanks = system->nodes[GPU].count;
if(system->nodes[GPU].count == comm->nRanks && remove) {
for(int n = system->nodes[NET].count - 1; n >= 0; n--)
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return scclSuccess;
}
void scclTopoFree(struct scclTopoSystem* system) {
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
scclTopoRemovePathType(system, t);
free(system);
}
SCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
SCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", 4);
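// Estimate how many channels to use toward peerRank from local GPU g:
// -1 for self, a bandwidth-scaled count for XGMI-connected local peers,
// 2 for other local peers, and NCHANNELS_PER_NET_PEER for remote ranks.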
static scclResult_t scclTopoGetNchannels(struct scclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
int peer;
struct scclTopoLinkList* path = NULL;
if(scclTopoRankToIndex(system, peerRank, &peer) == scclSuccess) {
// Same rank
if(g == peer) {
*nChannels = -1;
return scclSuccess;
}
// Local rank
path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
if(path->type == PATH_NVL) {
float nvlBw = scclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
*nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2) * std::max(1, (int)(path->bw / nvlBw));
} else {
*nChannels = 2;
}
} else {
// Remote rank, use network
*nChannels = scclParamNChannelsPerNetPeer();
}
return scclSuccess;
}
SCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 4);
SCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
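// Round v up to the next power of two, e.g. nextPow2(5) == 8, nextPow2(8) == 8.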
static int nextPow2(int v) {
int pow2 = 1;
while(pow2 < v)
pow2 <<= 1;
return pow2;
}
scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
int MinP2pNchannels = (int)scclParamMinP2pNChannels();
int MaxP2pNchannels = (int)scclParamMaxP2pNChannels();
int NchannelsPerPeer = (int)scclParamNChannelsPerPeer();
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_P2P_NCHANNELS") == NULL)
MinP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_P2P_NCHANNELS") == NULL)
MaxP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_NCHANNELS_PER_PEER") == NULL)
NchannelsPerPeer = 32;
int scclMinP2pNchannels = MinP2pNchannels;
if(comm->sharedRes->owner != comm) {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, scclMinP2pNchannels), comm->sharedRes->tpP2pNChannels);
} else {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::max(comm->p2pnChannels, scclMinP2pNchannels);
}
int minChannels = comm->p2pnChannels;
// We need to loop through all local GPUs to have a global picture
for(int g = 0; g < comm->topo->nodes[GPU].count; g++) {
for(int r = 0; r < comm->nRanks; r++) {
int nChannels;
SCCLCHECK(scclTopoGetNchannels(comm->topo, g, r, &nChannels));
if(nChannels >= 0)
minChannels = std::min(minChannels, nChannels);
}
}
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));
// Round to next pow2 nChannelsPerPeer and nChannels
if(getNumaMaxGpus() == 1 && !scclTopoPathAllNVLink(comm->topo)) {
comm->p2pnChannelsPerPeer = nextPow2(comm->p2pnChannels);
} else {
comm->p2pnChannelsPerPeer = (NchannelsPerPeer == -2 ? nextPow2(minChannels) : NchannelsPerPeer);
}
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
// Init channels that weren't used so far
for(int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++)
SCCLCHECK(initChannel(comm, c));
// We want to spread out the channels we use when there aren't many, and
// progressively fill the whole nChannels space. To do so, we mirror the
// bits in the nChannels space.
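// For example, with p2pnChannels == 8: c=1 (0b001) -> 4 (0b100),
// c=2 (0b010) -> 2 (0b010), c=3 (0b011) -> 6 (0b110).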
for(int c = 0; c < comm->p2pnChannels; c++) {
int mirror = 0;
for(int b = 1, mb = (comm->p2pnChannels >> 1); b < comm->p2pnChannels; b <<= 1, mb >>= 1)
if(c & b)
mirror |= mb;
comm->p2pChannels[c] = mirror;
}
return scclSuccess;
}
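// Collect the ranks of all GPUs that this rank reaches over an NVB path
// (NVLink via an intermediate GPU).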
scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks) {
int ngpus = system->nodes[GPU].count;
SCCLCHECK(scclCalloc(ranks, ngpus));
int nvbGpus = 0;
for(int g = 0; g < ngpus; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
if(gpu->gpu.rank != rank)
continue;
for(int p = 0; p < ngpus; p++) {
if(gpu->paths[GPU][p].type == PATH_NVB) {
(*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
}
}
}
*nranks = nvbGpus;
return scclSuccess;
}
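// Note: this keys off the *minimum* path type across all GPU pairs, so it
// returns 1 as soon as any pair is connected at better than PATH_PIX.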
int scclTopoPathAllNVLink(struct scclTopoSystem* system) {
int minPath = PATH_DIS;
for(int i = 0; i < system->nodes[GPU].count; i++) {
struct scclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
for(int j = 0; j < system->nodes[GPU].count; j++) {
if(i == j)
continue;
minPath = std::min(minPath, paths[j].type);
}
}
return minPath >= PATH_PIX ? 0 : 1;
}
} // namespace graph
scclResult_t scclTopoPrintPaths(struct scclTopoSystem* system) {
for(int i = 0; i < system->nodes[GPU].count; i++) {
graph::printNodePaths(system, system->nodes[GPU].nodes + i);
}
for(int i = 0; i < system->nodes[NET].count; i++) {
graph::printNodePaths(system, system->nodes[NET].nodes + i);
}
return scclSuccess;
}
int scclTopoUserP2pLevel = -1;
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank) {
*p2p = 0;
if(read)
*read = 0;
if(intermediateRank)
*intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
SCCLCHECK(scclTopoIdToIndex(system, GPU, id1, &g1));
struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
if(scclTopoIdToIndex(system, GPU, id2, &g2) == scclInternalError) {
// GPU not found, we can't use p2p.
return scclSuccess;
}
int intermediateIndex = -1;
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct scclTopoLinkList* path = gpu1->paths[GPU] + g2;
if(path->count == 2) {
struct scclTopoNode* intermediateNode = path->list[0]->remNode;
if(intermediateNode->type == GPU) {
intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
if(intermediateRank)
*intermediateRank = intermediateNode->gpu.rank;
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
// User override
if(scclTopoUserP2pLevel == -1)
SCCLCHECK(scclGetLevel(&scclTopoUserP2pLevel, "SCCL_P2P_DISABLE", "SCCL_P2P_LEVEL"));
if(scclTopoUserP2pLevel != -2) {
p2pLevel = scclTopoUserP2pLevel;
goto compare;
}
// Don't use P2P through the CPU on ARM, Intel, or Zhaoxin hosts: cap the
// level at PXB.
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_ARM)
p2pLevel = PATH_PXB;
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
p2pLevel = PATH_PXB;
}
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
if(path->type <= p2pLevel)
*p2p = 1;
if(path->type == PATH_NVL) {
struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
// Enable P2P Read for Ampere/NVLink only
if(read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80))
*read = 1;
}
return scclSuccess;
}
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm) {
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
graph::scclTopoRemovePathType(system, t);
// Set direct paths to CPUs. We need them in many cases.
for(int c = 0; c < system->nodes[CPU].count; c++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[CPU].nodes + c, system));
}
// Set direct paths to GPUs.
for(int g = 0; g < system->nodes[GPU].count; g++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[GPU].nodes + g, system));
}
// Set direct paths to NICs.
for(int n = 0; n < system->nodes[NET].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NET].nodes + n, system));
}
// Set direct paths to NVSwitches.
for(int n = 0; n < system->nodes[NVS].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NVS].nodes + n, system));
}
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
for(int g = 0; g < system->nodes[GPU].count; g++) {
for(int p = 0; p < system->nodes[GPU].count; p++) {
int p2p;
SCCLCHECK(scclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
if(p2p == 0) {
// Divert all traffic through the CPU
int cpu;
SCCLCHECK(getLocalCpu(system, g, &cpu));
SCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
if(comm == NULL)
continue;
// Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
struct scclPeerInfo* dstInfo = comm->peerInfo + system->nodes[GPU].nodes[g].gpu.rank;
for(int p = 0; p < system->nodes[GPU].count; p++) {
if(p == g)
continue;
struct scclPeerInfo* srcInfo = comm->peerInfo + system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
SCCLCHECK(scclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if(p2p == 0) {
int shm;
SCCLCHECK(scclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
if(shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
}
}
}
}
// Special handling of gfx94x: on Hyper-V guests (the BIOS check is skipped
// when TOPO_EXPL is defined), override GPU<->NIC paths to PXB.
#if !defined(TOPO_EXPL)
char strValue[1024];
SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
if(strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
(system->nodes[GPU].count != system->nRanks))) {
if(!rcclPathOverride(system, 0x100000))
rcclPathOverride(system, 0x1000);
}
#if !defined(TOPO_EXPL)
}
#endif
// Update paths for NICs (no GPU Direct, PXN, ...)
for(int n = 0; n < system->nodes[NET].count; n++) {
struct scclTopoNode* netNode = system->nodes[NET].nodes + n;
for(int g = 0; g < system->nodes[GPU].count; g++) {
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
if(scclPxnDisable(comm) != 1) {
int localGpuIndex;
SCCLCHECK(scclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
if(localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct scclTopoNode* peerNode = system->nodes[GPU].nodes + localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if(peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
SCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
// Update the path when we don't want to / can't use GPU Direct RDMA.
int gdr;
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
if(gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
SCCLCHECK(getLocalCpu(system, g, &localCpu));
SCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
SCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
return scclSuccess;
}
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN + 5 * MAXWIDTH)
void dumpLine(int* values, int nranks, const char* prefix) {
int prefixlen = strlen(prefix);
char line[STRLENGTH + 1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
for(int i = 0; i < nranks && i < MAXWIDTH; i++)
sprintf(line + prefixlen + 4 * i, " %3d", values[i]);
INFO(SCCL_INIT, "%s", line);
}
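// Build, for each channel, the full ring order starting from this rank by
// following the next[] chain, then verify that the ring loops back to the
// start and contains every rank.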
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for(int r = 0; r < nrings; r++) {
char prefix[40];
/*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
dumpLine(next+r*nranks, nranks, prefix);*/
int current = rank;
for(int i = 0; i < nranks; i++) {
rings[r * nranks + i] = current;
current = next[r * nranks + current];
}
sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
if(rank == 0)
dumpLine(rings + r * nranks, nranks, prefix);
if(current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
return scclInternalError;
}
// Check that all ranks are there
for(int i = 0; i < nranks; i++) {
int found = 0;
for(int j = 0; j < nranks; j++) {
if(rings[r * nranks + j] == i) {
found = 1;
break;
}
}
if(found == 0) {
WARN("Error : ring %d does not contain rank %d", r, i);
return scclInternalError;
}
}
}
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include <algorithm>
#include <string.h>
#include "rome_models.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
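// Pre-tuned topology models for known Rome-class platforms, matched by GPU/NIC
// bus ids and NUMA placement. Field encodings (as used by the tables below):
// - connMatrix: flattened nGpus x nGpus matrix; entry [i][j] is the number of
//   direct links between GPU i and GPU j (0 = no direct link).
// - gdrLevel:   flattened nGpus x nNics matrix of PATH_* types between each
//   GPU and each NIC.
// - pattern:    two characters per CPU node, <#GPUs><#NICs> attached to it;
//   e.g. "10302120" = four nodes with 1/0, 3/0, 2/1 and 2/0 GPUs/NICs
//   (counts above 9 apparently continue past '9' in ASCII: '@' = 16).
// - ringBase:   '|'-separated rings, each an ordered list of GPU indices;
//   "N<i>" marks where NIC i enters/exits the ring.
// - treeBase:   optional '|'-separated trees in nested-parentheses form.
// - options:    comma-separated key=value tuning flags.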
struct scclRomeModel {
int nGpus;
int nCpus;
int nNics;
int nLinks;
int64_t gpuIds[SCCL_TOPO_MAX_NODES];
int64_t nicIds[SCCL_TOPO_MAX_NODES];
int64_t gpuNuma[SCCL_TOPO_MAX_NODES];
int64_t nicNuma[SCCL_TOPO_MAX_NODES];
uint8_t connMatrix[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES];
uint8_t gdrLevel[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES];
const char* pattern;
const char* ringBase;
const char* options;
const char* treeBase;
};
static struct scclRomeModel rome_model_22 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 2,
.gpuIds =
{
0x3000,
0x43000,
0x26000,
0xc3000,
0x83000,
0x23000,
0xc6000,
0xa3000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
0,
1,
2,
3,
1,
2,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
.options = "",
};
static struct scclRomeModel rome_model_25 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
.options = "",
};
static struct scclRomeModel rome_model_27 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_29 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
.options = "",
};
static struct scclRomeModel rome_model_31 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
.options = "",
};
static struct scclRomeModel rome_model_33 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
.options = "",
};
static struct scclRomeModel rome_model_30 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5",
.options = "",
};
static struct scclRomeModel rome_model_32 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_24 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6",
.options = "",
};
static struct scclRomeModel rome_model_26 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_23 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1",
.options = "",
};
static struct scclRomeModel rome_model_38 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6",
.options = "",
};
static struct scclRomeModel rome_model_28 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
.options = "",
};
static struct scclRomeModel rome_model_40 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
.options = "",
};
static struct scclRomeModel rome_model_42 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
.options = "",
};
static struct scclRomeModel rome_model_44 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds =
{
0xc4000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "20202120",
.ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
.options = "",
};
static struct scclRomeModel rome_model_45 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "",
};
static struct scclRomeModel rome_model_46 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_48 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
.options = "",
};
static struct scclRomeModel rome_model_49 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
0,
1,
2,
3,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
static struct scclRomeModel rome_model_52 = {
.nGpus = 8,
.nCpus = 1,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc5000,
0xc9000,
0xcd000,
0xd1000,
0xd5000,
0xd9000,
0xdd000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "80",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_53 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
1,
1,
3,
3,
5,
5,
7,
7,
},
.nicNuma =
{
1,
3,
5,
7,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
static struct scclRomeModel rome_model_43 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 "
"7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "treeDefined=1",
.treeBase =
"(2(5(6(7(4))))(3(0(1))))|(2(5(7(6(4))))(0(1(3))))|(2(5(7(4(6))))(1(3(0))))|(6(1(0(2(3))))(7(4(5))))|(6(1(2(0(3))))(4(5(7))))|(6(1(0(3(2))))(5(7(4))))|"
"(1(6(7(5(4))))(2(3(0))))|(1(6(4(7(5))))(3(2(0))))|(1(6(5(4(7))))(3(0(2))))|(5(2(3(1(0))))(4(6(7))))|(5(2(0(3(1))))(6(4(7))))|(5(2(1(0(3))))(4(7(6))))",
};
static struct scclRomeModel rome_model_55 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x100000,
0x200000,
0x300000,
0x400000,
0x500000,
0x600000,
0x700000,
0x800000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|2 3 0 1 6 7 4 5|5 4 7 6 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_56 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 0,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel = {},
.pattern = "40404040",
.ringBase = "0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4|0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4|0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1|4 5 13 12 8 9 11 10 14 15 7 "
"6 2 3 1 0|4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0|1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0",
.options = "pivotA2AEnabled=1,pivotA2ANumBiRings=3,tuning=1,mscclEnabled=1,treeDefined=1",
.treeBase = "(0(1(3(2(6(7(15(14(10))))))))(4(5(13(12(8(9(11))))))))|(2(3(7(6(13(12(8(9(10))))))))(1(0(4(5(14(15(11))))))))|(14(15(11(10(8(9(13(12(4))))))))"
"(6(7(3(2(0(1(5))))))))|(10(11(9(8(12(13(5(4(0))))))))(14(15(7(6(2(3(1))))))))|(10(11(15(14(5(4(0(1(2))))))))(9(8(12(13(6(7(3))))))))|(4(5(1(0("
"2(3(7(6(14))))))))(12(13(9(8(10(11(15))))))))|(6(7(15(14(10(11(9(8(12))))))))(2(3(1(0(4(5(13))))))))|(13(12(8(9(10(11(15(14(5))))))))(6(7(3(2("
"1(0(4))))))))|(8(9(13(12(4(5(1(0(2))))))))(10(11(15(14(6(7(3))))))))|(12(13(5(4(0(1(3(2(6))))))))(8(9(11(10(14(15(7))))))))|(5(4(0(1(2(3(7(6("
"13))))))))(14(15(11(10(9(8(12))))))))|(2(3(7(6(14(15(11(10(8))))))))(0(1(5(4(12(13(9))))))))",
};
static struct scclRomeModel rome_model_58 = {
.nGpus = 8,
.nCpus = 3,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "402020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_59 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
static struct scclRomeModel rome_model_62 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_63 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_65 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
static struct scclRomeModel rome_model_66 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds = {},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 6 7 5 4 2 3 1|1 3 2 4 5 7 6 0|0 1 7 6 2 3 5 4|4 5 3 2 6 7 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_67 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa1000,
0xa2000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
3,
3,
},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "4242",
.ringBase = "N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N1 2 3 5 4 0 1 7 6 N3|N2 4 5 3 2 6 7 1 0 N0|N1 3 2 4 5 7 6 0 1 N0|N0 1 0 6 7 5 4 2 3 N1|N0 0 1 7 "
"6 2 3 5 4 N2|N3 6 7 1 0 4 5 3 2 N1",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_68 = {
.nGpus = 16,
.nCpus = 1,
.nNics = 16,
.nLinks = 3,
.gpuIds =
{
0xcf000,
0xd4000,
0xd5000,
0xd6000,
0xd0000,
0xd1000,
0xd2000,
0xd3000,
0xf0000,
0xf1000,
0xf2000,
0xf3000,
0xf4000,
0xf5000,
0xf6000,
0xf7000,
},
.nicIds =
{
0xcd000,
0xc8000,
0xc9000,
0xcb000,
0xcc000,
0xce000,
0xc7000,
0xca000,
0xe8000,
0xe9000,
0xea000,
0xeb000,
0xec000,
0xed000,
0xee000,
0xef000,
},
.gpuNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.nicNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX,
},
.pattern = "@@",
.ringBase = "N0 0 1 2 3 N3 N4 4 5 6 7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 "
"N2 N5 5 7 4 6 N6 N9 9 11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1|N0 0 1 2 3 N3 N4 4 5 6 "
"7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 N2 N5 5 7 4 6 N6 N9 9 "
"11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_71 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 2 3 7 6|6 7 3 2 4 5 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_72 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa0000,
0xa1000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
1,
1,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "4242",
.ringBase = "N0 0 1 3 2 4 5 7 6 N3|N1 2 3 1 0 6 7 5 4 N2|N3 7 6 0 1 5 4 2 3 N1|N0 1 0 6 7 3 2 4 5 N2|N2 4 5 7 6 0 1 3 2 N1|N3 6 7 5 4 2 3 1 0 N0|N2 5 4 2 "
"3 7 6 0 1 N0|N1 3 2 4 5 1 0 6 7 N3",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_73 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_74 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_76 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x26000,
0x2d000,
0x5000,
0xc000,
0xab000,
0xb4000,
0x8b000,
0x94000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 3 2 4 5 7 6 N6|N2 2 3 1 0 6 7 5 4 N4|N5 5 4 2 3 7 6 0 1 N1|N1 1 0 6 7 3 2 4 5 N5|N4 4 5 7 6 0 1 3 2 N2|N2 2 3 1 0 6 7 5 4 N4|N0 0 1 5 "
"4 2 3 7 6 N6|N3 3 2 4 5 1 0 6 7 N7|N4 4 5 7 6 0 1 3 2 N2|N6 6 7 5 4 2 3 1 0 N0|N7 7 6 0 1 5 4 2 3 N3|N6 6 7 3 2 4 5 1 0 N0|N3 3 2 0 1 5 4 6 7 "
"N7|N1 1 0 2 3 7 6 4 5 N5|N5 5 4 6 7 3 2 0 1 N1|N7 7 6 4 5 1 0 2 3 N3",
.options = "disableNumaMatching=1,tuning=3",
};
static struct scclRomeModel rome_model_79 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 7,
.gpuIds =
{
0x1d000,
0x2e000,
0x3f000,
0x61000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 "
"0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
.options = "noCpuCheck=1,mscclEnabled=1",
};
static struct scclRomeModel rome_model_80 = {
.nGpus = 4,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x82000,
0xc2000,
0x2000,
0x42000,
},
.nicIds =
{
0x81000,
0xc1000,
0x1000,
0x41000,
},
.gpuNuma =
{
2,
3,
0,
1,
},
.nicNuma =
{
2,
3,
0,
1,
},
.connMatrix =
{
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11111111",
.ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 "
"N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_81 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 7,
.gpuIds =
{
0xc000,
0x22000,
0x38000,
0x5c000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds =
{
0x7000,
0x1d000,
0x33000,
0x57000,
0x9a000,
0xaa000,
0xba000,
0xda000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 "
"4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5|N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 "
"N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5",
.options = "noCpuCheck=1,mscclEnabled=1",
};
static struct scclRomeModel romeTopoModels[] = {
rome_model_22, rome_model_25, rome_model_27, rome_model_29, rome_model_31, rome_model_33, rome_model_30, rome_model_32, rome_model_24,
rome_model_26, rome_model_23, rome_model_38, rome_model_28, rome_model_40, rome_model_42, rome_model_44, rome_model_45, rome_model_46,
rome_model_48, rome_model_49, rome_model_52, rome_model_53, rome_model_43, rome_model_55, rome_model_56, rome_model_58, rome_model_59,
rome_model_62, rome_model_63, rome_model_65, rome_model_66, rome_model_67, rome_model_68, rome_model_71, rome_model_72, rome_model_73,
rome_model_74, rome_model_76, rome_model_79, rome_model_80, rome_model_81,
};
/* Parse user defined rings. Format is like:
 * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
 * Network interfaces can optionally be specified with an "N" prefix.
 * Rings with a non-matching number of GPUs are ignored so that rings for
 * multiple configurations can be provided in one string.
 */
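/* Illustrative walkthrough (assuming a 4-GPU system): the string
 * "N0 0 2 3 1 N1|1 3 2 0" yields two channels. The first enters on net 0,
 * visits GPUs 0->2->3->1 (after any gpu_map/net_map remapping) and exits on
 * net 1; the second lists no nets, so its inter endpoints are assigned
 * round-robin from the available NICs further below in this function. */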
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map) {
int gpus[SCCL_TOPO_MAX_NODES];
int nChannels = 0;
int gpu = 0;
int offset = 0;
int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET, 3: inside NET
int nets[SCCL_TOPO_MAX_NODES * 2];
int net_offset = 0, net_count = 0;
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
do {
if(str[offset] == 'N') {
if(status == 0) {
status = 2;
}
} else {
int digit = str[offset] - '0';
if(digit >= 0 && digit <= 9) {
switch(status) {
case 0:
gpus[gpu] = digit;
status = 1;
break;
case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
case 2:
nets[net_offset] = digit + 'N';
status = 3;
break;
case 3: nets[net_offset] = (nets[net_offset] - 'N') * 10 + digit + 'N'; break;
}
} else {
if(status == 1) {
gpu++;
net_offset = 2 * gpu - 1;
if(gpu >= SCCL_TOPO_MAX_NODES) // bail out before the next write to gpus[] would overflow
goto end;
} else if(status == 2 || status == 3) {
net_offset++;
net_count++;
if(net_offset > ngpus * 2)
goto end;
}
status = 0;
if(str[offset] == '|' || str[offset] == '\0') {
// Ignore if ngpus doesn't match
if(gpu != ngpus)
goto newchannel;
// Ignore the channel if net_count is odd (it must be 0 or an even number)
if(net_count && net_count % 2)
goto newchannel;
for(int r = 0; r < ngpus; r++) {
int g = gpus[r];
// Ignore if gpus are out of bounds
if(g < 0 || g >= ngpus)
goto newchannel;
// Ignore if gpus are duplicate
for(int i = 0; i < r; i++)
if(gpus[i] == g)
goto newchannel;
// remap if needed
if(gpu_map)
g = gpu_map[g];
// Translate gpu numbers into ranks
int j = 0;
for(j = 0; j < ngpus; j++)
if(g == system->nodes[GPU].nodes[j].gpu.dev)
break;
if(j < ngpus)
graph->intra[nChannels * ngpus + r] = system->nodes[GPU].nodes[j].gpu.rank;
else
return scclInternalError;
}
if(net_count) {
for(int i = 0; net_map && i < ngpus * 2; i++) {
if(nets[i] - 'N' < 0 || nets[i] - 'N' >= nnets)
continue;
nets[i] = net_map[nets[i] - 'N'] + 'N';
}
memcpy(&graph->intraNets[ngpus * nChannels * 2], nets, ngpus * 2 * sizeof(int));
graph->nIntraChannels++;
if(nets[0] - 'N' >= nnets || nets[ngpus * 2 - 1] - 'N' >= nnets)
goto newchannel;
graph->inter[nChannels * 2] = nets[0] - 'N';
graph->inter[nChannels * 2 + 1] = nets[ngpus * 2 - 1] - 'N';
} else if(nnets) {
graph->inter[nChannels * 2] = system->nodes[NET].nodes[nChannels % nnets].id;
graph->inter[nChannels * 2 + 1] = system->nodes[NET].nodes[(nChannels + 1) % nnets].id;
}
nChannels++;
newchannel:
gpu = 0;
net_offset = 0;
net_count = 0;
}
}
}
} while(str[offset++] != 0);
end:
graph->nChannels = nChannels;
graph->bwIntra = graph->bwInter = system->totalBw / nChannels;
if(graph->id == 1) {
for(int i = 0; i < graph->nChannels; i++) {
int net;
scclTopoGetLocalNet(system, graph->intra[i * ngpus + 1], i, &net);
graph->inter[i * 2 + 1] = net;
}
}
#if 0
for (int i=0; i<graph->nChannels; i++) {
printf("%d: ", i);
printf ("NET/%d ", graph->inter[i*2]);
for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
printf ("NET/%d ", graph->inter[i*2+1]);
printf("\n");
}
#endif
return scclSuccess;
}
/* Parse user defined treeBase for complicated trees. Format is like:
 * "(4(2(3)(1))(6(5)))"
 *
 * Trees with a non-matching number of GPUs are ignored so that trees for
 * multiple configurations can be provided in one string.
 */
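/* Illustrative reading of the example above: "(4(2(3)(1))(6(5)))" describes a
 * tree rooted at GPU 4 whose children are GPUs 2 and 6; GPU 2 in turn has
 * children 3 and 1, and GPU 6 has the single child 5. parseGraphLight() only
 * rewrites the GPU numbers through gpu_map; the parentheses are copied into
 * graph->treeBase verbatim. */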
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map) {
int gpus[SCCL_TOPO_MAX_NODES]; // transcribe/change according to gpu_map
int nChannels = 0;
int gpu = 0;
int offset = 0;
int start_offset = offset;
if(str[0] == 0) {
graph->treeBase[0][0] = 0;
return scclSuccess;
}
int status = 0; // 0 : between numbers, 1 : inside number
int ngpus = system->nodes[GPU].count;
int x = 0; // the write cursor y is declared where it is used, inside the per-tree block below
do {
int digit = str[offset] - '0';
if(digit >= 0 && digit <= 9) {
switch(status) {
case 0:
gpus[gpu] = digit;
status = 1;
break;
case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
}
} else {
if(status == 1) {
gpu++;
}
status = 0;
if(str[offset] == '|' || str[offset] == 0) {
int r = 0, y = 0;
while(start_offset < offset) {
// for (int r=0; r<gpu; r++) {
if(str[start_offset] == '(' || str[start_offset] == ')') {
graph->treeBase[x][y] = str[start_offset];
y++;
start_offset++;
} else {
int g = gpus[r];
// remap if needed
if(gpu_map)
g = gpu_map[g];
r++;
int j = 0;
// Translate gpu numbers into ranks
for(j = 0; j < ngpus; j++)
if(g == system->nodes[GPU].nodes[j].gpu.dev)
break;
if(j < ngpus) {
while(str[start_offset] != '(' && str[start_offset] != ')')
start_offset++;
char number_str[10];
sprintf(number_str, "%d", g);
int k = 0;
while(number_str[k] != 0) {
graph->treeBase[x][y] = number_str[k];
y++;
k++;
}
} else
return scclInternalError;
}
}
graph->treeBase[x][y] = 0;
x++;
gpu = 0;
start_offset = offset + 1;
}
}
} while(str[offset++] != 0);
graph->treeBase[x][0] = 0;
return scclSuccess;
}
#define MAX_OPT_TOKENS 10
extern const char* topoPathTypeStr[];
static void parseOptions(struct scclTopoSystem* system, const char* options) {
if(strcmp(options, "")) {
char* str_temp = (char*)malloc(strlen(options) + 1);
strcpy(str_temp, options);
char* tokens[MAX_OPT_TOKENS];
int numTokens = 0;
char* state;
tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
numTokens++;
while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
for(int i = 0; i < numTokens / 2; i++) {
if(strcmp(tokens[i * 2], "netGdrLevel") == 0) {
int j;
for(j = 0; j <= PATH_SYS; j++) {
if(strcmp(tokens[i * 2 + 1], topoPathTypeStr[j]) == 0)
break;
}
if(j <= PATH_SYS)
system->netGdrLevel = j;
else {
system->netGdrLevel = -2;
WARN("invalid netGdrLevel: %s", tokens[i * 2 + 1]);
}
} else if(strcmp(tokens[i * 2], "pivotA2AEnabled") == 0) {
system->pivotA2AEnabled = (bool)atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "pivotA2ANumBiRings") == 0) {
system->pivotA2ANumBiRings = atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "tuning") == 0) {
system->tuning = atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "ll128Enabled") == 0) {
system->ll128Enabled = (bool)atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "baseBw") == 0) {
system->baseBw = std::stof(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "mscclEnabled") == 0) {
system->mscclEnabled = (bool)atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "treeDefined") == 0) {
system->treeDefined = (bool)atol(tokens[i * 2 + 1]);
}
}
free(str_temp);
}
}
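/* Usage sketch (hypothetical option string): parseOptions(system,
 * "disableNumaMatching=1,tuning=2") tokenizes on '=', ',' and ' ' into
 * key/value pairs, leaving system->tuning == 2; keys it does not handle, such
 * as disableNumaMatching, are simply skipped here and are instead consumed by
 * checkOption() during model matching. */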
static bool checkOption(const char* options, const char* name) {
if(strcmp(options, "")) {
char* str_temp = (char*)malloc(strlen(options) + 1);
strcpy(str_temp, options);
char* tokens[MAX_OPT_TOKENS];
int numTokens = 0;
char* state;
tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
numTokens++;
while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
bool result = false;
for(int i = 0; i < numTokens / 2; i++) {
if(strcmp(tokens[i * 2], name) == 0) {
result = (bool)atol(tokens[i * 2 + 1]);
break;
}
}
free(str_temp); // free on every path; the previous early return leaked str_temp
return result;
}
return false;
}
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
static const char* ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
int id[8], dist[8];
int i;
int ngpus = system->nodes[GPU].count;
if(ngpus != 8)
return scclSuccess;
// validate chordal ring and calculate distance
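// (Arithmetic behind dist[]: device ids 0..ngpus-1 sum to ngpus*(ngpus-1)/2;
// subtracting the local device id and every XGMI-connected peer's id leaves
// the id of the single unconnected peer, which is stored as dist[i].)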
for(i = 0; i < ngpus; i++) {
struct scclTopoNode* node = system->nodes[GPU].nodes + i;
if(node->paths[GPU] == NULL)
continue;
int sum = ngpus * (ngpus - 1) / 2 - node->gpu.dev;
int count = 0;
for(int n = 0; n < ngpus; n++) {
struct scclTopoLink* link;
for(link = node->links; link->remNode; link++) {
if(link->remNode->gpu.dev == n)
break;
}
if(!link->remNode)
continue;
if(link->type != LINK_NVL)
continue;
sum -= system->nodes[GPU].nodes[n].gpu.dev;
count++;
}
if(count != ngpus - 2 || sum < 0 || sum > ngpus - 1) {
return scclSuccess;
}
dist[i] = sum;
}
// remap GPU ids
for(i = 0; i < ngpus; i++)
id[i] = i;
for(i = 0; i < ngpus; i++) {
if(dist[i] == ngpus - 1 - i)
continue;
int j, m, n, temp;
for(j = i + 1; j < ngpus; j++)
if(dist[j] == ngpus - 1 - i)
break;
m = dist[i];
n = dist[j];
dist[i] = n;
dist[j] = m;
temp = id[m];
id[m] = id[n];
id[n] = temp;
temp = dist[m];
dist[m] = dist[n];
dist[n] = temp;
}
// create chordal ring based on reference and remapped ids
system->type |= RCCL_TOPO_CR8G;
SCCLCHECK(parseGraph(ringBase, system, graph, id, NULL));
if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
SCCLCHECK(scclCalloc(&intra, ngpus));
SCCLCHECK(scclCalloc(&used, system->nodes[NET].count));
for(int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n * 2] = graph->inter[n * 2 + 1] = n;
struct scclTopoNode* net = system->nodes[NET].nodes + n;
struct scclTopoLinkList* paths = net->paths[GPU];
// find the first unused GPU that is closest to the NIC
int f, m;
for(f = 0; f < ngpus; f++) {
int j = 0;
for(j = 0; j < n; j++)
if(used[j] == system->nodes[GPU].nodes[f].gpu.rank)
break;
if(j >= n)
break;
}
for(int i = 0; i < ngpus; i++) {
int j = 0;
for(j = 0; j < n; j++)
if(used[j] == system->nodes[GPU].nodes[i].gpu.rank)
break;
if(j < n)
continue;
if(paths[i].count < paths[f].count)
f = i;
}
for(m = 0; m < ngpus; m++)
if(graph->intra[n * ngpus + m] == system->nodes[GPU].nodes[f].gpu.rank)
break;
used[n] = graph->intra[n * ngpus + m];
for(int i = 0; i < ngpus; i++)
intra[i] = graph->intra[n * ngpus + ((i + m) % ngpus)];
for(int i = 0; i < ngpus; i++)
graph->intra[n * ngpus + i] = intra[i];
}
free(used);
free(intra);
}
return scclSuccess;
}
static scclResult_t parseRomeSystem(struct scclTopoSystem* system, struct scclRomeModel* romeTopo, char* pattern) {
pattern[0] = 0; // pattern is left as an empty string for an invalid topology
romeTopo->nGpus = system->nodes[GPU].count;
romeTopo->nCpus = system->nodes[CPU].count;
romeTopo->nNics = system->nodes[NET].count;
romeTopo->nLinks = 0;
struct scclGpuIdHIP {
int g;
int dev;
};
auto cmpIds = [](const void* g1, const void* g2) {
struct scclGpuIdHIP* s1 = (struct scclGpuIdHIP*)g1;
struct scclGpuIdHIP* s2 = (struct scclGpuIdHIP*)g2;
return s1->dev - s2->dev;
};
struct scclCpuNuma {
int c;
uint64_t numa;
};
auto cmpNuma = [](const void* g1, const void* g2) {
struct scclCpuNuma* s1 = (struct scclCpuNuma*)g1;
struct scclCpuNuma* s2 = (struct scclCpuNuma*)g2;
return (int)(s1->numa - s2->numa);
};
struct scclNetId {
int n;
uint64_t id;
};
auto cmpNets = [](const void* g1, const void* g2) {
struct scclNetId* s1 = (struct scclNetId*)g1;
struct scclNetId* s2 = (struct scclNetId*)g2;
return (int)(s1->id - s2->id);
};
// sort GPU devices by HIP device ID
struct scclGpuIdHIP gpu_scores[SCCL_TOPO_MAX_NODES];
for(int i = 0; i < romeTopo->nGpus; i++) {
gpu_scores[i].g = i;
gpu_scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev;
}
qsort(gpu_scores, romeTopo->nGpus, sizeof(struct scclGpuIdHIP), cmpIds);
// sort CPU devices by NUMA id
struct scclCpuNuma cpu_scores[SCCL_TOPO_MAX_NODES];
for(int i = 0; i < romeTopo->nCpus; i++) {
cpu_scores[i].c = i;
cpu_scores[i].numa = system->nodes[CPU].nodes[i].id;
}
qsort(cpu_scores, romeTopo->nCpus, sizeof(struct scclCpuNuma), cmpNuma);
// sort NET devices by id
struct scclNetId net_scores[SCCL_TOPO_MAX_NODES];
for(int i = 0; i < romeTopo->nNics; i++) {
net_scores[i].n = i;
net_scores[i].id = system->nodes[NET].nodes[i].id;
}
qsort(net_scores, romeTopo->nNics, sizeof(struct scclNetId), cmpNets);
for(int i = 0; i < romeTopo->nGpus; i++) {
int gpu, n, m, distance;
gpu = gpu_scores[i].g;
romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
m = 0;
distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
for(n = 1; n < romeTopo->nCpus; n++) {
if(system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
m = n;
}
}
if(m < romeTopo->nCpus)
romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
struct scclTopoNode* node = system->nodes[GPU].nodes + gpu;
if(node->paths[GPU] == NULL)
continue;
int count = 0;
for(n = 0; n < romeTopo->nGpus; n++) {
romeTopo->connMatrix[i * romeTopo->nGpus + n] = 0;
struct scclTopoLink* link;
for(link = node->links; link->remNode; link++) {
if(link->remNode->gpu.dev == n)
break;
}
if(!link->remNode)
continue;
if(link->type != LINK_NVL)
continue;
romeTopo->connMatrix[i * romeTopo->nGpus + n] = link->bw / scclTopoXGMISpeed(node->gpu.gcn);
count++;
}
if(romeTopo->nLinks < count)
romeTopo->nLinks = count;
}
for(int i = 0; i < romeTopo->nNics; i++) {
int n, m, distance;
m = 0;
int net = net_scores[i].n;
romeTopo->nicIds[i] = system->nodes[NET].nodes[net].net.busId;
distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
for(n = 0; n < romeTopo->nCpus; n++)
if(system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
m = n;
}
if(m < romeTopo->nCpus)
romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
else
return scclSuccess;
}
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
for(int i = 0; i < romeTopo->nCpus; i++) {
uint64_t id = system->nodes[CPU].nodes[cpu_scores[i].c].id;
int g = 0, n = 0;
for(int j = 0; j < romeTopo->nGpus; j++)
if(romeTopo->gpuNuma[j] == id)
g++;
for(int j = 0; j < romeTopo->nNics; j++)
if(romeTopo->nicNuma[j] == id)
n++;
pattern[i * 2] = '0' + g;
pattern[i * 2 + 1] = '0' + n;
}
pattern[romeTopo->nCpus * 2] = 0;
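// (Pattern encoding: two characters per NUMA node, '0'+<#GPUs> followed by
// '0'+<#NICs>, so e.g. "4242" means two nodes with 4 GPUs and 2 NICs each.
// Counts above 9 simply continue through ASCII: '0'+16 == '@', which is how
// the "@@" pattern above encodes 16 GPUs and 16 NICs on one node.)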
// compute gdr level matrix
for(int i = 0; i < romeTopo->nNics; i++) {
int n = net_scores[i].n;
for(int j = 0; j < romeTopo->nGpus; j++) {
int g = gpu_scores[j].g;
romeTopo->gdrLevel[i * romeTopo->nGpus + j] = system->nodes[GPU].nodes[g].paths[NET][n].type;
}
}
const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
if(romeModelFile) {
INFO(SCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
FILE* file = fopen(romeModelFile, "w");
if(file == NULL) {
WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
return scclSuccess;
}
fprintf(file, "static struct scclRomeModel rome_model_ = {\n");
fprintf(file, " .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
fprintf(file, " .gpuIds = { ");
for(int i = 0; i < romeTopo->nGpus; i++)
fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
fprintf(file, "},\n");
fprintf(file, " .nicIds = { ");
for(int i = 0; i < romeTopo->nNics; i++)
fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
fprintf(file, "},\n");
fprintf(file, " .gpuNuma = { ");
for(int i = 0; i < romeTopo->nGpus; i++)
fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
fprintf(file, "},\n");
fprintf(file, " .nicNuma = { ");
for(int i = 0; i < romeTopo->nNics; i++)
fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
fprintf(file, "},\n");
fprintf(file, " .connMatrix = { ");
for(int i = 0; i < romeTopo->nGpus; i++)
for(int n = 0; n < romeTopo->nGpus; n++)
fprintf(file, "%d, ", romeTopo->connMatrix[i * romeTopo->nGpus + n]);
fprintf(file, "},\n");
fprintf(file, " .gdrLevel = { ");
for(int i = 0; i < romeTopo->nNics; i++)
for(int n = 0; n < romeTopo->nGpus; n++)
fprintf(file, "PATH_%s, ", topoPathTypeStr[romeTopo->gdrLevel[i * romeTopo->nGpus + n]]);
fprintf(file, "},\n");
fprintf(file, " .pattern = \"%s\",\n", pattern);
fprintf(file, " .ringBase = \"\",\n");
fprintf(file, " .options = \"\",\n");
fprintf(file, "};\n");
fclose(file);
}
return scclSuccess;
}
static bool permuteGpuIds(int* g, int n, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool nbio, bool ignore_numa) {
(*time)++;
if(n == last) {
int i, j;
// match GPU numa
if(!ignore_numa) {
for(i = 0; i < ref->nGpus; i++)
if(ref->gpuNuma[i] != topo->gpuNuma[g[i]])
break;
if(i < ref->nGpus)
return false;
}
// match XGMI connection
for(i = 0; i < ref->nGpus; i++) {
for(j = 0; j < ref->nGpus; j++) {
if(ref->connMatrix[i * ref->nGpus + j] != topo->connMatrix[g[i] * ref->nGpus + g[j]])
break;
if((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0)
break;
}
if(j < ref->nGpus)
break;
}
if(i < ref->nGpus)
return false;
// match NBIO
if(nbio) {
for(i = 0; i < ref->nGpus; i++) {
for(j = 0; j < ref->nGpus; j++) {
if(i == j)
continue;
bool nbio_ref = (ref->gpuIds[i] & 0xf0000) == (ref->gpuIds[j] & 0xf0000);
bool nbio_topo = (topo->gpuIds[g[i]] & 0xf0000) == (topo->gpuIds[g[j]] & 0xf0000);
if(nbio_ref != nbio_topo)
break;
if(nbio_ref && ((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0))
break;
}
if(j < ref->nGpus)
break;
}
if(i < ref->nGpus)
return false;
}
return true;
} else {
for(int i = n; i <= last; i++) {
std::swap(g[n], g[i]);
if(permuteGpuIds(g, n + 1, last, ref, topo, time, nbio, ignore_numa))
return true;
std::swap(g[n], g[i]);
}
}
return false;
}
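// (permuteGpuIds and permuteNetIds below perform a recursive swap-based
// enumeration of permutations, returning at the first mapping that satisfies
// their matching constraints: NUMA/XGMI/NBIO for GPUs, NUMA and GDR level for
// NICs. *time counts the recursion steps for diagnostics.)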
static bool permuteNetIds(int* n, int* g, int s, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool ignore_numa) {
(*time)++;
if(s == last) {
int i, j;
// match NET numa
if(!ignore_numa) {
for(i = 0; i < ref->nNics; i++) {
if(ref->nicNuma[i] != topo->nicNuma[n[i]])
break;
}
if(i < ref->nNics)
return false;
}
// match gdr level
for(i = 0; i < ref->nNics; i++) {
for(j = 0; j < ref->nGpus; j++) {
if(ref->gdrLevel[i * ref->nGpus + j] != topo->gdrLevel[n[i] * ref->nGpus + g[j]])
break;
}
if(j < ref->nGpus)
break;
}
if(i < ref->nNics)
return false;
return true;
} else {
for(int i = s; i <= last; i++) {
std::swap(n[s], n[i]);
if(permuteNetIds(n, g, s + 1, last, ref, topo, time, ignore_numa))
return true;
std::swap(n[s], n[i]);
}
}
return false;
}
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
static char ringRemap[64];
int i;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
int nnets = system->nodes[NET].count;
if(ngpus > 8)
return scclSuccess;
// only valid on Rome
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
struct scclRomeModel romeTopo;
char pattern[256];
SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
// recognize system as Rome 4P2H even if no matching model
if(ngpus > 4 && romeTopo.nLinks)
system->type |= RCCL_TOPO_4P2H_ROME;
int g[SCCL_TOPO_MAX_NODES], n[SCCL_TOPO_MAX_NODES];
int time = 0;
struct timeval tvs, tve;
gettimeofday(&tvs, NULL);
// check if GPUs are directly connected to CPU
bool match_nbio = true;
for(i = 0; i < romeTopo.nGpus; i++) {
int cpu, gpu;
SCCLCHECK(scclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu));
SCCLCHECK(scclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu));
if(system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2)
break;
}
if(i < romeTopo.nGpus)
match_nbio = false;
for(i = 0; i < sizeof(romeTopoModels) / sizeof(romeTopoModels[0]); i++) {
bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
if(!ignore_cpu && (arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME))
continue;
bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
if(!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus)
continue;
if(romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks)
continue;
if(!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern))
continue;
// permute GPU IDs
for(int j = 0; j < ngpus; j++)
g[j] = (j + 2) % ngpus;
if(!permuteGpuIds(g, 0, ngpus - 1, romeTopoModels + i, &romeTopo, &time, ignore_cpu ? false : match_nbio, ignore_numa))
continue;
if(nnets > 1) {
// permute NET IDs
for(int j = 0; j < nnets; j++)
n[j] = (j + 2) % nnets;
if(permuteNetIds(n, g, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, ignore_numa))
break;
} else
break;
}
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
if(i >= sizeof(romeTopoModels) / sizeof(romeTopoModels[0])) {
// printf("No solution in %.2fms (%d iter)\n", t, time);
return scclSuccess;
}
char line[1024];
// sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time);
sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
int offset = strlen(line);
for(int k = 0; k < ngpus; k++) {
sprintf(line + offset, "%d ", g[k]);
offset = strlen(line);
}
if(nnets > 1) {
sprintf(line + offset, "NET mapping: ");
offset = strlen(line);
for(int k = 0; k < nnets; k++) {
sprintf(line + offset, "%d ", n[k]);
offset = strlen(line);
}
}
INFO(SCCL_GRAPH, "%s", line);
parseOptions(system, romeTopoModels[i].options);
// create 4P2H based on reference and remapped ids
SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL));
if(romeTopoModels[i].treeBase != nullptr)
SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
return scclSuccess;
}
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUMA_CPUS 4
#define NUMA_GPUS 4
#define NUMA_PERMUTE_COUNT 24
#define TOTAL_PERMUTE_COUNT (NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT)
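// NUMA_PERMUTE_COUNT is 4! = 24 orderings of the 4 GPUs on one NUMA node, so
// TOTAL_PERMUTE_COUNT enumerates 24^4 = 331776 candidate 16-GPU mappings.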
static char ringRemap[256];
int i;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
int nnets = system->nodes[NET].count;
// only valid on Rome
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
return scclSuccess;
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
struct scclRomeModel romeTopo;
char pattern[256];
SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
// only match for system with 16 GPUs
if(ngpus != 16 || ncpus != NUMA_CPUS)
return scclSuccess;
int gcnt = 0;
int *g16, n[SCCL_TOPO_MAX_NODES];
int* all_gpu_permutations = (int*)malloc(TOTAL_PERMUTE_COUNT * NUMA_CPUS * NUMA_GPUS * sizeof(int));
struct timeval tvs, tve;
gettimeofday(&tvs, NULL);
for(i = 0; i < sizeof(romeTopoModels) / sizeof(romeTopoModels[0]); i++) {
if(romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics ||
romeTopo.nLinks != romeTopoModels[i].nLinks)
continue;
if(strcmp(romeTopoModels[i].pattern, pattern))
continue;
int j, r[ngpus], g[ngpus];
int numa_gpu_permutations[NUMA_CPUS][NUMA_PERMUTE_COUNT][NUMA_GPUS];
// permute GPUs for each CPU NUMA nodes
for(j = 0; j < ncpus; j++) {
int ngpusPerNuma = 0, cnt = 0, npermute = 0;
for(int k = 0; k < ngpus; k++) {
if(romeTopoModels[i].gpuNuma[k] != j)
continue;
r[ngpusPerNuma++] = k;
}
if(ngpusPerNuma == 0)
continue;
if(ngpusPerNuma != NUMA_GPUS)
break;
gcnt++;
// init GPU mapping
for(int k = 0; k < ngpus; k++) {
if(romeTopo.gpuNuma[k] != j)
continue;
g[(2 + cnt++) % ngpusPerNuma] = k;
}
std::sort(g, g + ngpusPerNuma);
do {
for(int n = 0; n < ngpusPerNuma; n++)
numa_gpu_permutations[j][npermute][n] = g[n];
npermute++;
} while(std::next_permutation(g, g + ngpusPerNuma));
if(npermute != NUMA_PERMUTE_COUNT)
break;
}
if(j < ncpus)
continue;
// permute GPUs for all CPU NUMA nodes
for(int a = 0; a < NUMA_PERMUTE_COUNT; a++) {
for(int b = 0; b < NUMA_PERMUTE_COUNT; b++) {
for(int c = 0; c < NUMA_PERMUTE_COUNT; c++) {
for(int d = 0; d < NUMA_PERMUTE_COUNT; d++) {
uint64_t offset = ((a * NUMA_PERMUTE_COUNT + b) * NUMA_PERMUTE_COUNT + c) * NUMA_PERMUTE_COUNT + d;
// offset = (offset+TOTAL_PERMUTE_COUNT/2)%TOTAL_PERMUTE_COUNT;
offset *= (NUMA_CPUS * NUMA_GPUS);
memcpy(all_gpu_permutations + offset, &numa_gpu_permutations[0][a][0], NUMA_GPUS * sizeof(int));
memcpy(all_gpu_permutations + offset + NUMA_GPUS, &numa_gpu_permutations[1][b][0], NUMA_GPUS * sizeof(int));
memcpy(all_gpu_permutations + offset + NUMA_GPUS * 2, &numa_gpu_permutations[2][c][0], NUMA_GPUS * sizeof(int));
memcpy(all_gpu_permutations + offset + NUMA_GPUS * 3, &numa_gpu_permutations[3][d][0], NUMA_GPUS * sizeof(int));
}
}
}
}
// match all GPUs' XGMI connection
int p;
for(p = 0; p < TOTAL_PERMUTE_COUNT; p++) {
g16 = all_gpu_permutations + p * NUMA_CPUS * NUMA_GPUS;
int k;
for(k = 0; k < romeTopoModels[i].nGpus; k++) {
int m;
for(m = 0; m < romeTopoModels[i].nGpus; m++) {
if(romeTopoModels[i].connMatrix[k * romeTopoModels[i].nGpus + m] != romeTopo.connMatrix[g16[k] * romeTopoModels[i].nGpus + g16[m]])
break;
}
if(m < romeTopoModels[i].nGpus)
break;
}
if(k < romeTopoModels[i].nGpus)
continue;
// printf("found match %d: ", p); for (int n = 0; n < NUMA_CPUS*NUMA_GPUS; n++) printf("%d ", g16[n]); printf("\n");
if(nnets > 1) {
// permute NET IDs
int time = 0;
for(int m = 0; m < nnets; m++)
n[m] = (m + 2) % nnets;
if(permuteNetIds(n, g16, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, false))
break;
} else
break;
}
if(p < TOTAL_PERMUTE_COUNT)
break;
}
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
if(i >= sizeof(romeTopoModels) / sizeof(romeTopoModels[0])) {
// printf("No solution in %.2fms\n", t);
free(all_gpu_permutations); // avoid leaking the permutation buffer on the no-match path
return scclSuccess;
}
char line[1024];
// sprintf(line, "Found matching Rome model index %d in %.2fms with GPU mapping: ", i, t);
sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
int offset = strlen(line);
for(int k = 0; k < ngpus; k++) {
sprintf(line + offset, "%d ", g16[k]);
offset = strlen(line);
}
if(nnets > 1) {
sprintf(line + offset, "NET mapping: ");
offset = strlen(line);
for(int k = 0; k < nnets; k++) {
sprintf(line + offset, "%d ", n[k]);
offset = strlen(line);
}
}
INFO(SCCL_GRAPH, "%s", line);
system->type |= RCCL_TOPO_16P1H;
parseOptions(system, romeTopoModels[i].options);
// create 16P1H based on reference and remapped ids
SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g16, nnets > 1 ? n : NULL));
if(romeTopoModels[i].treeBase != nullptr)
SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g16));
// clean up
free(all_gpu_permutations);
return scclSuccess;
}
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUM_HIVES 4
#define HIVE_GPUS 4
static char ringRemap[256];
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
// only valid on Rome
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
return scclSuccess;
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
struct scclRomeModel romeTopo;
char pattern[256];
SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
// only match for system with 16 GPUs
if(ngpus != NUM_HIVES * HIVE_GPUS || nnets != NUM_HIVES * HIVE_GPUS)
return scclSuccess;
int g_hives[ngpus], n_hives[nnets];
int ng_hives[NUM_HIVES];
// try to sort GPUs into hives
for(int i = 0; i < NUM_HIVES; i++)
ng_hives[i] = 0;
for(int i = 0; i < nnets; i++)
n_hives[i] = -1;
for(int i = 0; i < ngpus; i++)
g_hives[i] = -1;
for(int i = 0; i < ngpus; i++) {
int j, h;
for(j = 0; j < NUM_HIVES; j++) {
if(ng_hives[j]) {
if(romeTopo.connMatrix[i * ngpus + g_hives[j * HIVE_GPUS]]) {
g_hives[j * HIVE_GPUS + ng_hives[j]] = i;
ng_hives[j]++;
break;
}
}
}
if(j >= NUM_HIVES) {
for(h = 0; h < NUM_HIVES; h++) {
if(ng_hives[h] == 0) {
g_hives[h * HIVE_GPUS] = i;
ng_hives[h]++;
break;
}
}
if(h >= NUM_HIVES)
return scclSuccess;
}
}
for(int i = 0; i < NUM_HIVES; i++)
if(ng_hives[i] != 4)
return scclSuccess;
// remap NET ids
for(int i = 0; i < nnets; i++) {
int j;
for(j = 0; j < ngpus; j++) {
if(romeTopo.gdrLevel[i * nnets + g_hives[j]] == 3) {
n_hives[j] = i;
break;
}
}
if(j >= ngpus)
return scclSuccess;
}
// validation
for(int i = 0; i < nnets; i++)
if(n_hives[i] == -1)
return scclSuccess;
for(int i = 0; i < ngpus; i++)
if(g_hives[i] == -1)
return scclSuccess;
char line[1024];
sprintf(line, "Found matching Rome model 4P4H with GPU mapping: ");
int offset = strlen(line);
for(int k = 0; k < ngpus; k++) {
sprintf(line + offset, "%d ", g_hives[k]);
offset = strlen(line);
}
if(nnets > 1) {
sprintf(line + offset, "NET mapping: ");
offset = strlen(line);
for(int k = 0; k < nnets; k++) {
sprintf(line + offset, "%d ", n_hives[k]);
offset = strlen(line);
}
}
INFO(SCCL_GRAPH, "%s", line);
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME)
system->type |= RCCL_TOPO_4P2H_ROME;
parseOptions(system, rome_model_68.options);
// create 4P4H based on reference and remapped ids
SCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef SCCL_ROME_MODELS_H_
#define SCCL_ROME_MODELS_H_
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map);
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map);
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
/**
* MIT License
*
* Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief sccl_bfloat16.h provides struct for sccl_bfloat16 typedef
*/
#ifndef _SCCL_BFLOAT16_H_
#define _SCCL_BFLOAT16_H_
#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of sccl_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number. */
namespace sccl {
typedef struct {
uint16_t data;
} sccl_bfloat16;
} // namespace sccl
#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <ostream>
#include <type_traits>
namespace sccl {
struct sccl_bfloat16 {
uint16_t data;
enum truncate_t {
truncate
};
__host__ __device__ sccl_bfloat16() = default;
// round upper 16 bits of IEEE float to convert to bfloat16
explicit __host__ __device__ sccl_bfloat16(float f) : data(float_to_bfloat16(f)) {}
explicit __host__ __device__ sccl_bfloat16(float f, truncate_t) : data(truncate_float_to_bfloat16(f)) {}
// zero extend lower 16 bits of bfloat16 to convert to IEEE float
__host__ __device__ operator float() const {
union {
uint32_t int32;
float fp32;
} u = {uint32_t(data) << 16};
return u.fp32;
}
private:
static __host__ __device__ uint16_t float_to_bfloat16(float f) {
union {
float fp32;
uint32_t int32;
} u = {f};
if(~u.int32 & 0x7f800000) {
// When the exponent bits are not all 1s, then the value is zero, normal,
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
// least significant bits of the float mantissa are greater than 0x8000,
// or if they are equal to 0x8000 and the least significant bit of the
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
// has the value 0x7f, then incrementing it causes it to become 0x00 and
// the exponent is incremented by one, which is the next higher FP value
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
// incrementing it causes it to become an exponent of 0xFF and a mantissa
// of 0x00, which is Inf, the next higher value to the unrounded value.
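// Worked example (illustrative): if the upper 16 bits are 0x3F81 (odd
// mantissa LSB) and the lower 16 bits are exactly 0x8000, the tie rounds
// up to the even neighbor 0x3F82; with upper bits 0x3F80 (even LSB) the
// same 0x8000 tie leaves the result at 0x3F80.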
u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
} else if(u.int32 & 0xffff) {
// When all of the exponent bits are 1, the value is Inf or NaN.
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
// bit being 1. Signaling NaN is indicated by the most significant
// mantissa bit being 0 but some other bit(s) being 1. If any of the
// lower 16 bits of the mantissa are 1, we set the least significant bit
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
// the bfloat16's mantissa bits are all 0.
u.int32 |= 0x10000; // Preserve signaling NaN
}
return uint16_t(u.int32 >> 16);
}
// Truncate instead of rounding, preserving SNaN
static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f) {
union {
float fp32;
uint32_t int32;
} u = {f};
return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
}
};
typedef struct {
uint16_t data;
} sccl_bfloat16_public;
static_assert(std::is_standard_layout<sccl_bfloat16>{},
"sccl_bfloat16 is not a standard layout type, and thus is "
"incompatible with C.");
static_assert(std::is_trivial<sccl_bfloat16>{},
"sccl_bfloat16 is not a trivial type, and thus is "
"incompatible with C.");
static_assert(sizeof(sccl_bfloat16) == sizeof(sccl_bfloat16_public) && offsetof(sccl_bfloat16, data) == offsetof(sccl_bfloat16_public, data),
"internal sccl_bfloat16 does not match public sccl_bfloat16");
inline std::ostream& operator<<(std::ostream& os, const sccl_bfloat16& bf16) { return os << float(bf16); }
inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a) { return a; }
inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a) {
a.data ^= 0x8000;
return a;
}
inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) + float(b)); }
inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) - float(b)); }
inline __host__ __device__ sccl_bfloat16 operator*(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) * float(b)); }
inline __host__ __device__ sccl_bfloat16 operator/(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) / float(b)); }
inline __host__ __device__ bool operator<(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) < float(b); }
inline __host__ __device__ bool operator==(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) == float(b); }
inline __host__ __device__ bool operator>(sccl_bfloat16 a, sccl_bfloat16 b) { return b < a; }
inline __host__ __device__ bool operator<=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a > b); }
inline __host__ __device__ bool operator!=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a == b); }
inline __host__ __device__ bool operator>=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a < b); }
inline __host__ __device__ sccl_bfloat16& operator+=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a + b; }
inline __host__ __device__ sccl_bfloat16& operator-=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a - b; }
inline __host__ __device__ sccl_bfloat16& operator*=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a * b; }
inline __host__ __device__ sccl_bfloat16& operator/=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a / b; }
inline __host__ __device__ sccl_bfloat16& operator++(sccl_bfloat16& a) { return a += sccl_bfloat16(1.0f); }
inline __host__ __device__ sccl_bfloat16& operator--(sccl_bfloat16& a) { return a -= sccl_bfloat16(1.0f); }
inline __host__ __device__ sccl_bfloat16 operator++(sccl_bfloat16& a, int) {
sccl_bfloat16 orig = a;
++a;
return orig;
}
inline __host__ __device__ sccl_bfloat16 operator--(sccl_bfloat16& a, int) {
sccl_bfloat16 orig = a;
--a;
return orig;
}
namespace std {
constexpr __host__ __device__ bool isinf(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && !(a.data & 0x7f); }
constexpr __host__ __device__ bool isnan(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && +(a.data & 0x7f); }
constexpr __host__ __device__ bool iszero(sccl_bfloat16 a) { return !(a.data & 0x7fff); }
inline sccl_bfloat16 sin(sccl_bfloat16 a) { return sccl_bfloat16(sinf(float(a))); }
inline sccl_bfloat16 cos(sccl_bfloat16 a) { return sccl_bfloat16(cosf(float(a))); }
} // namespace std
} // namespace sccl
#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#endif // _SCCL_BFLOAT16_H_
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include "rome_models.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
SCCL_PARAM(CrossNic, "CROSS_NIC", 2);
// Initialize system->maxBw. This is the per-channel (i.e. per-SM)
// max bw.
static float getMaxBw(struct scclTopoSystem* system, struct scclTopoNode* gpu, int type) {
float maxBw = 0.0;
for(int i = 0; i < system->nodes[type].count; i++) {
struct scclTopoLinkList* path = gpu->paths[type] + i;
float bw = path->bw;
if(path->count == 0)
continue;
maxBw = std::max(maxBw, bw);
}
return maxBw;
}
static float getTotalBw(struct scclTopoSystem* system, struct scclTopoNode* gpu) {
float nvlinkBw = 0.0, pciBw = 0.0;
for(int l = 0; l < gpu->nlinks; l++) {
struct scclTopoLink* link = gpu->links + l;
if(link->type == LINK_NVL)
nvlinkBw += link->bw;
if(link->type == LINK_PCI)
pciBw = link->bw;
}
return std::max(pciBw, nvlinkBw);
}
scclResult_t scclTopoSearchInit(struct scclTopoSystem* system) {
system->maxBw = 0.0;
system->totalBw = 0.0;
int inter = system->nodes[NET].count;
if(inter == 0 && system->nodes[GPU].count == 1) {
system->maxBw = LOC_BW;
return scclSuccess;
}
for(int g = 0; g < system->nodes[GPU].count; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU));
system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu));
}
return scclSuccess;
}
static scclResult_t findRevLink(struct scclTopoNode* node1, struct scclTopoNode* node2, struct scclTopoLink** revLink) {
for(int l = 0; l < node2->nlinks; l++) {
struct scclTopoLink* link = node2->links + l;
if(link->remNode == node1) {
*revLink = link;
return scclSuccess;
}
}
WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
return scclInternalError;
}
// This is unfortunately needed since manipulating floats often results in rounding errors.
#define SUB_ROUND(a, b) (a = roundf((a - b) * 1000) / 1000)
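// (e.g. SUB_ROUND(a, 2.4f) with a == 12.0f leaves a == 9.6f exactly rather
// than drifting toward 9.599999..., since the difference is rounded to three
// decimal places before being stored back.)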
static scclResult_t followPath(struct scclTopoLinkList* path, struct scclTopoNode* start, int maxSteps, float bw, int* steps) {
float pciBw = bw;
for(int step = 0; step < path->count; step++) {
struct scclTopoNode* node = path->list[step]->remNode;
if(node->type == CPU) {
// Account for P2P inefficiency through Intel CPU RC
if(path->type == PATH_PHB && start->type == GPU && node->cpu.arch == SCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
pciBw = INTEL_P2P_OVERHEAD(bw);
}
}
}
struct scclTopoNode* node = start;
for(int step = 0; step < maxSteps; step++) {
struct scclTopoLink* link = path->list[step];
struct scclTopoLink* revLink = NULL;
float fwBw = link->type == LINK_PCI ? pciBw : bw;
float revBw = 0;
if(link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
if(revLink == NULL)
SCCLCHECK(findRevLink(node, link->remNode, &revLink));
revBw += fwBw / 8;
}
if(link->remNode->type == CPU && link->type == LINK_NVL) {
if(revLink == NULL)
SCCLCHECK(findRevLink(node, link->remNode, &revLink));
revBw += fwBw;
}
if(link->bw < fwBw || (revBw && revLink->bw < revBw)) {
*steps = step;
return scclSuccess;
}
SUB_ROUND(link->bw, fwBw);
if(revBw)
SUB_ROUND(revLink->bw, revBw);
node = link->remNode;
}
*steps = maxSteps;
return scclSuccess;
}
// Try to go from node type1/index1 to node type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing it (-1).
static scclResult_t scclTopoFollowPath(
struct scclTopoSystem* system, struct scclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct scclTopoNode** node) {
// First handle easy cases
*node = system->nodes[type2].nodes + index2;
if(type1 == -1)
return scclSuccess;
struct scclTopoNode* node1 = system->nodes[type1].nodes + index1;
struct scclTopoLinkList* path = node1->paths[type2] + index2;
struct scclTopoNode* node2 = system->nodes[type2].nodes + index2;
struct scclTopoLinkList* revPath = node2->paths[type1] + index1;
if(path == NULL) {
WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
return scclInternalError;
}
if(path->count == 0)
return scclSuccess;
// Now check link type
*node = NULL;
int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
float bw = intra ? graph->bwIntra : graph->bwInter;
int type = intra ? graph->typeIntra : graph->typeInter;
if(mult == 1 && (path->type > type))
return scclSuccess;
if(mult == 1 &&
(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == SCCL_TOPO_PATTERN_TREE || graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) &&
(revPath->type > type))
return scclSuccess;
bw *= mult;
// Check there is enough bandwidth on paths.
int step = 0;
SCCLCHECK(followPath(path, node1, path->count, bw, &step));
if(step < path->count)
goto rewind;
// Enough bandwidth : return destination node.
graph->nHops += mult * path->count;
*node = system->nodes[type2].nodes + index2;
return scclSuccess;
rewind:
// Not enough bandwidth : rewind and exit.
SCCLCHECK(followPath(path, node1, step, -bw, &step));
return scclSuccess;
}
static int gpuPciBw(struct scclTopoNode* gpu) {
for(int l = 0; l < gpu->nlinks; l++) {
struct scclTopoLink* gpuLink = gpu->links + l;
if(gpuLink->type != LINK_PCI)
continue;
struct scclTopoNode* pci = gpuLink->remNode;
for(int l = 0; l < pci->nlinks; l++) {
struct scclTopoLink* pciLink = pci->links + l;
if(pciLink->remNode != gpu)
continue;
return std::min(gpuLink->bw, pciLink->bw);
}
}
return -1;
}
/* Choose the order in which we try next GPUs. This is critical for the search
to quickly converge to the best solution even if it eventually times out. */
struct scclGpuScore {
int g; // Retain the index
int startIndex; // Least important
int intraNhops;
int intraBw;
int interNhops;
int interPciBw;
int interBw; // Most important
};
static int cmpScore(const void* g1, const void* g2) {
struct scclGpuScore* s1 = (struct scclGpuScore*)g1;
struct scclGpuScore* s2 = (struct scclGpuScore*)g2;
int d;
if((d = (s2->interBw - s1->interBw)))
return d;
if((d = (s2->interPciBw - s1->interPciBw)))
return d;
if((d = (s1->interNhops - s2->interNhops)))
return d;
if((d = (s2->startIndex - s1->startIndex)))
return d;
if((d = (s2->intraBw - s1->intraBw)))
return d;
if((d = (s1->intraNhops - s2->intraNhops)))
return d;
return s1->startIndex - s2->startIndex;
}
static int cmpIntraScores(struct scclGpuScore* scores, int count) {
int intraBw = scores[0].intraBw;
int intraNhops = scores[0].intraNhops;
for(int i = 1; i < count; i++) {
if(scores[i].intraBw != intraBw || scores[i].intraNhops != intraNhops)
return 1;
}
return 0;
}
static scclResult_t getGpuIndex(struct scclTopoSystem* system, int rank, int* index) {
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(system->nodes[GPU].nodes[g].gpu.rank == rank) {
*index = g;
return scclSuccess;
}
}
WARN("Could not find gpu rank %d", rank);
return scclInternalError;
}
static scclResult_t getNetIndex(struct scclTopoSystem* system, int64_t id, int* index) {
for(int n = 0; n < system->nodes[NET].count; n++) {
if(system->nodes[NET].nodes[n].id == id) {
*index = n;
return scclSuccess;
}
}
WARN("Could not find net id %lx", id);
return scclInternalError;
}
static scclResult_t getNetPaths(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoLinkList** netPaths) {
int netId = graph->inter[graph->nChannels * 2];
int n;
SCCLCHECK(getNetIndex(system, netId, &n));
*netPaths = system->nodes[NET].nodes[n].paths[GPU];
return scclSuccess;
}
scclResult_t
scclTopoSearchNextGpuSort(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
const uint64_t flag = 1ULL << (graph->nChannels);
int ngpus = system->nodes[GPU].count;
struct scclTopoLinkList* paths = gpu->paths[GPU];
struct scclTopoLinkList* netPaths = NULL;
if(sortNet)
SCCLCHECK(getNetPaths(system, graph, &netPaths));
struct scclGpuScore scores[SCCL_TOPO_MAX_NODES];
memset(scores, 0, ngpus * sizeof(struct scclGpuScore));
int start = gpu - system->nodes[GPU].nodes;
int count = 0;
for(int i = 1; i < ngpus; i++) {
int g = (start + i) % ngpus;
if(paths[g].count == 0)
continue; // There is no path to that GPU
if(system->nodes[GPU].nodes[g].used & flag)
continue;
scores[count].g = g;
scores[count].startIndex = i;
scores[count].intraNhops = paths[g].count;
scores[count].intraBw = paths[g].bw;
if(netPaths) {
scores[count].interNhops = netPaths[g].count;
scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes + g);
scores[count].interBw = netPaths[g].bw;
}
count++;
}
// Sort GPUs
qsort(scores, count, sizeof(struct scclGpuScore), cmpScore);
// Check if all have the same intra-node score in which case we go reverse for sortNet = -1
if(sortNet == -1 && cmpIntraScores(scores, count) == 0) {
for(int i = 0; i < count; i++)
next[i] = scores[count - 1 - i].g;
} else {
for(int i = 0; i < count; i++)
next[i] = scores[i].g;
}
*countPtr = count;
return scclSuccess;
}
scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time);
// Try to keep all searches within one second
#define SCCL_SEARCH_GLOBAL_TIMEOUT (5ULL << 16)
#define SCCL_SEARCH_TIMEOUT (1 << 14)
#define SCCL_SEARCH_TIMEOUT_TREE (1 << 14)
#define SCCL_SEARCH_TIMEOUT_SAMECHANNELS (1 << 8)
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
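// Note on the budget (see scclTopoSearchRecGpu below): *time is decremented
// once per recursive step and the search stops expanding once it reaches 0;
// a value of -1 marks a solution considered optimal and unwinds the rest of
// the search.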
scclResult_t scclTopoReplayGetGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, int step, int* g) {
*g = -1;
if(graph->nChannels == 0)
return scclInternalError;
int ngpus = system->nodes[GPU].count;
int nextRank = graph->intra[(graph->nChannels - 1) * ngpus + step + 1];
for(int i = 0; i < ngpus; i++)
if(system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
*g = i;
return scclSuccess;
}
if(*g == -1)
return scclInternalError;
return scclSuccess;
}
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system,
struct scclTopoGraph* graph,
struct scclTopoGraph* saveGraph,
struct scclTopoNode* gpu,
int step,
int backToNet,
int backToFirstRank,
int forcedOrder,
int* time);
scclResult_t scclTopoSearchTryGpu(struct scclTopoSystem* system,
struct scclTopoGraph* graph,
struct scclTopoGraph* saveGraph,
int step,
int backToNet,
int backToFirstRank,
int forcedOrder,
int* time,
int type,
int index,
int g) {
const uint64_t flag = 1ULL << (graph->nChannels);
struct scclTopoNode* gpu;
SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
if(gpu) {
gpu->used ^= flag;
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time));
gpu->used ^= flag;
SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu));
}
return scclSuccess;
}
static int scclTopoCountXGMI(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int count = 0;
for(int c = 0; c < graph->nChannels; c++) {
for(int i = 0; i < ngpus; i++) {
int g = graph->intra[ngpus * c + i];
int n = graph->intra[ngpus * c + ((i + 1) % ngpus)];
struct scclTopoNode* node;
int j;
for(j = 0; j < ngpus; j++)
if(system->nodes[GPU].nodes[j].gpu.rank == g)
break;
if(j < ngpus) {
node = system->nodes[GPU].nodes + j;
for(int k = 0; k < system->nodes[GPU].count; k++) {
if(node->paths[GPU][k].count == 1) {
struct scclTopoLink* link = node->paths[GPU][k].list[0];
struct scclTopoNode* remNode = link->remNode;
if(remNode->gpu.rank == n) {
if(link->type == LINK_NVL)
count++;
}
}
}
}
}
}
return count;
}
scclResult_t scclTopoSearchTryNvls(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int g, int ngpus, int* time) {
struct scclTopoNode* nvs;
struct scclTopoNode* gpu;
int d0 = 0; // See if there is enough bandwidth for NVS->GPU traffic
do {
SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu));
d0++;
} while(gpu && d0 < system->nodes[GPU].count);
if(gpu == NULL) {
d0--;
} else {
int d1 = 0; // See if there is enough bandwidth for GPU->NVS traffic
do {
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs));
d1++;
} while(nvs && d1 < system->nodes[GPU].count);
if(nvs == NULL) {
d1--;
} else { // Both directions worked. Move on to the next path.
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
}
while(d1) {
d1--;
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs));
}
}
while(d0) {
d0--;
SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu));
}
return scclSuccess;
}
scclResult_t scclTopoCompareGraphs(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* refGraph, int* copy) {
// 1. Try to get the same nChannels between Rings and Trees
if(graph->nChannels < graph->minChannels)
return scclSuccess;
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
if(graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count)
*copy = 1;
return scclSuccess;
}
// 2. Try to get better bandwidth
// Give a 15% perf bonus to paths not crossing nics
float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
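// e.g. refGraph->crossNic = 1 vs graph->crossNic = 0 gives target = 0.85, so a
// graph that avoids crossing NICs wins once it reaches 85% of the reference
// aggregate bandwidth (nChannels * bwIntra).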
if(graph->nChannels * graph->bwIntra > refGraph->nChannels * refGraph->bwIntra * target) {
*copy = 1;
return scclSuccess;
}
if(graph->nChannels * graph->bwIntra < refGraph->nChannels * refGraph->bwIntra * target)
return scclSuccess;
// 3. Less hops
if(graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops)
*copy = 1;
// 4. Prefer graph with more XGMI connections
if(graph->nChannels == refGraph->nChannels && scclTopoCountXGMI(system, refGraph) < scclTopoCountXGMI(system, graph))
*copy = 1;
return scclSuccess;
}
// Build a list of the best NETs to try.
//
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
// index when trying to get back to the NIC.
//
// The list is built the following way:
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
// might have been chosen by GPU 0 (case with multiple independent communicators per node)
// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
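// Example of step 2 (illustrative): with three equidistant NICs
// localNets = {0, 1, 2} and gpu.dev = 4, the rotation below runs
// 4 % 3 = 1 time and yields {1, 2, 0}, so this GPU tries NIC 1 first while a
// GPU with dev = 0 keeps NIC 0 as its first choice.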
scclResult_t scclTopoSelectNets(struct scclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
int netCount = 0;
int localNetCount;
int* localNets;
SCCLCHECK(scclCalloc(&localNets, system->nodes[NET].count));
for(int t = 0; t <= typeInter; t++) {
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(gpu != -1 && gpu != g)
continue;
localNetCount = 0;
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
struct scclTopoLinkList* paths = gpu->paths[NET];
for(int n = 0; n < system->nodes[NET].count; n++) {
if(paths[n].type == t)
localNets[localNetCount++] = n;
}
if(localNetCount == 0)
continue;
// Shuffle by gpu NVML device number so that GPUs on the same PCI switch
// with multiple NICs don't use the same one as first choice.
for(int r = 0; r < system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
int net0 = localNets[0];
for(int i = 0; i < localNetCount - 1; i++)
localNets[i] = localNets[i + 1];
localNets[localNetCount - 1] = net0;
}
// Append NICs to list
for(int i = 0; i < localNetCount; i++) {
int n = localNets[i];
int found = 0;
while(found < netCount && nets[found] != n)
found++;
if(found == netCount)
nets[netCount++] = n;
}
}
}
*netCountRet = netCount;
free(localNets);
return scclSuccess;
}
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system,
struct scclTopoGraph* graph,
struct scclTopoGraph* saveGraph,
struct scclTopoNode* gpu,
int step,
int backToNet,
int backToFirstRank,
int forcedOrder,
int* time) {
if((*time) <= 0)
return scclSuccess;
(*time)--;
int ngpus = system->nodes[GPU].count;
if(step == ngpus) {
// Determine whether we found a better solution or not
int copy = 0;
graph->nChannels++;
SCCLCHECK(scclTopoCompareGraphs(system, graph, saveGraph, &copy));
if(copy) {
memcpy(saveGraph, graph, sizeof(struct scclTopoGraph));
if(graph->nChannels == graph->maxChannels)
*time = -1;
}
if(graph->nChannels < graph->maxChannels) {
SCCLCHECK(scclTopoSearchRec(system, graph, saveGraph, time));
}
graph->nChannels--;
return scclSuccess;
}
graph->intra[graph->nChannels * ngpus + step] = gpu->gpu.rank;
int g = gpu - system->nodes[GPU].nodes;
if(step == backToNet) {
// first get back to NIC
if(system->nodes[NET].count) {
int startNetIndex;
SCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels * 2], &startNetIndex));
struct scclTopoNode* startNet = system->nodes[NET].nodes + startNetIndex;
int netcount;
int* nets;
SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
for(int i = 0; i < netcount; i++) {
int n = nets[i];
struct scclTopoNode* net = system->nodes[NET].nodes + n;
if(graph->pattern == SCCL_TOPO_PATTERN_TREE && net->id != startNet->id)
continue; // Trees are symmetric
if(graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port))
continue;
// Balanced Tree : count half of the bandwidth on first two GPUs
int nextBackToNet = -1;
float bwInterSave = graph->bwInter;
if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE) {
// Count half of the bandwidth on each of the first two GPUs
if(step == 0)
nextBackToNet = 1;
else if(net->id != graph->inter[graph->nChannels * 2 + 1])
continue;
graph->bwInter /= 2;
}
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
graph->bwInter = bwInterSave;
if(net) {
graph->inter[graph->nChannels * 2 + 1] = net->id;
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE)
graph->bwInter /= 2;
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
graph->bwInter = bwInterSave;
}
}
free(nets);
}
} else if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
SCCLCHECK(scclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
} else if(step < system->nodes[GPU].count - 1) {
// Go to next GPU
int next[SCCL_TOPO_MAX_NODES];
int count;
if(forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
next[0] = step + 1;
count = 1;
} else if(forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
SCCLCHECK(scclTopoReplayGetGpu(system, graph, step, next));
count = 1;
} else { // Normal search
SCCLCHECK(scclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step + 1 ? 1 : -1));
}
for(int i = 0; i < count; i++) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, step + 1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
}
} else if(step == backToFirstRank) {
// Find first GPU and loop back to it
int p;
SCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels * ngpus], &p));
struct scclTopoNode* firstGpu;
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
if(firstGpu) {
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step + 1, backToNet, -1, forcedOrder, time));
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
}
} else {
// Next path
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
}
return scclSuccess;
}
scclResult_t scclTopoSearchRecNet(
struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int bw = graph->bwInter;
int* nets;
SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
int netcount;
SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
for(int i = 0; i < netcount; i++) {
int n = nets[i];
struct scclTopoNode* net = system->nodes[NET].nodes + n;
struct scclTopoNode* gpu;
if(graph->collNet && net->net.collSupport == 0)
continue;
if(net->net.bw < bw)
continue;
graph->inter[graph->nChannels * 2] = net->id;
graph->latencyInter = net->net.latency;
for(int i = 0; i < system->nodes[NET].count; i++) {
if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) {
system->nodes[NET].nodes[i].net.bw -= bw;
}
}
// NVLS needs to balance on all NICs
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
} else {
if(graph->nChannels > 0) {
// Try to replay the last channel
int g;
SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
}
if(graph->nChannels == 0 || graph->sameChannels == 0) {
if(graph->nChannels == 0) {
// Always try the PCI order first to set a reference, but don't count it against the timeout, nor let it run for long
struct scclTopoLinkList* paths = net->paths[GPU];
int f = 0, f_gdr = 0;
// find the GPU closest to the NIC
for(int i = 0; i < system->nodes[GPU].count; i++) {
if(paths[i].count <= paths[f].count) {
// prefer GPU direct RDMA
int gdr;
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
if(paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
f = i;
f_gdr = gdr;
}
}
}
int t = 1 << 10;
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
if(t == -1)
*time = -1;
}
// Then try the most local GPUs
float maxBw = 0;
int minHops = 0xfffffff;
struct scclTopoLinkList* paths = net->paths[GPU];
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(paths[g].bw > maxBw) {
maxBw = paths[g].bw;
minHops = paths[g].count;
} else if(paths[g].bw == maxBw && paths[g].count < minHops) {
minHops = paths[g].count;
}
}
if(maxBw >= bw) {
// In the first loop, avoid using GPUs in both directions between channels (one channel
// sending from that GPU and one channel receiving to that GPU), since that usually leads
// to lower BW.
for(int tryGpuBidir = 0; tryGpuBidir < 2; tryGpuBidir++) {
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(paths[g].bw == maxBw && paths[g].count == minHops) {
gpu = system->nodes[GPU].nodes + g;
int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
if(tryGpuBidir == gpuUsed) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
}
}
}
}
}
}
}
for(int i = 0; i < system->nodes[NET].count; i++) {
if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) {
system->nodes[NET].nodes[i].net.bw += bw;
}
}
}
free(nets);
return scclSuccess;
}
/* Search Patterns
*
* Intra-node
* Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a
* (=Split Tree Loop)
* Tree : GPU a -> GPU b -> .. -> GPU x
* (=Split Tree)
*
* Inter-node
* Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
* Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
* `--> NET n (or m if crossNic)
*/
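// For example, a multi-node ring over 8 local GPUs gets backToNet = 7 and
// backToFirstRank = -1 (the channel ends on a NIC), while the same ring on a
// single node gets backToNet = -1 and backToFirstRank = 7 (loop back to the
// first GPU).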
scclResult_t scclTopoSearchParams(struct scclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
if(pattern == SCCL_TOPO_PATTERN_RING)
*backToNet = system->nodes[GPU].count - 1;
else if(pattern == SCCL_TOPO_PATTERN_SPLIT_TREE)
*backToNet = 1;
else
*backToNet = 0;
*backToFirstRank = -1;
} else {
*backToNet = -1;
if(pattern == SCCL_TOPO_PATTERN_RING)
*backToFirstRank = system->nodes[GPU].count - 1;
else
*backToFirstRank = -1;
}
return scclSuccess;
}
scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time) {
int backToNet, backToFirstRank;
SCCLCHECK(scclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
// Start from NET
scclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
// Intra-node only.
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels));
return scclSuccess;
} else if(graph->nChannels == 0) {
// Try PCI order first
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
} else {
// Also try to replay previous channel
int g;
SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
}
if(graph->sameChannels == 0 || graph->nChannels == 0) {
// Finally, try all other possibilities unless we are forced to use the same channels
for(int g = 0; g < system->nodes[GPU].count; g++) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
}
}
}
return scclSuccess;
}
/************************************/
/* User defined graph from XML file */
/************************************/
struct kvDict kvDictLinkType[] = {{"LOC", PATH_LOC},
{"NVL", PATH_NVL},
{"NVB", PATH_NVB},
{"PIX", PATH_PIX},
{"PXB", PATH_PXB},
{"PXN", PATH_PXN},
{"PHB", PATH_PHB},
{"SYS", PATH_SYS},
{NULL, 0}};
scclResult_t scclTopoGetChannelFromXml(struct scclXmlNode* xmlChannel, int c, struct scclTopoSystem* system, struct scclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter + 2 * c;
int* intra = graph->intra + ngpus * c;
int n = 0, g = 0;
for(int s = 0; s < xmlChannel->nSubs; s++) {
struct scclXmlNode* sub = xmlChannel->subs[s];
int dev;
SCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
if(strcmp(sub->name, "net") == 0) {
inter[n++] = dev;
} else if(strcmp(sub->name, "gpu") == 0) {
int rank = -1;
for(int g = 0; g < ngpus; g++) {
if(system->nodes[GPU].nodes[g].gpu.dev == dev)
rank = system->nodes[GPU].nodes[g].gpu.rank;
}
if(rank == -1) {
WARN("XML Import Channel : dev %d not found.", dev);
return scclSystemError;
}
intra[g++] = rank;
}
}
return scclSuccess;
}
scclResult_t scclTopoGetGraphFromXmlSub(struct scclXmlNode* xmlGraph, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
int id;
SCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
if(graph->id != id)
return scclSuccess;
int crossNic;
SCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
if(scclParamCrossNic() == 0 && crossNic == 1)
return scclSuccess;
graph->crossNic = crossNic;
SCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
SCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
if(xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != scclSuccess)
graph->latencyInter = 0.0;
const char* str;
SCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
SCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
SCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
SCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType));
SCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels));
for(int s = 0; s < xmlGraph->nSubs; s++) {
SCCLCHECK(scclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
}
*nChannels = xmlGraph->nSubs;
return scclSuccess;
}
scclResult_t scclTopoGetGraphFromXml(struct scclXmlNode* xmlGraphs, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
for(int s = 0; s < xmlGraphs->nSubs; s++) {
SCCLCHECK(scclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels));
}
return scclSuccess;
}
/* And the reverse : graph->xml */
scclResult_t scclTopoGetXmlFromChannel(struct scclTopoGraph* graph, int c, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
struct scclXmlNode* xmlChannel;
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter + 2 * c;
int* intra = graph->intra + ngpus * c;
SCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
struct scclXmlNode* node;
if(system->nodes[NET].count) {
SCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
SCCLCHECK(xmlSetAttrInt(node, "dev", inter[0]));
}
for(int g = 0; g < ngpus; g++) {
SCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
int dev = -1;
for(int i = 0; i < ngpus; i++) {
if(system->nodes[GPU].nodes[i].gpu.rank == intra[g])
dev = system->nodes[GPU].nodes[i].gpu.dev;
}
if(dev == -1) {
WARN("XML Export Channel : rank %d not found.", intra[g]);
return scclInternalError;
}
SCCLCHECK(xmlSetAttrInt(node, "dev", dev));
}
if(system->nodes[NET].count) {
SCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
SCCLCHECK(xmlSetAttrInt(node, "dev", inter[1]));
}
return scclSuccess;
}
scclResult_t scclTopoGetXmlFromGraph(struct scclTopoGraph* graph, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
struct scclXmlNode* xmlGraph;
SCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra));
SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter));
SCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter));
const char* str;
SCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
SCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
SCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType));
SCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels));
for(int c = 0; c < graph->nChannels; c++) {
SCCLCHECK(scclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph));
}
return scclSuccess;
}
scclResult_t scclTopoGetXmlFromGraphs(int ngraphs, struct scclTopoGraph** graphs, struct scclTopoSystem* system, struct scclXml* xml) {
xml->maxIndex = 0;
struct scclXmlNode* xmlGraphs;
SCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs));
SCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", SCCL_GRAPH_XML_VERSION));
for(int g = 0; g < ngraphs; g++) {
SCCLCHECK(scclTopoGetXmlFromGraph(graphs[g], system, xml, xmlGraphs));
}
return scclSuccess;
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
float speedArrayIntra[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
float speedArrayInter[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))
#else
float speedArrayIntra[] = {40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0};
float speedArrayInter[] = {48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))
float sm90SpeedArrayIntra[] = {60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0};
float sm90SpeedArrayInter[] = {48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra) / sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter) / sizeof(float))
#endif
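// scclTopoCompute below starts from the largest entry of these arrays that
// fits system->maxBw, then walks down the array until a graph is found
// (pass 1); once a solution exists, pass 2 tries to raise bwIntra again for
// non-ring patterns.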
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
RCCL_PARAM(NChannels, "NCHANNELS", 0);
scclResult_t scclTopoCompute(scclTopoSystem* system, struct scclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
graph->crossNic = scclParamCrossNic();
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
(graph->pattern == SCCL_TOPO_PATTERN_RING || graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE)
? 1
: 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
if(graph->crossNic == 2)
graph->crossNic = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
graph->nChannels = 0;
graph->nIntraChannels = 0;
memset(graph->intraNets, 0, MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2 * sizeof(int));
int trySameChannels = graph->pattern == SCCL_TOPO_PATTERN_NVLS ? 0 : 1;
graph->sameChannels = trySameChannels;
char* str = getenv("SCCL_GRAPH_FILE");
if(str) {
INFO(SCCL_ENV, "SCCL_GRAPH_FILE set by environment to %s", str);
struct scclXml* xml;
SCCLCHECK(scclCalloc(&xml, 1));
SCCLCHECK(scclTopoGetXmlGraphFromFile(str, xml));
int nChannels;
SCCLCHECK(scclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
INFO(SCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
free(xml);
if(graph->nChannels > 0)
return scclSuccess;
}
str = getenv("SCCL_RINGS");
char* strTrees = getenv("RCCL_TREES");
if(str || strTrees) {
// user supplied topo
if(strTrees) {
SCCLCHECK(parseGraphLight(strTrees, system, graph, NULL));
system->treeDefined = true;
} else {
SCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(graph->nChannels && arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
system->type |= RCCL_TOPO_4P2H_ROME;
}
}
} else if(!rcclParamModelMatchingDisable() && !graph->collNet) {
// try to match 8P6L
SCCLCHECK(parseChordalRing(system, graph));
if(graph->nChannels)
return scclSuccess;
// try to match Rome 4P2H
SCCLCHECK(parseRome4P2H(system, graph));
if(graph->nChannels)
return scclSuccess;
// try to match 1H16P
SCCLCHECK(parse1H16P(system, graph));
if(graph->nChannels)
return scclSuccess;
// try to match 4H4P
SCCLCHECK(parse4H4P(system, graph));
}
if(graph->nChannels)
return scclSuccess;
if((graph->pattern == SCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) {
// limit single node max channels when searching ring graph on Rome
graph->maxChannels = 2;
}
if(ngpus == 1)
if(graph->pattern != SCCL_TOPO_PATTERN_RING)
graph->pattern = SCCL_TOPO_PATTERN_TREE;
int ccMin;
SCCLCHECK(scclTopoGetCompCap(system, &ccMin, NULL));
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90))
return scclSuccess;
if(system->nodes[NET].count == 0 && graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
// Force intra-node NVLS algorithm to pull evenly from all GPUs.
graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
}
struct scclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct scclTopoGraph));
// First try crossnic, then decrease bw and finally increase bwIntra.
int nspeeds = 0;
float* speedArray = NULL;
if(system->nodes[NET].count == 0) {
nspeeds = NSPEEDSINTRA;
speedArray = speedArrayIntra;
} else {
nspeeds = NSPEEDSINTER;
speedArray = speedArrayInter;
}
int pass = 1;
int speedIndex = 0;
float maxBw = system->maxBw;
float totalBw = system->totalBw;
if(ngpus == 1 || graph->pattern != SCCL_TOPO_PATTERN_RING)
totalBw *= ngpus * 1.0 / (ngpus - 1);
while((speedArray[speedIndex] > maxBw || speedArray[speedIndex] * graph->minChannels > totalBw) && speedIndex < nspeeds - 1)
speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
int64_t globalTimeout = SCCL_SEARCH_GLOBAL_TIMEOUT;
search:
int time = tmpGraph.sameChannels ? SCCL_SEARCH_TIMEOUT_SAMECHANNELS
: tmpGraph.pattern == SCCL_TOPO_PATTERN_TREE ? SCCL_SEARCH_TIMEOUT_TREE
: SCCL_SEARCH_TIMEOUT;
tmpGraph.nChannels = 0;
globalTimeout -= time;
SCCLCHECK(scclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
printf("%d ", graph->intra[c*ngpus+g]);
}
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
printf("\n");
}
#endif
// Optimal solution, stop here
if(time == -1)
goto done;
if(graph->nChannels * graph->bwInter >= system->totalBw)
goto done;
if(pass == 1) {
// First pass, we don't have a solution yet ; try other options
// Try having different channels
if(tmpGraph.sameChannels == 1) {
tmpGraph.sameChannels = 0;
goto search;
}
tmpGraph.sameChannels = trySameChannels;
if(time != -1)
globalTimeout += time;
else
globalTimeout = SCCL_SEARCH_GLOBAL_TIMEOUT;
if(globalTimeout < 0 && graph->nChannels)
goto done;
tmpGraph.pattern = graph->pattern;
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
if(tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
tmpGraph.typeIntra += 1;
goto search;
}
tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
if(system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS &&
(graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
tmpGraph.typeInter += 1;
goto search;
}
tmpGraph.typeInter = PATH_PIX;
if(crossNic && tmpGraph.crossNic == 0) {
// Try again with crossNic if permitted
tmpGraph.crossNic = crossNic;
goto search;
}
tmpGraph.crossNic = 0;
// Decrease bw until we find a solution
if((speedIndex < nspeeds - 1) && (graph->nChannels == 0 || (speedArray[speedIndex + 1] / graph->bwInter > .49))) {
tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
goto search;
}
speedIndex = 0;
while(speedArray[speedIndex] > maxBw && speedIndex < nspeeds - 1)
speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
}
done:
// We have a solution. Start from that solution and move to pass 2.
if(pass == 1) {
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
speedIndex = 0;
while(speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds - 1)
speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
tmpGraph.minChannels = graph->nChannels;
pass = 2;
}
// 3. See if we can increase bwIntra for trees (2 nodes or collnet)
if(pass == 2) {
if(time != 0 && graph->pattern != SCCL_TOPO_PATTERN_RING && tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter * 2 &&
speedIndex > 0) {
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
}
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
}
if(graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
for(int i = 0; i < ngpus; i++)
graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
graph->bwIntra = graph->bwInter = 0.1;
graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
if(graph->nChannels == 0)
return scclSuccess;
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS)
return scclSuccess;
if(graph->bwIntra < 25.0)
return scclSuccess;
if(ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4)
return scclSuccess;
int dupChannels = std::min(graph->nChannels * 2, graph->maxChannels);
memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, (dupChannels - graph->nChannels) * ngpus * sizeof(int));
memcpy(graph->inter + graph->nChannels * 2, graph->inter, (dupChannels - graph->nChannels) * 2 * sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
int nc = rcclParamNChannels();
if(graph->nChannels > 0 && nc > 0 && nc <= MAXCHANNELS / 2 && nc > graph->nChannels) {
int nChannels = nc - graph->nChannels;
int nnets = system->nodes[NET].count;
if(nnets <= 2) {
for(int i = 0; i < nChannels; ++i) {
memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
memcpy(graph->inter + graph->nChannels * 2, graph->inter, 2 * sizeof(int));
memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
graph->nChannels++;
}
} else {
typedef struct {
int id;
int used;
} Net;
Net nets[nnets];
auto sortFunc = [](const void* a, const void* b) -> int { return ((Net*)a)->used - ((Net*)b)->used; };
memset(nets, 0, nnets * sizeof(Net));
for(int i = 0; i < nnets; ++i) {
nets[i].id = system->nodes[NET].nodes[i].id;
}
for(int i = 0; i < graph->nChannels; ++i) {
for(int j = 0; j < nnets; ++j) {
if(nets[j].id == *(graph->inter + i * 2) || nets[j].id == *(graph->inter + i * 2 + 1)) {
nets[j].used++;
}
}
}
for(int i = 0; i < nChannels; ++i) {
memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
qsort(nets, nnets, sizeof(Net), sortFunc);
*(graph->inter + graph->nChannels * 2) = nets[0].id;
nets[0].used++;
qsort(nets, nnets, sizeof(Net), sortFunc);
if(graph->crossNic == 0 || graph->crossNic == 2) {
*(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
nets[0].used++;
qsort(nets, nnets, sizeof(Net), sortFunc);
} else {
nets[0].used++;
qsort(nets, nnets, sizeof(Net), sortFunc);
*(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
}
nets[0].used++;
memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
graph->nChannels++;
}
}
graph->bwIntra /= DIVUP(nc, graph->nChannels);
graph->bwInter /= DIVUP(nc, graph->nChannels);
}
return scclSuccess;
}
scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
INFO(SCCL_GRAPH,
"Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d",
graph->pattern,
graph->crossNic,
graph->nChannels,
graph->bwIntra,
graph->bwInter,
topoPathTypeStr[graph->typeIntra],
topoPathTypeStr[graph->typeInter],
graph->sameChannels);
int ngpus = system->nodes[GPU].count;
char line[1024];
for(int c = 0; c < graph->nChannels; c++) {
sprintf(line, "%2d :", c);
int offset = strlen(line);
if(system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c]);
offset = strlen(line);
}
for(int i = 0; i < ngpus; i++) {
int n = graph->intraNets[(ngpus * c + i) * 2] - 'N';
if(n >= 0 && n < system->nodes[NET].count) {
sprintf(line + offset, " NET/%d", n);
offset = strlen(line);
}
sprintf(line + offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus * c + i]);
offset = strlen(line);
n = graph->intraNets[(ngpus * c + i) * 2 + 1] - 'N';
if(n >= 0 && n < system->nodes[NET].count) {
sprintf(line + offset, " NET/%d", n);
offset = strlen(line);
}
}
if(system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c + 1]);
offset = strlen(line);
}
INFO(SCCL_GRAPH, "%s", line);
}
return scclSuccess;
}
scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs) {
char* str = getenv("SCCL_GRAPH_DUMP_FILE");
if(str) {
INFO(SCCL_ENV, "SCCL_GRAPH_DUMP_FILE set by environment to %s", str);
struct scclXml* xml;
SCCLCHECK(scclCalloc(&xml, 1));
SCCLCHECK(scclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
SCCLCHECK(scclTopoDumpXmlToFile(str, xml));
free(xml);
}
return scclSuccess;
}
#include "comm.h"
// NVLS channels aren't compute channels. Find the NIC of the channel whose head is our rank
scclResult_t getNvlsNetDev(struct scclComm* comm, struct scclTopoGraph* graph, int* dev) {
int localRanks = comm->topo->nodes[GPU].count;
for(int c = 0; c < graph->nChannels; c++) {
if(graph->intra[c * localRanks] == comm->rank) {
*dev = graph->inter[c * 2];
return scclSuccess;
}
}
WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
return scclInternalError;
}
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
SCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
if(graph) {
// Honor the net device in the graph
int channel = channelId % graph->nChannels;
int ngpus = comm->topo->nodes[GPU].count;
int index = graph->intra[channel * ngpus] == rank ? 0 : 1;
if(graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
*dev = graph->inter[channel * 2 + index];
} else {
SCCLCHECK(getNvlsNetDev(comm, graph, dev));
}
SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
} else if(peerRank == -1) {
return scclInternalError;
} else {
// Start with our local NIC and local Rank
SCCLCHECK(scclTopoGetLocalNet(comm->topo, rank, channelId, dev));
*proxyRank = rank;
int pxnLevel = scclPxnDisable(comm) == 1 ? 0 : scclParamP2pPxnLevel();
// See whether we can use the remote rank preferred device.
if(scclParamCrossNic() == 0 || (pxnLevel != 0)) {
// Find local NIC number close to local cudaDev
int cudaDev = comm->peerInfo[peerRank].cudaDev;
int localRank;
if(scclTopoDevToRank(comm->topo, cudaDev, &localRank) != scclSuccess)
return scclSuccess;
int netDev;
SCCLCHECK(scclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
int n;
// Check that device exists on our node
if(scclParamCrossNic() == 0) {
if(scclTopoIdToIndex(comm->topo, NET, netDev, &n) != scclSuccess) {
WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
return scclInvalidUsage;
}
*dev = netDev;
}
if(pxnLevel == 1) {
int g, n;
SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g));
SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
struct scclTopoNode* gpu = comm->topo->nodes[GPU].nodes + g;
if(gpu->paths[NET][n].type <= PATH_PXN) {
*dev = netDev;
SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
}
} else if(pxnLevel == 2) {
// Check which local GPU corresponds to that NIC and see if we can use PXN.
int n, g1, g2;
SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g1));
SCCLCHECK(scclTopoGetLocalGpu(comm->topo, netDev, &g2));
if(g2 != -1) {
struct scclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes + g2;
if(peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
*proxyRank = peerGpu->gpu.rank;
*dev = netDev;
return scclSuccess;
}
}
}
}
}
return scclSuccess;
}
scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev) {
*dev = -1;
if(graph && graph->nIntraChannels) {
int n1 = -1;
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
int chan = channelId % graph->nIntraChannels;
for(int i = 0; i < ngpus; i++) {
if(graph->intra[ngpus * chan + i] == rank) {
n1 = graph->intraNets[(ngpus * chan + i) * 2 + type] - 'N';
break;
}
}
if(n1 >= 0 && n1 < nnets) {
*dev = n1;
}
}
return scclSuccess;
}
scclResult_t scclTopoGetLinkType(struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int* inter) {
int interGpus[MAX_XGMI_INTER_GPUS + 1];
int ngpus = system->nodes[GPU].count;
*isXGMI = false;
// check for direct XGMI connection
for(int i = 0; i < ngpus; i++) {
if(system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
struct scclTopoNode* node = system->nodes[GPU].nodes + i;
for(int k = 0; k < system->nodes[GPU].count; k++) {
if(node->paths[GPU][k].count == 1) {
struct scclTopoLink* link = node->paths[GPU][k].list[0];
struct scclTopoNode* remNode = link->remNode;
if(remNode->gpu.dev == cudaDev2) {
*isXGMI = (link->type == LINK_NVL);
if(*isXGMI)
return scclSuccess;
}
}
}
}
}
// try intermediate GPUs
if(maxInter) {
// check if there are intermediate GPUs that are connected to both
bool res1, res2, res3;
int j;
for(j = 0; j < nInter; j++) {
scclTopoGetLinkType(system, inter[j], inter[j + 1], &res1, 0);
if(!res1)
break;
}
if(j < nInter)
return scclSuccess;
if(nInter > 0 && inter != nullptr) {
scclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
if(res2) {
*isXGMI = true;
return scclSuccess;
}
memcpy(interGpus + 1, inter + 1, sizeof(int) * nInter);
}
interGpus[0] = cudaDev1;
// add one more intermediate GPU recursively until reaching max depth
nInter++;
if(nInter + 2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter)
return scclSuccess;
for(int i = 0; i < ngpus; i++) {
int dev = system->nodes[GPU].nodes[i].gpu.dev;
// skip duplicated GPU
if(dev == cudaDev2)
continue;
for(j = 0; j < nInter; j++)
if(dev == interGpus[j])
break;
if(j < nInter)
continue;
// check connectivity with intermediate GPUs
interGpus[nInter] = dev;
scclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
if(res3) {
*isXGMI = true;
return scclSuccess;
}
}
}
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "sccl.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
#define RANK_TO_INDEX(r) ((r) > root ? (r) - 1 : (r))
/* Btree which alternates leaves and nodes.
* Assumes root is 0, which conveniently builds a tree on powers of two,
* (because we have pow2-1 ranks) which lets us manipulate bits.
* Find first non-zero bit, then :
* Find the parent :
* xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
* xx11[0] -> xx10[0] (3,7,11 below)
* Find the children :
* xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
* xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
*
* Illustration :
* 0---------------8
* ______/ \______
* 4 12
* / \ / \
* 2 6 10 \
* / \ / \ / \ \
* 1 3 5 7 9 11 13
*/
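// Worked example for nranks = 14, rank = 6 (binary 110): the first set bit is
// bit = 2, so up = (6 ^ 2) | (2 << 1) = 4; lowbit = 1 then gives the children
// down0 = 6 - 1 = 5 and down1 = 6 + 1 = 7, matching the illustration above.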
scclResult_t scclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
int up, down0, down1;
int bit;
for(bit = 1; bit < nranks; bit <<= 1) {
if(bit & rank)
break;
}
if(rank == 0) {
*u = -1;
*d0 = -1;
// Child rank is > 0 so it has to be our child 1, not 0.
*d1 = nranks > 1 ? bit >> 1 : -1;
return scclSuccess;
}
up = (rank ^ bit) | (bit << 1);
// If smaller than the parent, we are its first child; otherwise we are its second
if(up >= nranks)
up = (rank ^ bit);
*parentChildType = (rank < up) ? 0 : 1;
*u = up;
int lowbit = bit >> 1;
// down0 is always within bounds
down0 = lowbit == 0 ? -1 : rank - lowbit;
down1 = lowbit == 0 ? -1 : rank + lowbit;
// Make sure down1 is within bounds
while(down1 >= nranks) {
down1 = lowbit == 0 ? -1 : rank + lowbit;
lowbit >>= 1;
}
*d0 = down0;
*d1 = down1;
return scclSuccess;
}
/* Build a double binary tree. Take the previous tree for the first tree.
* For the second tree, we use a mirror tree (if nranks is even)
*
* 0---------------8 3----------------11
* ______/ \ / \______
* 4 \ / 7
* / \ \ / / \
* 2 6 10 1 5 9
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 0 2 4 6 8 10
*
* or shift it by one rank (if nranks is odd).
*
* 0---------------8 1---------------9
* ______/ \______ ______/ \______
* 4 12 5 0
* / \ / / \ /
* 2 6 10 3 7 11
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 10 12
*/
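// Example: with nranks = 12 (even) the second tree mirrors ranks to
// nranks - 1 - rank, so rank 0 takes rank 11's position; with nranks = 13
// (odd) ranks shift by one, so rank 0 reuses rank 12's position from the
// first tree with parent and children shifted accordingly.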
scclResult_t scclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
// First tree ... use a btree
scclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
// Second tree ... mirror or shift
if(nranks % 2 == 1) {
// shift
int shiftrank = (rank - 1 + nranks) % nranks;
int u, d0, d1;
scclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : (u + 1) % nranks;
*d1_0 = d0 == -1 ? -1 : (d0 + 1) % nranks;
*d1_1 = d1 == -1 ? -1 : (d1 + 1) % nranks;
} else {
// mirror
int u, d0, d1;
scclGetBtree(nranks, nranks - 1 - rank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : nranks - 1 - u;
*d1_0 = d0 == -1 ? -1 : nranks - 1 - d0;
*d1_1 = d1 == -1 ? -1 : nranks - 1 - d1;
}
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "core.h"
#include "devcomm.h"
#include "comm.h"
#include "topo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
SCCL_PARAM(Nthreads, "NTHREADS", -2);
SCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
static int getNthreads(const char* name, int env, int min, int max, int def, int WarpSize) {
int nt = env;
if(nt > 0) {
if(nt % WarpSize != 0) {
WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WarpSize);
nt = max;
} else if(nt > max) {
WARN("Invalid %s %d (maximum %d).", name, nt, max);
nt = max;
} else if(nt < min) {
WARN("Invalid %s %d (minimum %d).", name, nt, min);
nt = min;
}
} else {
nt = def;
}
return nt;
}
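// For instance (illustrative values, WarpSize = 64): getNthreads("SCCL_NTHREADS",
// 96, 256, 1024, 512, 64) warns because 96 is not a multiple of the warp size
// and falls back to the maximum, 1024; a negative env value simply returns the
// default, 512.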
scclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
int def, set;
if(str[0] == '^') {
def = 1;
set = 0;
str++;
} else {
def = 0;
set = 1;
}
for(int i = 0; i < nelems; i++)
list[i] = def;
char* tokStr = strdup(str);
char* tmpStr;
char* token = strtok_r(tokStr, ",", &tmpStr);
while(token) {
for(int i = 0; i < nelems; i++)
if(strcasecmp(token, elems[i]) == 0)
list[i] = set;
token = strtok_r(NULL, ",", &tmpStr);
}
free(tokStr);
return scclSuccess;
}
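// Usage sketch (hypothetical element list): with elems = {"LL", "LL128",
// "Simple"}, parseList("LL,Simple", elems, 3, list) sets list = {1, 0, 1},
// while a leading '^' inverts the default: parseList("^LL", elems, 3, list)
// yields {0, 1, 1}.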
// Latencies in us, Bandwidths in GB/s
// Each row is { LL, LL128, Simple } for one algorithm.
static const float baseLat[SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS] = {{12.0, 12.0, 17.0}, // Tree
{12.0, 12.0, 17.0}, // Ring
{12.0, 12.0, 17.0}, // CollNet Direct
{12.0, 12.0, 17.0}, // CollNet Chain
{0, 0, 0}, // NVLS
{0, 0, 0}}; // NVLS Tree
// NVLink, PCI, Network
#define SCCL_HW_NVLINK 0
#define SCCL_HW_PCI 1
#define SCCL_HW_NET 2
struct tuningModel {
float hwLat[3][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
float bwRatio[2][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
float treeCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
float ringCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
};
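// Indexing sketch: hwLat[SCCL_HW_NVLINK][SCCL_ALGO_RING][SCCL_PROTO_SIMPLE] is
// the per-step NVLink latency of the Simple ring protocol; bwRatio[0] holds the
// 2-node ratios and bwRatio[1] those for larger jobs. The 27-entry correction
// tables appear to be indexed by a message-size bucket.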
static struct tuningModel tuning_model_0{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
/* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 1.4},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {11.8, 18.2, 20.8},
/* Ring (LL/LL128/Simple)*/ {9.5, 19.8, 15.1},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 11.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 18.2},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.28, 0.22, 0.91},
/* Ring (LL/LL128/Simple)*/ {0.31, 0.34, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.04, 0.22, 0.95},
/* Ring (LL/LL128/Simple)*/ {0.04, 0.34, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.2, 0.1, 0.1, 0.9, 0.3, 0.4, 0.1, 0.2, 0.4, 0.2, 0.1, 0.3, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.3, 1.0, 0.1, 0.5, 1.0, 0.9, 1.0, 1.0, 1.0, 0.3, 0.1, 0.4, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
// { 0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, },
{
0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4,
},
},
.ringCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.2, 0.3, 0.5, 0.3, 0.1, 0.5, 0.5, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3,
},
{
1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_1{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
/* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {33.0, 33.0, 15.8},
/* Ring (LL/LL128/Simple)*/ {5.1, 5.1, 68.8},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
/* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.15, 1.00, 0.42},
/* Ring (LL/LL128/Simple)*/ {0.20, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1,
},
{
0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1,
},
// { 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.3, 0.3, },
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 0.4, 0.4, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1,
},
},
.ringCorrectionFactor =
{
{
1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1,
},
{
1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1,
},
{
0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_2{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
/* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {27.9, 27.9, 15.8},
/* Ring (LL/LL128/Simple)*/ {12.1, 12.1, 68.8},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
/* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.07, 1.00, 0.42},
/* Ring (LL/LL128/Simple)*/ {0.08, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
{
0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
// { 1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.9, 0.9, 1.0, 1.0, 1.0, },
{
1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4,
},
},
.ringCorrectionFactor =
{
{
0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_3{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {0.8, 0.0, 2.5},
/* Ring (LL/LL128/Simple)*/ {0.8, 0.0, 3.6},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {12.5, 0.0, 22.4},
/* Ring (LL/LL128/Simple)*/ {9.5, 0.0, 19.8},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 12.5},
/* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 1.75},
/* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 0.96},
/* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.2, 1.0, 0.9, 1.0, 0.6, 0.4, 0.6, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
{
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
},
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.8, 0.9, 0.7, 0.7, },
{
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.3, 0.3, 0.4, 0.3, 0.3,
},
},
.ringCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.2, 0.1, 0.4, 0.4, 0.2, 0.2, 0.3, 0.7, 0.5, 0.4, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
{
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_4{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
/* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
/* CollNetDirect (Simple)*/ {0.8, 1.4, 2.5},
/* CollNetChain (Simple)*/ {0.8, 1.4, 2.5},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {32.2, 34.4, 47.6},
/* Ring (LL/LL128/Simple)*/ {35.4, 87.8, 209.2},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 47.6},
/* CollNetChain (Simple)*/ {0.0, 0.0, 47.6},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.61},
/* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.08},
/* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.1, 0.2, 0.4, 0.6, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.2, 1.0, 0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.5, 0.6, 0.6, 0.5, 0.6, 0.6, 0.6, 0.7, },
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, },
},
.ringCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.4, 0.5, 0.5, 0.4, 0.4, 0.4, 0.4, 0.2, 0.2, 0.1, 0.3, 1.0, 1.0, 0.7, 0.8, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.5, 0.4, 0.3, 0.3,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel rcclTuningModel[] = {
tuning_model_0,
tuning_model_1,
tuning_model_2,
tuning_model_3,
tuning_model_4,
};
/* Array indexes used below */
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2
// LL128 max BW per channel
static const double llMaxBws[3][3] = {
    /* Volta-N1/Intel-N2/Intel-N4 */ {39.0, 39.0, 20.4},
    /* Ampere-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
    /* Hopper-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}};
static const double perChMaxRingLL128Bws[3][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
};
static const double perChMaxTreeLL128Bws[3][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
};
static const double perChMaxTreeBws[3][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
};
// Network post overhead in ns (1000 = 1 us)
SCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
static float getNetOverhead(struct scclComm* comm) {
if(scclParamNetOverhead() != -2)
return scclParamNetOverhead() * .001;
int cpuArch, cpuVendor, cpuModel;
SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
if(cpuArch == SCCL_TOPO_CPU_ARCH_X86 && cpuVendor == SCCL_TOPO_CPU_VENDOR_INTEL)
return 1.0;
if(cpuArch == SCCL_TOPO_CPU_ARCH_X86 && cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD)
return 2.0;
else
return 1.0;
}
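// Worked example (illustrative): with SCCL_NET_OVERHEAD unset, the parameter stays at its
// -2 sentinel and the CPU-based defaults above apply (e.g. 2.0 us on AMD x86 hosts).
// Setting SCCL_NET_OVERHEAD=1500 makes scclParamNetOverhead() return 1500 (ns), so
// getNetOverhead() yields 1500 * .001 = 1.5 us.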
scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs) {
int simpleDefaultThreads = (graphs[SCCL_ALGO_RING]->bwIntra * graphs[SCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : SCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE] =
getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] =
getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL] =
comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_LL] =
getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL128] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL128] =
getNthreads("SCCL_LL128_NTHREADS", scclParamLl128Nthreads(), 4 * comm->WarpSize, SCCL_LL128_MAX_NTHREADS, SCCL_LL128_MAX_NTHREADS, comm->WarpSize);
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
if(nRanks <= 1)
return scclSuccess;
int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
int cpuArch, cpuVendor, cpuModel;
SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
int index2 = nNodes <= 2 ? nNodes - 1 : 2;
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
int index1 = nNodes == 1 ? compCapIndex : cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
double llMaxBw = llMaxBws[index1][index2];
double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
    // De-penalize Tree/Simple latency on Power systems to favor Tree over Ring
// if (cpuArch == SCCL_TOPO_CPU_ARCH_POWER) hwLat[SCCL_HW_PCI][SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = hwLat[SCCL_HW_PCI][SCCL_ALGO_RING][SCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
int intraHw[SCCL_NUM_ALGORITHMS], hw[SCCL_NUM_ALGORITHMS];
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? SCCL_HW_NVLINK : SCCL_HW_PCI;
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
hw[a] = nNodes == 1 ? intraHw[a] : SCCL_HW_NET;
for(int coll = 0; coll < SCCL_NUM_FUNCTIONS; coll++) {
int nsteps = coll == scclFuncAllReduce ? 2 * (nRanks - 1) : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nRanks - 1 : nRanks;
int nInterSteps = coll == scclFuncAllReduce ? (nNodes > 1 ? 2 * nNodes : 0)
: coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nNodes - 1
: nNodes;
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
if(coll == scclFuncBroadcast && a != SCCL_ALGO_RING)
continue;
if(coll == scclFuncReduce && a != SCCL_ALGO_RING)
continue;
if(coll == scclFuncReduceScatter && a != SCCL_ALGO_RING)
continue;
if(coll == scclFuncAllGather && a != SCCL_ALGO_RING)
continue;
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
if((a == SCCL_ALGO_NVLS || a == SCCL_ALGO_NVLS_TREE) && p != SCCL_PROTO_SIMPLE)
continue;
int collnet = (a == SCCL_ALGO_COLLNET_DIRECT || a == SCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
// INFO(SCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", scclAlgoStr[a], scclProtoStr[p], busBw,
// comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);
// Various model refinements
if(nNodes <= 2)
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
else
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
if(a == SCCL_ALGO_COLLNET_DIRECT && p == SCCL_PROTO_SIMPLE && minCompCap >= 90)
busBw *= .85;
// Convert bus BW to algorithm BW
float ratio;
if(a == SCCL_ALGO_RING)
ratio = (1.0 * nRanks) / nsteps;
else if(a == SCCL_ALGO_NVLS)
ratio = 5.0 / 6.0;
else if(a == SCCL_ALGO_NVLS_TREE)
ratio = .70 * nNodes / (2 * (nNodes - 1));
else
ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
// if (nNodes > 1 && p == SCCL_PROTO_LL) intraLat *= 1.8;
if(p == SCCL_PROTO_SIMPLE)
interLat += graphs[a]->latencyInter;
if(a == SCCL_ALGO_RING) {
float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
if((coll == scclFuncReduce || coll == scclFuncBroadcast)) {
if(graphs[a]->sameChannels) {
comm->latencies[coll][a][p] += lat;
} else {
if(p == SCCL_PROTO_SIMPLE)
lat = rcclTuningModel[comm->topo->tuning]
.hwLat[hw[a]][SCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
comm->latencies[coll][a][p] += nsteps * lat;
}
} else {
// Inter-node rings still have to launch nsteps * net overhead.
float netOverhead = 0.0;
if(nNodes > 1) {
netOverhead = getNetOverhead(comm);
if(p == SCCL_PROTO_SIMPLE)
netOverhead *= 3;
}
intraLat = std::max(intraLat, netOverhead);
comm->latencies[coll][a][p] += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat;
}
} else if(a == SCCL_ALGO_TREE) {
comm->latencies[coll][a][p] += 2 * ((nRanks / nNodes - 1) * intraLat + log2i(nNodes) * interLat);
} else if(a == SCCL_ALGO_COLLNET_DIRECT) {
comm->latencies[coll][a][p] +=
2 * (std::min(1, (nRanks / nNodes - 1)) * intraLat + (nRanks / nNodes - 1) * 0.5) + interLat; // Add 0.5 arity serialization latency
} else if(a == SCCL_ALGO_COLLNET_CHAIN) {
comm->latencies[coll][a][p] += 2 * (nRanks / nNodes - 1) * intraLat + interLat;
} else if(a == SCCL_ALGO_NVLS) {
if(nNodes > 1)
comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
} else if(a == SCCL_ALGO_NVLS_TREE) {
comm->latencies[coll][a][p] += 2 * (nNodes - 1) * rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
}
}
}
}
// Protocols/Algorithms enable/disable, and user overrides.
    // All are enabled except LL128, which is enabled by default only in certain cases.
int protoEnable[SCCL_NUM_PROTOCOLS] = {1, 2, 1};
int algoEnable[SCCL_NUM_ALGORITHMS] = {1, 1, 1, 1, 1, 1};
const char* protoStr = getenv("SCCL_PROTO");
if(protoStr) {
INFO(SCCL_ENV, "SCCL_PROTO set by environment to %s", protoStr);
SCCLCHECK(parseList(protoStr, scclProtoStr, SCCL_NUM_PROTOCOLS, protoEnable));
}
const char* algoStr = getenv("SCCL_ALGO");
if(algoStr) {
INFO(SCCL_ENV, "SCCL_ALGO set by environment to %s", algoStr);
SCCLCHECK(parseList(algoStr, scclAlgoStr, SCCL_NUM_ALGORITHMS, algoEnable));
}
if(comm->nNodes == 1)
algoEnable[SCCL_ALGO_NVLS_TREE] = 0;
// Disable CollNet if it is not supported
if(comm->collNetSupport == 0) {
algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
algoEnable[SCCL_ALGO_COLLNET_CHAIN] = 0;
if(comm->nNodes > 1)
algoEnable[SCCL_ALGO_NVLS] = 0;
// If user has hard set SCCL_ALGO=COLLNET, ignore it
if(algoEnable[SCCL_ALGO_RING] == 0 && algoEnable[SCCL_ALGO_TREE] == 0 && algoEnable[SCCL_ALGO_NVLS] == 0 && algoEnable[SCCL_ALGO_NVLS_TREE] == 0) {
algoEnable[SCCL_ALGO_RING] = algoEnable[SCCL_ALGO_TREE] = 1;
if(comm->rank == 0)
WARN("CollNet is not supported or fails to initialize, ignoring SCCL_ALGO=COLLNET");
}
} else {
// Disable CollNet+Direct if not on an NVSwitch system
int nvsCount = 0;
SCCLCHECK(scclTopoGetNvsCount(comm->topo, &nvsCount));
if(nvsCount == 0)
algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
}
for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++)
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
// Disable LL protocol on gfx11xx
int pEnable = protoEnable[p];
if(pEnable == 2 && p == SCCL_PROTO_LL128) {
#if defined(ENABLE_LL128)
// Enable LL128 by default only on gfx90a with available tuning table
pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
(IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled)
? 1
: 0;
#else
pEnable = 0;
#endif
}
if(pEnable == 0)
comm->bandwidths[c][a][p] = 0;
                // Never disable ring for non-allreduce operations. That allows running real apps with SCCL_ALGO=TREE.
if(a == SCCL_ALGO_RING && c != scclFuncAllReduce)
continue;
if(algoEnable[a] == 0)
comm->bandwidths[c][a][p] = 0;
}
if(comm->rank == 0) {
char line[1024];
for(int block = 0; block < 2; block++) {
sprintf(line, " Algorithm |");
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
sprintf(line + strlen(line), " %14s %14s %14s |", "", scclAlgoStr[a], "");
}
INFO(SCCL_TUNING, "%s", line);
sprintf(line, " Protocol |");
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
sprintf(line + strlen(line), " %14s |", scclProtoStr[p]);
}
}
INFO(SCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
sprintf(line + strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(SCCL_TUNING, "%s", line);
for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", scclFuncStr[c]);
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
sprintf(line + strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
}
INFO(SCCL_TUNING, "%s", line);
}
}
}
// Set per-thread amount of work before we increase nThreads and nChannels
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
comm->threadThresholds[a][SCCL_PROTO_LL] = SCCL_LL_THREAD_THRESHOLD;
comm->threadThresholds[a][SCCL_PROTO_LL128] = SCCL_LL128_THREAD_THRESHOLD;
comm->threadThresholds[a][SCCL_PROTO_SIMPLE] = SCCL_SIMPLE_THREAD_THRESHOLD;
}
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL] *= nRanks;
comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] = 256;
comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE] = 256;
// Override defaults with user env
char* str = getenv("SCCL_THREAD_THRESHOLDS");
if(str) {
INFO(SCCL_ENV, "SCCL_THREAD_THRESHOLDS set by environment to %s", str);
ssize_t t[2][SCCL_NUM_PROTOCOLS] = {{-2, -2, -2}, {-2, -2, -2}};
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0] + 1, t[0] + 2, t[1], t[1] + 1, t[1] + 2);
for(int a = 0; a < 2; a++) {
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
if(t[a][p] >= 0)
comm->threadThresholds[a][p] = t[a][p];
}
}
}
INFO(SCCL_INIT,
"threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL],
comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL128],
comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE],
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL],
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL128],
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE],
comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE],
comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE]);
return scclSuccess;
}
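// Usage note (illustrative): SCCL_THREAD_THRESHOLDS takes up to six values, the Tree
// thresholds followed by the Ring thresholds, each ordered LL/LL128/Simple. For example
//   SCCL_THREAD_THRESHOLDS="65536 65536 524288 65536 65536 524288"
// overrides all six; any negative value leaves the corresponding default computed above
// in place.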
scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
float lat = info->comm->latencies[info->coll][algorithm][protocol];
if(bw == 0) {
*time = -1.0;
return scclSuccess;
}
int logSize = log2i(info->nBytes >> 6);
if(algorithm == SCCL_ALGO_TREE) {
if(logSize < 27)
bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][logSize];
else
bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][26];
} else if(algorithm == SCCL_ALGO_RING && info->comm->nNodes > 1) {
if(logSize < 27)
bw *= rcclTuningModel[info->comm->topo->tuning].ringCorrectionFactor[protocol][logSize];
else
bw *= rcclTuningModel[info->comm->topo->tuning].ringCorrectionFactor[protocol][26];
}
// Tree pipelining saves latency in aggregation cases
int latCount = algorithm == SCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, SCCL_MAX_WORK_ELEMENTS);
*time = lat * latCount + (info->nBytes) / (1000 * bw);
return scclSuccess;
}
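// Illustrative sketch (not part of this file) of how a caller could use
// scclTopoGetAlgoTime to pick the fastest algorithm/protocol pair; it follows the
// convention above that a negative time means the pair is disabled:
//
//   float minTime = 3600000000.0; // start from a huge value
//   int bestAlgo = -1, bestProto = -1;
//   for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
//       for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
//           float time;
//           SCCLCHECK(scclTopoGetAlgoTime(info, a, p, 1, &time));
//           if(time >= 0 && time < minTime) { minTime = time; bestAlgo = a; bestProto = p; }
//       }
//
// For the correction-factor lookup above: a 1 MiB message gives
// logSize = log2i(1048576 >> 6) = 14, so index 14 of the ring/tree correction table
// scales the bandwidth; messages of 8 GiB and beyond clamp to index 26.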
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <stdint.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include "base.h"
#include "hardware_utils.h"
namespace sccl {
namespace hardware {} // namespace hardware
} // namespace sccl
@@ -2,7 +2,13 @@
 #include <stdint.h>
 #include "base.h"
+#include "comm.h"
 namespace sccl {
-namespace hardware {} // namespace hardware
+namespace hardware {
+namespace ops {
+////
+} // namespace ops
+} // namespace hardware
 } // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "ibvwrap.h"
#include "net_utils.h"
namespace sccl {
namespace hardware {
namespace net {
namespace device {
//////////////////////////////////
extern scclNet_t scclNetIb;
} // namespace device
} // namespace net
} // namespace hardware
} // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "base.h"
#include "net_utils.h"
namespace sccl {
namespace hardware {
namespace net {
namespace host {
//////////////////////////////////
extern scclNet_t scclNetSocket;
} // namespace host
} // namespace net
} // namespace hardware
} // namespace sccl
#include <pthread.h>
#include <stdlib.h>
#include <poll.h>
#include <limits.h>
#include <fcntl.h>
#include <thread> // for std::this_thread::sleep_for
#include "ipc_socket.h"
namespace sccl {
namespace hardware {
namespace net {
namespace ipc_socket {
//////////////////////////////////////// scclIpcSocket member functions ////////////////////////////////////////
scclIpcSocket::scclIpcSocket(int localRank, int localRanks, uint64_t hash, volatile uint32_t* abortFlag)
: localRank(localRank), localRanks(localRanks), ipc_hash(hash) {
scclResult_t res;
handle = new struct scclIpcSocketHandle();
if(localRanks > 0) {
        pthread_pool = new ThreadPool(localRanks * 2); // half of the threads send, the other half receive
}
SCCLCHECKGOTO(scclIpcSocketInit(abortFlag), res, failure);
return;
failure:
WARN("scclIpcSocket init failed");
return;
}
scclIpcSocket::~scclIpcSocket() {
    // Release the thread pool
if(pthread_pool) {
delete(pthread_pool);
}
    // Release the handle
if(handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
if(handle->fd >= 0) {
close(handle->fd);
}
delete(handle);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
scclResult_t scclIpcSocket::scclIpcSocketInit(volatile uint32_t* abortFlag) {
    // Temporaries
    int fd = -1;
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    // Initialize the handle members
    handle->fd = -1;
    handle->socketName[0] = '\0';
    // Create a Unix domain socket. AF_UNIX selects the local (IPC) address family;
    // SOCK_DGRAM gives a connectionless, message-oriented socket, as opposed to the
    // stream-oriented SOCK_STREAM.
    if((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
        WARN("UDS: Socket creation error : %d", errno);
        return scclSystemError;
    }
    // Zero the address structure so no stale data remains
    bzero(&my_cliaddr, sizeof(my_cliaddr));
    my_cliaddr.sun_family = AF_UNIX;
    // Create a unique name for the socket
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, localRank, ipc_hash);
    if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot bind provided name to socket. Name too large");
        return scclInternalError;
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Creating socket %s", temp_addr);
    // Set the socket path
    strncpy(my_cliaddr.sun_path, temp_addr, len);
    my_cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
    // Bind the socket
    if(bind(fd, (struct sockaddr*)&my_cliaddr, sizeof(my_cliaddr)) < 0) {
        WARN("UDS: Binding to socket %s failed : %d", temp_addr, errno);
        close(fd);
        return scclSystemError;
    }
    // Fill in the handle members
    handle->fd = fd;
    strcpy(handle->socketName, temp_addr);
    // Store the abort flag
    handle->abortFlag = abortFlag;
    // Mark the socket as non-blocking
    if(handle->abortFlag) {
        int flags;
        EQCHECK(flags = fcntl(fd, F_GETFL), -1);
        SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
    }
    return scclSuccess;
}
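// Note on the abstract-namespace trick used above: overwriting sun_path[0] with '\0'
// turns "/tmp/sccl-socket-<rank>-<hash>" into an abstract socket name, so nothing is
// created on the filesystem and the address vanishes when the descriptor is closed.
// For example (illustrative), local rank 3 with hash 0xdeadbeef binds the abstract
// name "\0tmp/sccl-socket-3-deadbeef".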
/**
 * Set the abort flag and update the socket's non-blocking mode.
 *
 * @param flag pointer to the abort flag. If non-null, the socket is put into
 *             non-blocking mode; if null, it is restored to blocking mode.
 * @note The function only acts when the handle is valid.
 */
scclResult_t scclIpcSocket::setAbortFlag(volatile uint32_t* flag) {
if(handle) {
handle->abortFlag = flag;
if(flag) {
int flags;
EQCHECK(flags = fcntl(handle->fd, F_GETFL), -1);
SYSCHECK(fcntl(handle->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
} else {
int flags;
EQCHECK(flags = fcntl(handle->fd, F_GETFL), -1);
SYSCHECK(fcntl(handle->fd, F_SETFL, flags & ~O_NONBLOCK), "fcntl");
}
}
return scclSuccess;
}
// Return the abort flag
volatile uint32_t* scclIpcSocket::getAbortFlag() const { return handle ? handle->abortFlag : nullptr; }
/**
 * Set the IPC socket timeout.
 *
 * @param timeout_ms timeout in milliseconds
 * @return scclSuccess on success
 */
scclResult_t scclIpcSocket::setTimeout(int timeout_ms) {
timeoutMs = timeout_ms;
return scclSuccess;
}
ThreadPool* scclIpcSocket::getPthreadPool() { return pthread_pool; }
//////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Send a file descriptor over the Unix domain socket.
 *
 * @param sendFd file descriptor to send
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (e.g. address too long or abort flag set)
 *         - scclSystemError: system call error
 *
 * @note Uses the Linux abstract socket namespace trick (sun_path[0] set to '\0').
 *       The descriptor is passed via the SCM_RIGHTS ancillary mechanism.
 *       The send is retried in a loop until it succeeds or fails hard.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendFd(const int sendFd, int dst_rank) {
    // Temporary string holding the destination address
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    // Format the destination address
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
    // Make sure the address string fits
    if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot connect to provided name for socket. Name too large");
        return scclInternalError;
    }
    // Log the descriptor transfer
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending fd %d to UDS socket %s/fd:%d", sendFd, temp_addr, handle->fd);
    // Message header and iovec
    struct msghdr msg;
    struct iovec iov[1];
    // Union guarantees the alignment required for the control buffer
    union {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    struct sockaddr_un cliaddr;
    // Construct the client address used to send the shared handle
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
    // Fill in the control (ancillary) part of the message header
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    cmptr = CMSG_FIRSTHDR(&msg);
    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
    cmptr->cmsg_level = SOL_SOCKET;
    cmptr->cmsg_type = SCM_RIGHTS;
    // Copy the file descriptor to send into the control message
    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
    // Fill in the address part of the message header
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(struct sockaddr_un);
    // The iovec carries one dummy payload byte
    iov[0].iov_base = (void*)"";
    iov[0].iov_len = 1;
    // Attach the iovec to the message header
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    // Initialize the message flags
    msg.msg_flags = 0;
    ssize_t sendResult;
    // Retry sending until the message goes out
    while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Sending data over socket %s failed : %d", temp_addr, errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
    }
    // Success
    return scclSuccess;
}
/**
 * @brief Receive a file descriptor over the IPC socket.
 *
 * Uses the recvmsg system call to receive a file descriptor from the socket,
 * retrying in a loop until it succeeds or a hard error occurs. The received
 * descriptor is returned through the recvFd parameter.
 *
 * @param recvFd pointer receiving the incoming file descriptor
 * @return scclResult_t operation result:
 *         - scclSuccess: descriptor received
 *         - scclSystemError: system call failure
 *         - scclInternalError: operation aborted
 *
 * @note EAGAIN, EWOULDBLOCK and EINTR are retried; any other error fails the call.
 *       The received control message must be of level SOL_SOCKET and type SCM_RIGHTS.
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvFd(int* recvFd) {
    // Set up the message header and iovec
    struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
    struct iovec iov[1];
    // Union guarantees the alignment required for the control buffer
    union {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    char dummy_buffer[1];
    int ret;
    // Fill in the control (ancillary) part of the message header
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    // The iovec receives the one dummy payload byte
    iov[0].iov_base = (void*)dummy_buffer;
    iov[0].iov_len = sizeof(dummy_buffer);
    // Attach the iovec to the message header
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    // Retry receiving until a message arrives
    while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Receiving data over socket failed : %d", errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
    }
    // Validate the received control message
    if(((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
        // Wrong level or type means this is not an SCM_RIGHTS transfer
        if((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
            WARN("UDS: Receiving data over socket failed");
            return scclSystemError;
        }
        // Copy the received file descriptor into recvFd
        memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
    } else {
        // No control message was received
        WARN("UDS: Receiving data over socket %s failed", handle->socketName);
        return scclSystemError;
    }
    // Log the received file descriptor
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
    // Success
    return scclSuccess;
}
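// Round-trip sketch (illustrative; the ranks, the file and the scclIpcSocket instances
// are hypothetical): rank 0 passes an open descriptor to rank 1. The kernel duplicates
// it through SCM_RIGHTS, so the number received may differ from the one sent while
// referring to the same open file description.
//
//   // on local rank 0:
//   int fd = open("/dev/shm/sccl-buf", O_RDWR);  // hypothetical resource
//   SCCLCHECK(sock.scclIpcSocketSendFd(fd, /*dst_rank=*/1));
//   // on local rank 1:
//   int recvFd = -1;
//   SCCLCHECK(sock.scclIpcSocketRecvFd(&recvFd));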
/**
 * @brief Send data to the given destination rank over the IPC socket.
 *
 * @param data pointer to the data to send
 * @param dataLen length of the data to send
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (e.g. socket name too long or abort flag set)
 *         - scclSystemError: system call error (e.g. poll timeout or sendmsg failure)
 *
 * @note Uses the Linux abstract socket namespace; poll is used to make sure the socket
 *       is writable before sending. EAGAIN/EWOULDBLOCK/EINTR errors are retried.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendData(const void* data, size_t dataLen, int dst_rank) {
    // Build the destination address string
char temp_addr[SCCL_IPC_SOCKNAME_LEN];
int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
WARN("UDS: Unable to connect to the provided socket name. Name too long");
return scclInternalError;
}
    // Set up the message structures
struct msghdr msg;
struct iovec iov[1];
struct sockaddr_un cliaddr;
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
iov[0].iov_base = (void*)data;
iov[0].iov_len = dataLen;
msg.msg_name = (void*)&cliaddr;
msg.msg_namelen = sizeof(cliaddr);
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
    // Wait with poll until the socket is writable
struct pollfd pfd;
pfd.fd = handle->fd;
pfd.events = POLLOUT;
int pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to send data to socket %s", temp_addr);
} else {
WARN("UDS: Error occurred while polling socket %s for writability : %d", temp_addr, errno);
}
return scclSystemError;
}
ssize_t sendResult;
while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Error occurred while sending data through socket %s : %d", temp_addr, errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
        // If sendmsg failed with EAGAIN or EWOULDBLOCK, poll again
pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to send data to socket %s", temp_addr);
} else {
WARN("UDS: Error occurred while polling socket %s for writability : %d", temp_addr, errno);
}
return scclSystemError;
}
}
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Successfully sent %zu bytes of data through UDS socket %s", dataLen, temp_addr);
return scclSuccess;
}
/**
 * @brief Receive data over the IPC socket.
 *
 * Waits with poll until the socket is readable, then receives the data with recvmsg.
 * Supports timeouts and abort handling; returns the matching error code on failure
 * or timeout.
 *
 * @param buffer pointer to the receive buffer
 * @param bufferLen length of the buffer
 * @param receivedLen number of bytes actually received (output parameter)
 * @return scclResult_t operation result:
 *         - scclSuccess: data received
 *         - scclSystemError: system call error
 *         - scclInternalError: aborted via the abort flag
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvData(void* buffer, size_t bufferLen, size_t* receivedLen) {
    // Set up the message structures
struct msghdr msg = {0};
struct iovec iov[1];
iov[0].iov_base = buffer;
iov[0].iov_len = bufferLen;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
    // Wait with poll until the socket is readable
struct pollfd pfd;
pfd.fd = handle->fd;
pfd.events = POLLIN;
int pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to receive data from socket %s", handle->socketName);
} else {
WARN("UDS: Error occurred while polling socket %s for readability : %d", handle->socketName, errno);
}
return scclSystemError;
}
int ret;
while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Error occurred while receiving data through socket %s : %d", handle->socketName, errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
        // If recvmsg failed with EAGAIN or EWOULDBLOCK, poll again
pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to receive data from socket %s", handle->socketName);
} else {
WARN("UDS: Error occurred while polling socket %s for readability : %d", handle->socketName, errno);
}
return scclSystemError;
}
}
if(ret > 0) {
*receivedLen = ret;
        INFO(SCCL_LOG_BOOTSTRAP, "UDS: Successfully received %zu bytes of data from socket %s", *receivedLen, handle->socketName);
return scclSuccess;
} else {
WARN("UDS: Error occurred while receiving data through socket %s", handle->socketName);
return scclSystemError;
}
}
/**
 * @brief Send data to the given rank over the Unix domain socket, non-blocking variant.
 *
 * @param data pointer to the data to send
 * @param dataLen length of the data in bytes
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (address too long or abort flag set)
 *         - scclSystemError: system call error
 *
 * @note Uses the Linux abstract socket namespace.
 *       Keeps retrying until the send succeeds or an error occurs, waiting with poll
 *       for the socket to become writable between attempts.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendDataNonBlocking(const void* data, size_t dataLen, int dst_rank) {
    // Temporary string holding the destination socket address
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    // Format the destination address
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
    // Fail if the address string is too long
    if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot connect to provided name for socket. Name too large");
        return scclInternalError;
    }
    // Log the outgoing transfer
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending %zu bytes of data to UDS socket %s", dataLen, temp_addr);
    // Set up the message header
    struct msghdr msg;
    struct iovec iov[1];
    struct sockaddr_un cliaddr;
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
    iov[0].iov_base = (void*)data;
    iov[0].iov_len = dataLen;
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(cliaddr);
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_flags = 0;
    ssize_t sendResult;
    // Try to send; on failure wait until the socket becomes writable and retry
    while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Sending data over socket %s failed : %d", temp_addr, errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
        // Wait with poll until the socket becomes writable
        struct pollfd pfd;
        pfd.fd = handle->fd;
        pfd.events = POLLOUT;
        int pollResult = poll(&pfd, 1, -1); // wait indefinitely
        if(pollResult <= 0) {
            WARN("UDS: Polling for socket %s to become writable failed : %d", temp_addr, errno);
            return scclSystemError;
        }
    }
    return scclSuccess;
}
/**
 * @brief Receive data over the IPC socket, non-blocking variant.
 *
 * Receives data over the UDS socket; when no data is readable it waits until the
 * socket becomes readable or an error occurs.
 *
 * @param buffer pointer to the receive buffer
 * @param bufferLen length of the buffer
 * @param receivedLen number of bytes actually received (output parameter)
 * @return scclResult_t operation result:
 *         - scclSuccess: data received
 *         - scclSystemError: system call error
 *         - scclInternalError: interrupted via the abort flag
 *
 * @note Implemented with the recvmsg and poll system calls.
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvDataNonBlocking(void* buffer, size_t bufferLen, size_t* receivedLen) {
    // Set up the message header and iovec
    struct msghdr msg = {0};
    struct iovec iov[1];
    iov[0].iov_base = buffer;
    iov[0].iov_len = bufferLen;
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    int ret;
    // Try to receive; on failure wait until the socket becomes readable and retry
    while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Receiving data over socket failed : %d", errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
        // Wait with poll until the socket becomes readable
        struct pollfd pfd;
        pfd.fd = handle->fd;
        pfd.events = POLLIN;
        int pollResult = poll(&pfd, 1, -1); // wait indefinitely
        if(pollResult <= 0) {
            WARN("UDS: Polling for socket %s to become readable failed : %d", handle->socketName, errno);
            return scclSystemError;
        }
    }
    // Record the number of received bytes and return success
    if(ret > 0) {
        *receivedLen = ret;
        INFO(SCCL_LOG_BOOTSTRAP, "UDS: Received %zu bytes of data from socket %s", *receivedLen, handle->socketName);
        return scclSuccess;
    } else {
        WARN("UDS: Receiving data over socket %s failed", handle->socketName);
        return scclSystemError;
    }
}
/**
 * @brief Allgather implemented over the IPC sockets.
 *
 * Uses the thread pool to send and receive in parallel, implementing an allgather
 * collective among the local ranks.
 *
 * @param sendData pointer to the send buffer
 * @param recvData pointer to the receive buffer
 * @param dataLen per-rank data length in bytes
 * @param wait whether to wait for all transfers to complete
 * @return scclResult_t operation result (scclSuccess on success)
 *
 * @note 1. The local rank's own data is copied directly, not transferred.
 *       2. Packet layout: [sender rank (int)][payload].
 *       3. The receive buffer must be preallocated with size localRanks * dataLen.
 */
scclResult_t scclIpcSocket::scclIpcSocketAllgather(const void* sendData, void* recvData, size_t dataLen, bool wait) {
    if(pthread_pool == nullptr || localRanks <= 0) {
        WARN("scclIpcSocket init error!");
        return scclInternalError;
    }
    std::vector<std::future<void>> futures;
    // Use the thread pool to send and receive in parallel
    for(int i = 0; i < localRanks; ++i) {
        if(i != localRank) {
            auto sendTask = [this, sendData, dataLen, i]() {
                // Compute the total size of the DataPackage
                size_t packageSize = sizeof(int) + dataLen;
                char* buffer = new char[packageSize];
                // Copy the rank information and the payload into the buffer
                int* rankPtr = reinterpret_cast<int*>(buffer);
                *rankPtr = localRank;
                char* dataPtr = buffer + sizeof(int);
                memcpy(dataPtr, sendData, dataLen);
                // Send the rank information and the payload in one message
                scclIpcSocketSendData(buffer, packageSize, i);
                delete[] buffer;
            };
            futures.push_back(pthread_pool->enqueue(sendTask));
            auto recvTask = [this, recvData, dataLen, i]() {
                // Prepare the receive buffer
                size_t packageSize = sizeof(int) + dataLen;
                char* buffer = new char[packageSize];
                size_t receivedLen;
                // Receive the rank information and the payload in one message
                scclIpcSocketRecvData(buffer, packageSize, &receivedLen);
                // Extract the rank information and the payload from the buffer
                int* rankPtr = reinterpret_cast<int*>(buffer);
                int senderRank = *rankPtr;
                char* dataPtr = buffer + sizeof(int);
                memcpy(static_cast<char*>(recvData) + senderRank * dataLen, dataPtr, dataLen);
                delete[] buffer;
            };
            futures.push_back(pthread_pool->enqueue(recvTask));
        } else {
            // Copy our own data straight to its slot
            memcpy(static_cast<char*>(recvData) + localRank * dataLen, sendData, dataLen);
        }
    }
    if(wait) {
        // Wait for all tasks to complete
        for(auto& fut : futures) {
            fut.get();
        }
    }
    return scclSuccess;
}
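// Usage sketch (illustrative; localRank, localRanks and hash stand in for the values
// produced by bootstrap): every local rank contributes one 8-byte handle and gets back
// the handles of all peers, placed by rank thanks to the rank field embedded in each
// packet.
//
//   scclIpcSocket sock(localRank, localRanks, hash);
//   uint64_t myHandle = 0x1234;                  // hypothetical per-rank payload
//   std::vector<uint64_t> all(localRanks);
//   SCCLCHECK(sock.scclIpcSocketAllgather(&myHandle, all.data(), sizeof(uint64_t)));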
/**
 * @brief Allgather over the IPC sockets, in-order variant.
 *
 * Implements an allgather that collects each process's data into every process's
 * receive buffer.
 *
 * @param sendData pointer to the send buffer
 * @param recvData pointer to the receive buffer
 * @param dataLen per-process send/receive length
 * @param wait whether to wait for all transfers to complete
 * @return scclResult_t scclSuccess on success, an error code otherwise
 *
 * @note 1. The local data is first copied to its slot in the receive buffer.
 *       2. Communication with the other processes runs in parallel on the thread pool.
 *       3. With wait == true the call blocks until all transfers complete.
 */
scclResult_t scclIpcSocket::scclIpcSocketAllgatherSync(const void* sendData, void* recvData, size_t dataLen, bool wait) {
if(pthread_pool == nullptr || localRanks <= 0) {
WARN("scclIpcSocket init error!");
return scclInternalError;
}
    // Copy the current process's data to its slot in the receive buffer
memcpy(static_cast<char*>(recvData) + localRank * dataLen, sendData, dataLen);
std::vector<std::future<void>> futures;
    // Use the thread pool to send and receive in parallel
for(int i = 0; i < localRanks; ++i) {
if(i != localRank) {
auto sendTask = [this, sendData, dataLen, i]() { scclIpcSocketSendData(sendData, dataLen, i); };
futures.push_back(pthread_pool->enqueue(sendTask));
auto recvTask = [this, recvData, dataLen, i]() {
size_t receivedLen;
scclIpcSocketRecvData(reinterpret_cast<char*>(recvData) + i * dataLen, dataLen, &receivedLen);
};
futures.push_back(pthread_pool->enqueue(recvTask));
}
}
if(wait) {
        // Wait for all tasks to complete
for(auto& fut : futures) {
fut.get();
}
}
return scclSuccess;
}
/**
 * @brief Broadcast over the IPC sockets.
 *
 * The root process sends its data to every other process, and the non-root processes
 * receive it from the root. The call can optionally wait for all transfers to complete.
 *
 * @param sendData pointer to the send buffer (used by the root)
 * @param recvData pointer to the receive buffer (used by non-root processes)
 * @param dataLen data length in bytes
 * @param root rank of the root process
 * @param wait whether to wait for all transfers to complete
 *
 * @return scclResult_t operation result:
 *         - scclSuccess: operation succeeded
 *         - scclInternalError: the IPC socket is not initialized or the local rank count is invalid
 *         - scclInvalidArgument: the root rank is invalid
 */
scclResult_t scclIpcSocket::scclIpcSocketBroadcast(const void* sendData, void* recvData, size_t dataLen, int root, bool wait) {
if(pthread_pool == nullptr || localRanks <= 0) {
WARN("scclIpcSocket init error!");
return scclInternalError;
}
if(root < 0 || root >= localRanks) {
WARN("scclIpcSocketBroadcast: Invalid root rank %d", root);
return scclInvalidArgument;
}
    std::vector<std::future<scclResult_t>> futures; // collect each task's return value through a future
if(localRank == root) {
        // Root process: send the data to every other process
for(int i = 0; i < localRanks; ++i) {
if(i != root) {
auto sendTask = [this, sendData, dataLen, i]() -> scclResult_t { return scclIpcSocketSendData(sendData, dataLen, i); };
futures.push_back(pthread_pool->enqueue(sendTask));
}
}
} else {
        // Non-root process: receive the data from the root
auto recvTask = [this, recvData, dataLen, root]() -> scclResult_t {
size_t receivedLen;
return scclIpcSocketRecvData(recvData, dataLen, &receivedLen);
};
futures.push_back(pthread_pool->enqueue(recvTask));
}
if(wait) {
        // Wait for all tasks to complete and check the results
for(auto& fut : futures) {
scclResult_t result = fut.get();
if(result != scclSuccess) {
WARN("scclIpcSocketBroadcast: Task failed with error %d", result);
return scclInternalError;
}
}
}
return scclSuccess;
}
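// Usage sketch (illustrative): the root fills a small blob and every local rank calls
// the same function; non-root ranks receive into the same buffer.
//
//   char blob[128] = {0};
//   if(localRank == 0) { /* fill blob on the root */ }
//   SCCLCHECK(sock.scclIpcSocketBroadcast(blob, blob, sizeof(blob), /*root=*/0));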
} // namespace ipc_socket
} // namespace net
} // namespace hardware
} // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/un.h>
#include "base.h"
#include "net_utils.h"
#include "socket.h"
#include "thread_pool.h"
namespace sccl {
namespace hardware {
namespace net {
namespace ipc_socket {
#define SCCL_IPC_SOCKNAME_LEN 64
#define SCCL_IPC_SOCKNAME_STR "/tmp/sccl-socket-%d-%lx"
// IPC socket handle
struct scclIpcSocketHandle {
    int fd;                                 // file descriptor
    char socketName[SCCL_IPC_SOCKNAME_LEN]; // socket name
    volatile uint32_t* abortFlag;           // flag used to abort operations
};
// Wraps the data to send: the sender's rank plus the actual payload
struct DataPackage {
    int rank;
    char data[]; // flexible array member holding the actual payload
};
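// Illustrative use of the flexible array member (an allocation pattern, not code from
// this file): a DataPackage with dataLen payload bytes is allocated as one block, e.g.
//   DataPackage* pkg = (DataPackage*)malloc(sizeof(DataPackage) + dataLen);
//   pkg->rank = localRank;
//   memcpy(pkg->data, payload, dataLen);
// The Allgather implementation builds the equivalent [rank][payload] layout by hand in
// a char buffer.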
//////////////////////////////////////////////////////////////////////////////////////////////////////
class scclIpcSocket {
public:
    // Constructor and destructor
    scclIpcSocket(int localRank, int localRanks, uint64_t hash, volatile uint32_t* abortFlag = nullptr);
    virtual ~scclIpcSocket();
    // Initialize the IPC socket
    scclResult_t scclIpcSocketInit(volatile uint32_t* abortFlag);
    // Set the abort flag
    scclResult_t setAbortFlag(volatile uint32_t* flag);
    // Get the abort flag
    volatile uint32_t* getAbortFlag() const;
    // Set the IPC socket timeout
    scclResult_t setTimeout(int timeout_ms);
    // Get the thread pool pointer
    ThreadPool* getPthreadPool();
//////////////////////////////////////////////////////////////////////////////////////////////////////
    /*
      In parallel computing, different processes may need to access the same file or
      network resource. Passing file descriptors avoids having several processes
      re-open the same file or re-establish the same network connection, saving
      resources and time.
    */
    // Send a file descriptor
    scclResult_t scclIpcSocketSendFd(const int sendFd, int dst_rank);
    // Receive a file descriptor
    scclResult_t scclIpcSocketRecvFd(int* fd);
    // Send data to a given destination over the Unix domain socket, blocking
    scclResult_t scclIpcSocketSendData(const void* data, size_t dataLen, int dst_rank);
    // Receive data over the Unix domain socket, blocking
    scclResult_t scclIpcSocketRecvData(void* buffer, size_t bufferLen, size_t* receivedLen);
    // Send data to a given destination over the Unix domain socket, non-blocking
    scclResult_t scclIpcSocketSendDataNonBlocking(const void* data, size_t dataLen, int dst_rank);
    // Receive data over the Unix domain socket, non-blocking
    scclResult_t scclIpcSocketRecvDataNonBlocking(void* buffer, size_t bufferLen, size_t* receivedLen);
    // Allgather across the local ranks. Each message carries the sender's rank, so
    // correctness does not depend on the order in which messages arrive
    scclResult_t scclIpcSocketAllgather(const void* sendData, void* recvData, size_t dataLen, bool wait = true);
    // Allgather across the local ranks without per-message rank tags (less overhead,
    // but receive slots are filled in posting order)
    scclResult_t scclIpcSocketAllgatherSync(const void* sendData, void* recvData, size_t dataLen, bool wait = true);
    // Broadcast across the local ranks
    scclResult_t scclIpcSocketBroadcast(const void* sendData, void* recvData, size_t dataLen, int root, bool wait = true);
private:
    // Handle describing the IPC socket connection
    struct scclIpcSocketHandle* handle = nullptr;
    // sockaddr_un structure holding the client address information
    struct sockaddr_un my_cliaddr;
    // Hash used to generate a unique socket name
    const uint64_t ipc_hash;
    // Non-blocking socket setting
    const volatile uint32_t* my_abortFlag;
    // Process rank information
    int localRank = -1;
    int localRanks = 0;
    // Thread pool pointer
    ThreadPool* pthread_pool = nullptr;
    // Timeout, 10000 milliseconds by default
    int timeoutMs = 10000;
};
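// thread_pool.h is not part of this excerpt; a minimal sketch of the interface the
// class above relies on (an assumption, not the actual implementation):
//
//   class ThreadPool {
//   public:
//       explicit ThreadPool(size_t nThreads); // spawns nThreads worker threads
//       ~ThreadPool();                        // drains the queue and joins the workers
//       // Schedules f on a worker; the returned future yields f's result.
//       template <class F>
//       auto enqueue(F&& f) -> std::future<typename std::result_of<F()>::type>;
//   };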
} // namespace ipc_socket
} // namespace net
} // namespace hardware
} // namespace sccl
#include <stdint.h>
#include "net.h"
namespace sccl {
namespace hardware {
namespace net {
/**
 * Print socket address information.
 *
 * @param sock_addr pointer to the socket address union
 * @param prefix prefix string for the output
 * @return scclResult_t, scclSuccess on success
 *
 * @note Formats the socket address and prints it between separator lines for readability.
 */
scclResult_t printSocketAddr(union net_socket::scclSocketAddress* sock_addr, const char* prefix) {
char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
net::net_socket::scclSocketToString(sock_addr, line);
printf("\n==========================================\n%s addr: %s"
"\n==========================================\n",
prefix,
line);
return scclSuccess;
}
/**
 * Print socket information.
 *
 * @param sock pointer to the scclSocket structure describing the socket
 * @param prefix prefix string for the output
 * @return scclResult_t, scclSuccess on success
 *
 * Formats and prints the socket's debug details: file descriptor, retry counters,
 * address, state flags, and so on, between separator lines for readability.
 */
scclResult_t printSocketInfo(struct net_socket::scclSocket* sock, const char* prefix) {
char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
net::net_socket::scclSocketToString(&sock->addr, line);
printf("\n==========================================\n%s: fd: %d, acceptFd: %d, timedOutRetries: %d, refusedRetries: %d, \naddr: %s, abortFlag=%u, "
"asyncFlag=%d, state=%d, salen=%d, magic=%lu, type=%d"
"\n==========================================\n",
prefix,
sock->fd,
sock->acceptFd,
sock->timedOutRetries,
sock->refusedRetries,
line,
sock->abortFlag != NULL ? *sock->abortFlag : 0,
sock->asyncFlag,
int(sock->state),
sock->salen,
sock->magic,
int(sock->type));
return scclSuccess;
}
////////////////////////////////////////////////////////////////////////////////////////
// Network state enumeration
typedef enum scclNetState {
    scclNetStateInit = 0,    // initial state
    scclNetStateEnabled = 1, // enabled
    scclNetStateDisabled = 2 // disabled
} scclNetState_t;
// One state per network type, all starting out uninitialized
scclNetState_t scclNetStates[scclNetTypeNum] = {scclNetStateInit, scclNetStateInit, scclNetStateInit};
/**
 * Get the state of the given network backend.
 *
 * @param i network backend index
 * @param state output parameter receiving the network state
 * @return scclResult_t, scclSuccess on success
 *
 * @note Thread-safe; the shared state is protected by a mutex.
 * @note If the network has not been initialized yet, it is initialized here and the
 *       state updated accordingly.
 */
scclResult_t netGetState(int i, scclNetState_t* state) {
pthread_mutex_lock(&netLock);
if(scclNetStates[i] == scclNetStateInit) {
int ndev;
if(scclNets[i]->init() != scclSuccess)
scclNetStates[i] = scclNetStateDisabled;
else if(scclNets[i]->devices(&ndev) != scclSuccess || ndev <= 0)
scclNetStates[i] = scclNetStateDisabled;
else
scclNetStates[i] = scclNetStateEnabled;
}
*state = scclNetStates[i];
pthread_mutex_unlock(&netLock);
return scclSuccess;
}
/**
 * @brief Initialize the network with the given name.
 *
 * Iterates over all available network types looking for an enabled network whose name
 * matches. If one is found it is assigned to the scclNet parameter.
 *
 * @param netName network name to look for; NULL matches any name
 * @param scclNet output parameter receiving the selected network instance
 *
 * @return scclResult_t operation result:
 *         - scclSuccess: a matching network was found
 *         - scclInvalidUsage: no matching network was found
 */
scclResult_t scclNetInit(const char* netName, scclNet_t*& scclNet) {
// Initialize main communication network
bool ok = false;
for(int i = 0; i < scclNetTypeNum; i++) {
if(scclNets[i] == nullptr)
continue;
enum scclNetState state;
SCCLCHECK(netGetState(i, &state));
if(state != scclNetStateEnabled)
continue;
if(netName && strcasecmp(netName, scclNets[i]->name) != 0)
continue;
scclNet = scclNets[i];
ok = true;
// if(scclCollNets[i]) {
// SCCLCHECK(collNetGetState(i, &state));
// if(state == scclNetStateEnabled) {
// comm->scclCollNet = scclCollNets[i];
// }
// }
break;
}
if(!ok) {
WARN("Error: network %s not found.", netName ? netName : "");
return scclInvalidUsage;
}
return scclSuccess;
}
} // namespace net
} // namespace hardware
} // namespace sccl
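// Usage sketch (illustrative; reading the name from an SCCL_NET environment variable is
// an assumption, not something this file does): pick the backend matching the given
// name, or the first enabled one when the name is NULL.
//
//   scclNet_t* net = nullptr;
//   SCCLCHECK(scclNetInit(getenv("SCCL_NET"), net));
//   INFO(SCCL_INIT, "Using network %s", net->name);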
@@ ... @@
 #pragma once
 #include <stdint.h>
+#include <memory>
 #include "base.h"
 #include "net_utils.h"
-#include "device/net_ib.h"
-#include "host/net_socket.h"
+#include "net_socket/socket.h"
+#include "net_ib/net_ib.h"
+#include "net_socket/net_socket.h"
 namespace sccl {
 namespace hardware {
 namespace net {
-//////////////////////////////////
-typedef enum net_type : uint8_t {
-    NET_IB = 0,
-    NET_SOCKET = 1
-} net_type_t;
-//////////////////////////////////
-inline scclResult_t initNetSpecial(scclNet_t* net) {
-    int ndev;
-    // Initialize the network; return an internal error on failure
-    if(net->init() != scclSuccess)
-        return scclInternalError;
-    // Query the device count; return an internal error on failure
-    if(net->devices(&ndev) != scclSuccess)
-        return scclInternalError;
-    // No devices available: system error
-    if(ndev <= 0)
-        return scclSystemError;
-    return scclSuccess;
-}
-/**
- * Initialize a network device
- *
- * @param net pointer to the scclNet_t structure to initialize
- * @return scclResult_t:
- *         - scclSuccess: initialization succeeded
- *         - scclInternalError: network init or device query failed
- *         - scclSystemError: no usable device in the system
- */
-inline scclNet_t* initNet(net_type_t t) {
-    scclNet_t* scclNet = NULL;
-    if(t == NET_IB) {
-        if(initNetSpecial(&(device::scclNetIb)) == scclSuccess) {
-            scclNet = &(device::scclNetIb);
-        }
-    } else if(t == NET_SOCKET) {
-        if(initNetSpecial(&(host::scclNetSocket)) == scclSuccess) {
-            scclNet = &(host::scclNetSocket);
-        }
-    } else {
-        WARN("Unsupported network type.");
-    }
-    return scclNet;
-}
-////////////////////////////////////
-inline scclNet_t* scclNets[3] = {nullptr, &device::scclNetIb, &host::scclNetSocket};
+// Static pthread mutex used for thread synchronization
+static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
+//////////////////////////////////// Utility functions ////////////////////////////////////
+// Print socket information
+scclResult_t printSocketAddr(union net_socket::scclSocketAddress* sock_addr, const char* prefix);
+scclResult_t printSocketInfo(struct net_socket::scclSocket* sock, const char* prefix);
+//////////////////////////////////// Network interface ////////////////////////////////////
+// Number of network types
+constexpr int scclNetTypeNum = 3;
+// Inline array holding the available sccl network implementations
+inline scclNetBase* scclNets[] = {nullptr, new net_ib::scclNetIb(), new net_socket::scclNetSocket()};
+// Entry point that initializes the selected sccl network
+scclResult_t scclNetInit(const char* netName, scclNet_t*& scclNet);
 } // namespace net
 } // namespace hardware