Commit a4ac3320 authored by lishen

Implement ipcsocket via a thread pool to support intra-node communication

parent d9d23f34
#include "comm.h"
#include "graph.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
channel->ring.prev = channel->ring.next = -1;
channel->tree.up = -1;
channel->collnetChain.up = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->tree.down[i] = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->collnetChain.down[i] = -1;
channel->collnetDirect.out = -1;
channel->collnetDirect.headRank = -1;
channel->collnetDirect.nHeads = 0;
channel->collnetDirect.shift = 0;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.up[i] = -1;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.down[i] = -1;
int* ringIntra = graphs[SCCL_ALGO_RING]->intra + c * localRanks;
int* treeIntra = graphs[SCCL_ALGO_TREE]->intra + c * localRanks;
int* collNetIntra = graphs[SCCL_ALGO_COLLNET_CHAIN]->intra + c * localRanks;
int* nvlsIntra = graphs[SCCL_ALGO_NVLS]->intra + c * localRanks;
for(int i = 0; i < localRanks; i++) {
if(ringIntra[i] == rank) {
topoRanks->ringRecv[c] = ringIntra[0];
topoRanks->ringSend[c] = ringIntra[localRanks - 1];
channel->ring.prev = (i == 0) ? -1 : ringIntra[i - 1];
channel->ring.next = (i == localRanks - 1) ? -1 : ringIntra[i + 1];
}
if(treeIntra[i] == rank) {
int parentIndex = 0;
int child0Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
topoRanks->treeToChild1[c] = treeIntra[child1Index];
channel->tree.up = i == 0 ? -1 : treeIntra[i - 1];
channel->tree.down[0] = i == localRanks - 1 ? -1 : treeIntra[i + 1];
}
if(collNetIntra[i] == rank) {
channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i - 1];
channel->collnetChain.down[0] = i == localRanks - 1 ? -1 : collNetIntra[i + 1];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
topoRanks->nvlsHeads[c] = nvlsIntra[0];
}
// Duplicate channels rings/trees
struct scclChannel* channel0 = comm->channels;
struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
if(channel1)
memcpy(channel1, channel0, nChannels * sizeof(struct scclChannel));
return scclSuccess;
}
// Return true if `rank` appears as a complete number anywhere in s[start, end).
bool isRankHere(const char* s, int start, int end, int rank) {
if(end <= start || start < 0 || end < 0)
return false;
int num = 0;
while(start < end) {
char currChar = s[start];
if(isdigit(currChar)) {
num = num * 10 + (currChar - '0');
if(isdigit(s[start + 1])) {
start++;
continue;
}
} else if(currChar == '(' || currChar == ')') {
start++;
num = 0;
continue;
}
if(num == rank)
return true;
start++;
}
return false;
}
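// Usage sketch for isRankHere (illustration only; it assumes the treeBase
// string encoding "parent(subtree)(subtree)...", e.g. "0(1(3))(2)" for rank 0
// with children 1 and 2, and rank 3 below rank 1 -- the exact format is
// whatever the search code writes into treeGraph->treeBase):
//   const char* s = "0(1(3))(2)";
//   isRankHere(s, 0, (int)strlen(s), 3); // -> true, rank 3 appears in the span
//   isRankHere(s, 0, (int)strlen(s), 7); // -> false
// Build this rank's per-channel tree (up/down links) by parsing the base-tree
// strings stored in treeGraph->treeBase.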
scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph) {
int x = 0;
// Count how many base-tree strings are defined in treeGraph->treeBase
for(int i = 0; treeGraph->treeBase[i][0] != 0; i++) {
x = i + 1;
}
if(treeGraph->treeBase[0][0] == 0)
return scclSuccess;
int nChannels = comm->nChannels;
int localRanks = comm->topo->nodes[GPU].count;
// Build each channel's tree from one of the x base trees (round-robin)
for(int c = 0; c < nChannels; c++) {
int buff = c % x;
char tempString[SCCL_TOPO_MAX_NODES * 4];
int ko = 0;
while(treeGraph->treeBase[buff][ko] != 0) {
tempString[ko] = treeGraph->treeBase[buff][ko];
ko++;
}
tempString[ko] = 0;
int start = 0;
int curRank = comm->rank;
struct scclChannel* channel = comm->channels + c;
int end = 0;
while(tempString[end] != 0)
end++;
int parent = -1;
// construct the rank number from consecutive digits
while(start < end) {
int num = 0, num_found = 0;
start++;
while(start < end && tempString[start] != '(' && tempString[start] != ')') {
int num_here = (int)(tempString[start] - '0');
num = num * 10 + num_here;
start = start + 1;
if(tempString[start] == '(' || tempString[start] == ')' || start == end)
num_found = 1;
}
if(num_found != 0 && num == curRank) {
channel->tree.up = parent;
int depth = 0;
for(int childId = 0; childId < SCCL_MAX_TREE_ARITY; childId++) {
int or_start = start;
int child = -1;
channel->tree.down[childId] = -1;
if(or_start >= end - 1)
continue;
num = 0;
or_start++;
while(tempString[or_start] != 0 && tempString[or_start] != '(' && tempString[or_start] != ')') {
int num_here = (int)(tempString[or_start] - '0');
num = num * 10 + num_here;
or_start++;
}
child = num;
// find next child start
while(start < end) {
if(tempString[start] == '(')
depth++;
else if(tempString[start] == ')')
depth--;
if(depth == 0)
break; // next child
start++;
}
start++;
channel->tree.down[childId] = child;
// child recorded; advance to the next child subtree
}
break;
} else { // curRank not found yet: descend into the subtree that contains it
parent = num;
int start_c = start;
int end_c = start_c;
while(end_c < end) {
int depth = 0;
while(end_c < end) {
if(tempString[end_c] == '(')
depth++;
else if(tempString[end_c] == ')')
depth--;
if(depth == 0)
break; // next child
end_c++;
}
if(isRankHere(tempString, start_c, end_c, curRank)) {
start = start_c;
end = end_c;
break;
} else {
end_c++;
start_c = end_c;
}
}
}
}
}
return scclSuccess;
}
static scclResult_t connectRings(struct scclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
int nChannels = comm->nChannels;
int nNodes = comm->nNodes;
for(int c = 0; c < nChannels; c++) {
int* recv = ringRecv + c * comm->nNodes;
int* send = ringSend + c * comm->nNodes;
int* prev = ringPrev + c * comm->nRanks;
int* next = ringNext + c * comm->nRanks;
struct scclChannel* channel0 = comm->channels + c;
struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
for(int n = 0; n < nNodes; n++) {
int recvRank = recv[n];
int prevSendRank = send[(n - 1 + nNodes) % nNodes];
prev[recvRank] = prevSendRank;
if(comm->rank == recvRank) {
channel0->ring.prev = prevSendRank;
if(channel1)
channel1->ring.prev = prevSendRank;
}
int sendRank = send[n];
int nextRecvRank = recv[(n + 1) % nNodes];
next[sendRank] = nextRecvRank;
if(comm->rank == sendRank) {
channel0->ring.next = nextRecvRank;
if(channel1)
channel1->ring.next = nextRecvRank;
}
}
}
return scclSuccess;
}
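// Worked example for connectRings (illustrative): two nodes whose intra-node
// ring segments are [0,1] (node 0) and [2,3] (node 1) give recv = {0,2} and
// send = {1,3} for a channel. Stitching the segments yields prev[0] = 3,
// next[1] = 2, prev[2] = 1 and next[3] = 0, i.e. the global ring
// 0 -> 1 -> 2 -> 3 -> 0.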
static scclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
for(int n = 0; n < nNodes; n++)
indexes[n] = ranks[n];
return scclSuccess;
}
static scclResult_t setTreeUp(struct scclTree* tree, int* indexes, int u) {
if(u == -1)
return scclSuccess;
tree->up = indexes[u];
return scclSuccess;
}
static scclResult_t setTreeDown(struct scclTree* tree, int* indexes, int d) {
if(d == -1)
return scclSuccess;
int x = 0;
while(x < SCCL_MAX_TREE_ARITY && tree->down[x] >= 0)
x++;
if(x == SCCL_MAX_TREE_ARITY) {
WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
return scclInternalError;
}
tree->down[x] = indexes[d];
return scclSuccess;
}
static scclResult_t connectTrees(struct scclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int nChannels = (comm->nChannels > MAXCHANNELS / 2) ? comm->nChannels / 2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node;
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks / nNodes - 1 + log2i(nNodes);
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
int *ttp, *ttc0, *ttc1;
SCCLCHECK(scclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
if(comm->nChannels <= MAXCHANNELS / 2) {
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
struct scclChannel* channel1 = channel0 + nChannels;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
} else {
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
}
channel0->tree.depth = depth;
}
for(int c = nChannels; c < nChannels * 2; c++) {
struct scclChannel* channel1 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel1->tree.depth = depth;
}
}
return scclSuccess;
}
static scclResult_t connectCollNet(struct scclComm* comm, struct scclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nHeads = 0;
int* heads;
SCCLCHECK(scclCalloc(&heads, localRanks));
// Find all head ranks
// Head index is always 0
for(int c = 0; c < collNetGraph->nChannels; c++) {
int* collNetIntra = collNetGraph->intra + c * localRanks;
int head = collNetIntra[0];
for(int h = 0; h < nHeads; h++)
if(heads[h] == head)
head = -1;
if(head != -1)
heads[nHeads++] = collNetIntra[0];
}
// For all channels
for(int c = 0; c < comm->nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
char line[1024];
sprintf(line, "CollNet channel %d rank %d ", c, rank);
int nDown = 0;
for(int i = 0; i < nHeads; i++) {
if(rank == heads[i]) { // is head
channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
int* collNetIntra = collNetGraph->intra + i * localRanks;
sprintf(line + strlen(line), "down ");
for(int r = 0; r < localRanks; r++) {
if(collNetIntra[r] == rank)
continue;
channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
sprintf(line + strlen(line), " %d ", collNetIntra[r]);
}
sprintf(line + strlen(line), "nDown %d ", nDown);
break;
}
}
// Connect to all heads
int nUp = 0;
sprintf(line + strlen(line), "up ");
for(int h = 0; h < nHeads; h++) {
if(rank == heads[h])
continue;
channel->collnetDirect.up[nUp++] = heads[h];
sprintf(line + strlen(line), " %d ", heads[h]);
}
channel->collnetDirect.nHeads = nHeads;
channel->collnetDirect.shift = (rank % localRanks) % nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line + strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
sprintf(line + strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
INFO(SCCL_LOG_TOPO, "%s", line);
channel->collnetChain.depth = comm->nRanks / comm->nNodes;
}
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
if(channel->nvls.headRank != -1)
channel->nvls.out = comm->nRanks;
}
free(heads);
return scclSuccess;
}
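// Worked example for connectCollNet (illustrative): with localRanks = 4 and
// two collnet channels whose intra lists are [0,1,2,3] and [2,3,0,1], the
// deduplicated heads are {0,2} (nHeads = 2). Rank 0 becomes a head with
// headRank = 0, down = {1,2,3}, up = {2} and shift = (0 % 4) % 2 = 0, while
// non-head rank 1 keeps headRank = -1 and gets up = {0,2}, shift = 1.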
static scclResult_t connectNvls(struct scclComm* comm, int* nvlsHeads, struct scclTopoGraph* nvlsGraph) {
int nHeads = nvlsGraph->nChannels;
int headRank = -1;
for(int h = 0; h < nHeads; h++) {
if(nvlsGraph->intra[h * comm->localRanks] == comm->rank)
headRank = h;
}
if(nHeads == 0) {
comm->nvlsChannels = 0;
return scclSuccess;
}
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
channel->nvls.nHeads = nHeads;
for(int h = 0; h < nHeads; h++)
channel->nvls.up[h] = comm->nRanks + 1 + h;
for(int h = nHeads; h < SCCL_MAX_NVLS_ARITY; h++)
channel->nvls.up[h] = -1;
channel->nvls.down = comm->nRanks + 1 + headRank;
channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
channel->nvls.headRank = headRank;
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
channel->nvls.node = comm->node;
channel->nvls.nNodes = comm->nNodes;
}
if(comm->nNodes == 1)
return scclSuccess;
// Connect Trees
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
int pc0, pc1; // ignored
SCCLCHECK(scclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0, &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
int* heads = NULL;
int treeUp[2] = {-1, -1};
int treeDown0[2] = {-1, -1};
int treeDown1[2] = {-1, -1};
if(comm->node == 0) {
for(int h = 0; h < nHeads; h++) {
char line[1024];
sprintf(line, "NVLS Head %2d:", h);
heads = nvlsHeads + h * comm->nNodes;
for(int n = 0; n < comm->nNodes && n < 20; n++) {
sprintf(line + strlen(line), " %2d", heads[n]);
}
INFO(SCCL_INIT, "%s", line);
}
}
// Find the heads where I'm the head rank and retain tree up/down
for(int h = 0; h < nHeads; h++) {
heads = nvlsHeads + h * comm->nNodes;
if(heads[comm->node] == comm->rank) {
treeUp[0] = tree0Parent == -1 ? -1 : heads[tree0Parent];
treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
break;
}
}
// Set prev/next in all channels (NVLS compute channels work
// orthogonally to NVLS search channels).
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
channel->nvls.treeUp = treeUp[c % 2];
channel->nvls.treeDown[0] = channel->nvls.down;
int ix = 1;
if(treeDown0[c % 2] != -1)
channel->nvls.treeDown[ix++] = treeDown0[c % 2];
if(treeDown1[c % 2] != -1)
channel->nvls.treeDown[ix] = treeDown1[c % 2];
}
struct scclNvls* nvls0 = &comm->channels[0].nvls;
struct scclNvls* nvls1 = &comm->channels[1].nvls;
INFO(SCCL_LOG_TOPO,
"NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
nvls0->treeDown[0],
nvls0->treeDown[1],
comm->rank,
nvls0->treeUp,
nvls1->treeDown[0],
nvls1->treeDown[1],
comm->rank,
nvls1->treeUp);
return scclSuccess;
}
// Legacy naming
SCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
SCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
// New naming
SCCL_PARAM(MinNchannels, "MIN_NCHANNELS", 4);
SCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
int scclMinNchannels() {
int minNchannels = 2;
if(scclParamMinNrings() != -2)
minNchannels = scclParamMinNrings();
if(scclParamMinNchannels() != -2)
minNchannels = scclParamMinNchannels();
if(minNchannels > MAXCHANNELS) {
WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
minNchannels = MAXCHANNELS;
}
if(minNchannels < 0)
minNchannels = 0;
return minNchannels;
}
int scclMaxNchannels() {
int maxNchannels = MAXCHANNELS;
if(scclParamMaxNrings() != -2)
maxNchannels = scclParamMaxNrings();
if(scclParamMaxNchannels() != -2)
maxNchannels = scclParamMaxNchannels();
if(maxNchannels > MAXCHANNELS)
maxNchannels = MAXCHANNELS;
if(maxNchannels < 1) {
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
maxNchannels = 1;
}
return maxNchannels;
}
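// Precedence sketch (illustrative, assuming SCCL_PARAM reads the identically
// named SCCL_* environment variables): the legacy *_NRINGS value is applied
// first and the new *_NCHANNELS value overrides it when set. Note that
// MinNchannels defaults to 4 rather than the -2 "unset" sentinel, so the
// new-style minimum always takes effect in scclMinNchannels().
//   SCCL_MIN_NRINGS=2 SCCL_MIN_NCHANNELS=6 ./app -> scclMinNchannels() == 6
//   SCCL_MAX_NRINGS=64 ./app                     -> scclMaxNchannels() == MAXCHANNELS (32)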
static int copyChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
int nranks = comm->nRanks;
int c;
for(c = start; c < end; c++) {
memcpy(ringPrev + c * nranks, ringPrev + (c - start) * nranks, nranks * sizeof(int));
memcpy(ringNext + c * nranks, ringNext + (c - start) * nranks, nranks * sizeof(int));
memcpy(comm->channels + c, comm->channels + c - start, sizeof(struct scclChannel));
}
return c;
}
static int copyMixedChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
int nranks = comm->nRanks;
int c;
for(c = start; c < end; c++) {
memcpy(ringPrev + c * nranks, ringPrev + (c - start) * nranks, nranks * sizeof(int));
memcpy(ringNext + c * nranks, ringNext + (c - start) * nranks, nranks * sizeof(int));
memcpy(comm->channels + c, comm->channels + c - start, sizeof(struct scclChannel));
comm->channels[c].transportType = comm->mixedTransportType;
}
return c;
}
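// Usage sketch (illustrative): copyChannels(comm, 4, 8, ringPrev, ringNext)
// clones channels 0..3 and their ring prev/next tables into channels 4..7 and
// returns the new channel count, 8. copyMixedChannels does the same but also
// stamps each copy with comm->mixedTransportType.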
RCCL_PARAM(MaxMixedHylinkNChannels, "MAX_MIXED_HYLINK_NCHANNELS", 0);
RCCL_PARAM(MixedTransportType, "MIXED_TRANSPORT_TYPE", TRANSPORT_SHM);
scclResult_t scclTopoPostset(
struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
int nranks = comm->nRanks;
int nNodes = comm->nNodes;
int nChannels = comm->nChannels;
int MinNChannels = scclMinNchannels();
int MaxNChannels = scclMaxNchannels();
SCCLCHECK(scclCalloc(&ringRecv, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&ringSend, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&ringPrev, nranks * MAXCHANNELS));
SCCLCHECK(scclCalloc(&ringNext, nranks * MAXCHANNELS));
SCCLCHECK(scclCalloc(&treeToParent, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&treeToChild0, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&treeToChild1, nNodes * MAXCHANNELS));
SCCLCHECK(scclCalloc(&nvlsHeads, nNodes * MAXCHANNELS));
for(int c = 0; c < nChannels; c++) {
for(int n = 0; n < nNodes; n++) {
int r = firstRanks[n];
ringRecv[c * nNodes + n] = allTopoRanks[r]->ringRecv[c];
ringSend[c * nNodes + n] = allTopoRanks[r]->ringSend[c];
treeToParent[c * nNodes + n] = allTopoRanks[r]->treeToParent[c];
treeToChild0[c * nNodes + n] = allTopoRanks[r]->treeToChild0[c];
treeToChild1[c * nNodes + n] = allTopoRanks[r]->treeToChild1[c];
nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
}
for(int r = 0; r < nranks; r++) {
ringPrev[c * nranks + r] = allTopoRanks[r]->ringPrev[c];
ringNext[c * nranks + r] = allTopoRanks[r]->ringNext[c];
}
}
// Connect rings and trees. This should also duplicate the channels.
SCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
SCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
SCCLCHECK(connectNvls(comm, nvlsHeads, graphs[SCCL_ALGO_NVLS]));
// Duplicate ringPrev/ringNext for scclBuildRing
if(nChannels <= MAXCHANNELS / 2)
memcpy(ringPrev + nChannels * nranks, ringPrev, nChannels * nranks * sizeof(int));
if(nChannels <= MAXCHANNELS / 2)
memcpy(ringNext + nChannels * nranks, ringNext, nChannels * nranks * sizeof(int));
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_NCHANNELS") == NULL)
MinNChannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_NCHANNELS") == NULL)
MaxNChannels = 32;
#ifdef HCU_SDMA_FEATURE
// Channel-count target for the SDMA path, based on the raw channel params
int ncSdma = std::min((int)scclMaxNchannels() / comm->nChannels, nc);
ncSdma *= comm->nChannels;
#endif
// Get number of channels after duplication
nc = std::min((int)MaxNChannels / comm->nChannels, nc);
nc *= comm->nChannels;
// Duplication should be complete now
nChannels = comm->nChannels = std::min(MAXCHANNELS, (nChannels <= MAXCHANNELS / 2) ? nChannels * 2 : nChannels);
// Setup CollNet
if(comm->collNetSupport == 1) {
struct scclTopoGraph* collNetGraph = graphs[SCCL_ALGO_COLLNET_DIRECT];
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
if(collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(MAXCHANNELS, nChannels + nChannels / 2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
SCCLCHECK(connectCollNet(comm, collNetGraph));
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
if(comm->minCompCap == 90 && comm->nNodes > 1 && graphs[SCCL_ALGO_RING]->bwIntra > 45.0 && 2 * nChannels <= MAXCHANNELS) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2 * nChannels, ringPrev, ringNext);
}
// Add Hylink + PCIE double channel path
if(graphs[SCCL_ALGO_RING]->typeIntra == PATH_NVL) {
comm->nMixedHylinkChannels = std::min(MAXCHANNELS - comm->nChannels, (int)rcclParamMaxMixedHylinkNChannels());
if(comm->nMixedHylinkChannels > 0) {
INFO(SCCL_LOG_TOPO,
"<%s:%d> -----> comm->nMixedHylinkShmChannels: %d, comm->nChannels: %d\n",
__func__,
__LINE__,
comm->nMixedHylinkChannels,
comm->nChannels);
comm->mixedTransportType = std::max((int)rcclParamMixedTransportType(), TRANSPORT_SHM);
nChannels = comm->nChannels = copyMixedChannels(comm, nChannels, nChannels + comm->nMixedHylinkChannels, ringPrev, ringNext);
}
}
// Honor SCCL_MIN_NRINGS/SCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
if(checkSdmaCopyEnable(comm)) {
uint32_t sdmaChannelNum;
uint32_t maxChannels;
sdmaChannelNum = getSdmaChannelNum(comm);
if(comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
maxChannels =
sdmaChannelNum ? sdmaChannelNum : std::min(std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs)), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
} else {
nChannels = comm->nChannels = std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs);
maxChannels = sdmaChannelNum ? sdmaChannelNum : std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs));
nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
}
INFO(SCCL_INIT, "-hcugon- scclTopoPostset rank %d sdmaChannelNum %d nChannels %d", comm->rank, sdmaChannelNum, comm->nChannels);
} else {
if(comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(
comm, nChannels, std::min(std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
} else {
nChannels = comm->nChannels = std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
}
}
// Create rings array and check all is fine
SCCLCHECK(scclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
free(ringRecv);
free(ringSend);
free(ringPrev);
free(ringNext);
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
free(nvlsHeads);
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_DEVICE_H_
#define SCCL_DEVICE_H_
#include "check.h"
#include "sccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined(ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HCU_SDMA_FEATURE
#include "hsa/hsa_ext_amd.h"
#include "hsa_extra.h"
// #define HCU_PRINT_DEBUG
#endif
namespace sccl {
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#if defined(ENABLE_NPKIT) && defined(HCU_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) NpKit::CollectGpuEvent(event, size, cost, tm, scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HCU_SDMA_FEATURE
#define INIT_PRIMS_SDMA(prims, args) \
{ \
prims.rank = scclShmem.comm.rank; \
prims.useSdmaConfig = args->useSdma; \
prims.useSdmaCopy = args->useSdma && prims.sdmaQueueCtx; \
prims.preFnOps = args->preFnOps; \
prims.sdmaMinCopySize = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->minCopySize : 0; \
prims.sdmaCountEnable = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->copyCountEnable : 0; \
prims.sdmaCopyCount = 0; \
prims.allCopyCount = 0; \
}
#endif
#define SCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
typedef enum {
scclFuncBroadcast,
scclFuncReduce,
scclFuncAllGather,
scclFuncReduceScatter,
scclFuncAllReduce,
scclFuncSendRecv,
scclFuncSend,
scclFuncRecv,
scclFuncAllToAllPivot,
scclNumFuncs
} scclFunc_t;
extern const char* scclFuncStr[SCCL_NUM_FUNCTIONS + 2];
#define SCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
enum scclAlgo {
SCCL_ALGO_TREE = 0, // tree algorithm
SCCL_ALGO_RING = 1, // ring algorithm
SCCL_ALGO_COLLNET_DIRECT = 2, // direct collective-network algorithm
SCCL_ALGO_COLLNET_CHAIN = 3, // chained collective-network algorithm
SCCL_ALGO_NVLS = 4, // NVLink (NVLS) algorithm
SCCL_ALGO_NVLS_TREE = 5, // NVLink tree algorithm
};
extern const char* scclAlgoStr[SCCL_NUM_ALGORITHMS];
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define SCCL_PROTO_LL 0
#define SCCL_PROTO_LL128 1
#define SCCL_PROTO_SIMPLE 2
extern const char* scclProtoStr[SCCL_NUM_PROTOCOLS];
#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8
union scclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_SIMPLE_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define SCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define SCCL_LL_FLAG_MAX 0x100
#define SCCL_LL_FLAG(a) ((uint32_t)((a) % SCCL_LL_FLAG_MAX))
#else
#define SCCL_LL_CLEAN_MASK 0x7ffffff8
#define SCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least SCCL_STEPS
static_assert(SCCL_LL_CLEAN_MASK % SCCL_STEPS == 0, "Invalid SCCL_LL_CLEAN_MASK value");
#define SCCL_LL128_LINESIZE 64
#define SCCL_LL128_LINEELEMS (SCCL_LL128_LINESIZE / sizeof(uint64_t))
#define SCCL_LL128_DATAELEMS (SCCL_LL128_LINEELEMS - 1)
#define SCCL_LL128_MAX_NTHREADS 256
#define SCCL_LL128_ELEMS_PER_THREAD 28
#define SCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define SCCL_LL128_SHMEM_SIZE (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * SCCL_LL128_MAX_NTHREADS)
#define SCCL_DIRECT_WRITE 0x01
#define SCCL_DIRECT_READ 0x02
#define SCCL_DIRECT_NIC 0x04
#define SCCL_IPC_WRITE 0x08
#define SCCL_IPC_READ 0x10
#define SCCL_NVLS_MIN_POLL 0x20
#ifdef HCU_SDMA_FEATURE
#define SDMA_CTX_VALID_MAGIC 0xD65A
#endif
struct scclConnInfo {
// Regular comm mechanism
char* buffs[SCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
uint64_t* tail; // Local for recv, remote for send
uint64_t* head; // Local for send, remote for recv
int flags; // Direct communication / other flags
int shared; // Buffers are shared
void** ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
int* sizesFifo; // Sizes fifo from GPU to proxy
int* offsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Current GPU's HDP register
#ifdef HCU_SDMA_FEATURE
struct sdmaQueueContext* sdmaQueueCtx;
uint32_t sdmaCtxValidMagic;
#endif
};
struct scclProxyConnector {
int tpRank;
int tpLocalRank;
int sameProcess;
struct scclProxyConnection* connection;
};
struct scclConnector {
int connected;
struct scclProxyConnector proxyConn;
struct scclTransportComm* transportComm;
void* transportResources;
struct scclConnInfo conn;
};
struct scclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal sccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int index; // This rank's index in the ring
};
// The root of each tree only has one node down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY 3
struct scclTree {
int depth;
int up;
int down[SCCL_MAX_TREE_ARITY];
};
#define SCCL_MAX_DIRECT_ARITY 7
struct scclDirect {
int depth;
int out;
int nHeads; // Number of parallel N<->1<->net operations; size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[SCCL_MAX_DIRECT_ARITY];
int down[SCCL_MAX_DIRECT_ARITY];
};
#define SCCL_CONN_IDX_P2P_NET 2
#define SCCL_MAX_NVLS_ARITY 8
#define SCCL_MAX_NVLS_TREE_ARITY 3
struct scclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations; size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int up[SCCL_MAX_NVLS_ARITY];
int down;
int treeUp;
int treeDown[SCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#define SCCL_MAX_CONNS 3
struct scclChannelPeer {
struct scclConnector send[SCCL_MAX_CONNS];
struct scclConnector recv[SCCL_MAX_CONNS];
int refCount;
};
struct scclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8) /* set alignment to 8 bytes boundary */
/* scclWork is to be a power of two, currently 4x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of scclWorkElem. */
#define SCCL_WORK_SIZE 256
enum scclWorkType : uint8_t {
scclWorkTypeUnused = 0,
scclWorkTypeColl = 1,
scclWorkTypeP2p = 2,
scclWorkTypeRegColl = 3
};
enum scclWorkP2PType : uint8_t {
scclWorkP2pTypeUnused = 0,
scclWorkP2pTypeSend,
scclWorkP2pTypeRecv
};
struct scclWorkHeader {
union {
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
};
uint16_t funcIndex;
uint8_t isLast : 1; // last work for this kernel
uint8_t inFifo : 1; // is this work in the fifo
enum scclWorkType type;
};
struct scclWorkElem {
union {
uint8_t flagBits;
struct {
uint8_t isUsed : 1, redOpArgIsPtr : 1, regUsed : 1, nWarps : 5;
};
};
uint8_t direct;
uint8_t bid;
uint8_t nChannels;
struct {
uint32_t root : 28;
uint32_t preFnOps : 1;
uint32_t useSdma : 1;
uint32_t connIndex : 2;
};
const void* sendbuff;
void* recvbuff;
size_t count;
union {
size_t lastChunkSize;
// Pivot A2A kernel computes chunk size itself.
// Instead, it needs the number of bidirectional rings.
size_t pivotA2ANumBiRings;
};
uint64_t redOpArg;
uint64_t opCount;
};
static_assert((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElem))) / sizeof(scclWorkElem) == 4,
"Sanity check: SCCL_MAX_WORK_ELEMENTS == 4");
#define SCCL_MAX_WORK_ELEMENTS 1
struct scclWorkElemP2p {
struct {
int32_t peer : 26;
uint32_t preFnOps : 1;
uint32_t useSdma : 1;
uint32_t connIndex : 2;
int32_t proto : 2;
};
union {
uint16_t flagBits;
struct {
enum scclWorkP2PType p2pType : 4;
uint16_t nWarps : 4;
uint16_t warpStart : 4;
uint16_t ngroups : 4;
};
};
uint16_t opCount;
// Important not to use any fields with greater than 4-byte alignment since
// we need sizeof(scclWorkElemP2p)==28, but that would be padded up to 32 if
// there were 8-byte fields.
// void* buff;
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
// size_t count;
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
int chunkSize;
};
static_assert(((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemP2p))) / sizeof(scclWorkElemP2p)) == 8,
"Sanity check: SCCL_MAX_WORK_ELEMENTS_P2P == 8");
#define SCCL_MAX_WORK_ELEMENTS_P2P 2
struct scclWorkElemReg {
struct scclWorkElem elem;
void* dnInputs[SCCL_MAX_DIRECT_ARITY + 1];
void* dnOutputs[SCCL_MAX_DIRECT_ARITY + 1];
void* upOutputs[SCCL_MAX_DIRECT_ARITY + 1];
};
#define SCCL_MAX_WORK_ELEMENTS_REG ((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemReg))) / sizeof(scclWorkElemReg))
static_assert(SCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: SCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define SCCL_MAX_GROUPS (SCCL_MAX_NTHREADS / WARP_SIZE)
struct scclWork {
struct scclWorkHeader header;
union {
char pad[SCCL_WORK_SIZE - sizeof(struct scclWorkHeader)];
struct scclWorkElem elems[SCCL_MAX_WORK_ELEMENTS];
struct scclWorkElemP2p p2pElems[SCCL_MAX_WORK_ELEMENTS_P2P];
struct scclWorkElemReg regElems[SCCL_MAX_WORK_ELEMENTS_REG];
};
};
static_assert(sizeof(struct scclWork) == SCCL_WORK_SIZE, "Sanity check: sizeof(struct scclWork) == SCCL_WORK_SIZE");
static_assert(sizeof(struct scclWork) % 16 == 0, "Sanity check: sizeof(struct scclWork)%16 == 0");
struct scclDevChannelPeer {
// Stripped version of scclChannelPeer where we only keep the scclConnInfo
// instead of the full scclConnector.
struct scclConnInfo send[SCCL_MAX_CONNS];
struct scclConnInfo recv[SCCL_MAX_CONNS];
};
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct scclProf {
uint32_t count;
uint32_t seq; // only entry from first launch is used
struct {
uint64_t line : 16;
uint64_t timeStamp : 48;
} elem[PROFILE_NUM_ITEMS];
};
static_assert(sizeof(struct scclProf) == 256, "scclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
typedef enum {
scclCollTraceNotReady = 0,
scclCollTraceKernelLaunchType = 1,
scclCollTraceKernelEndType = 2,
scclCollTraceCollLaunchType = 3,
scclCollTraceAbortType = 4,
scclCollTraceDataType = 5,
scclCollTraceCollElemType = (1 << 4),
scclCollTraceP2pElemType = (1 << 5),
} scclCollTraceDataType_t;
struct scclCollTrace {
uint8_t type;
uint8_t bid;
int16_t funcIndex;
uint32_t data_0;
uint64_t timeStamp;
union {
uint64_t opCount;
uint32_t p2pOpCount[2];
};
union {
uint64_t data_1;
struct {
uint8_t nWarps;
uint8_t bid;
uint8_t nChannels;
} coll;
struct {
int16_t peer;
uint8_t ngroups : 4;
uint8_t connIndex : 4;
uint8_t warpStart : 4;
uint8_t nWarps : 4;
} p2p[2];
};
};
static_assert(sizeof(struct scclCollTrace) == 8 * sizeof(int), "scclCollTrace must have a pow2 size");
union scclCollTraceTail {
uint32_t tail;
char padding[4096];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HCU_SDMA_FEATURE
struct sdmaQueueContext {
hsa_sdma_info_t* sdmaInfo;
uint64_t pkgIndex;
uint32_t queueId;
uint32_t sumSdmaCopyCount;
uint32_t sumAllCopyCount;
uint32_t queueLock;
uint32_t minCopySize;
uint32_t copyCountEnable;
uint32_t sdmaQueueDepth;
uint32_t sdmaPkgLen;
uint32_t sdmaQueueLen;
};
#endif
struct alignas(16) scclDevChannel {
struct scclDevChannelPeer** peers;
struct scclRing ring;
struct scclTree tree;
struct scclTree collnetChain;
struct scclDirect collnetDirect;
struct scclTree binTree;
struct scclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};
struct scclDevComm {
int rank;
int nRanks;
int buffSizes[SCCL_NUM_PROTOCOLS];
// Operation list for aggregation
int workFifoDepth;
struct scclWork* workFifoHeap; // may be cudaHost or GDR memory
// Flag to ask SCCL kernels to abort
volatile uint32_t* abortFlag;
// Channels, device side
struct scclDevChannel* channels /*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
struct scclCollTrace* collTrace;
union scclCollTraceTail* collTraceTail;
pthread_t collTraceThread;
#endif
#ifdef ENABLE_PROFILING
struct scclProf* devProf;
#endif
#if defined(ENABLE_TIMELINE)
TimelineGpuEventContext* gpuEventContext;
#endif
#if defined(ENABLE_NPKIT) || defined(ENABLE_TIMELINE)
uint64_t* cpuTimestamp;
#endif
};
struct alignas(16) scclDevCommAndChannels {
struct scclDevComm comm;
struct scclDevChannel channels[MAXCHANNELS];
};
#ifdef __CUDA_ARCH__
#define SCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define SCCL_CUDA_ARCH 0
#endif
template <typename T>
__host__ __device__ constexpr T min_constexpr(T a) {
return a;
}
template <typename T, typename... Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts... c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template <typename T>
__host__ __device__ constexpr T max_constexpr(T a) {
return a;
}
template <typename T, typename... Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts... c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int scclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack - 1) / bytePerPack);
}
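// Worked examples (compile-time checks; the values are illustrative only):
// a 64-byte in-flight target with 16-byte packs needs ceil(64/16) = 4 unrolled
// instructions, and the result is always capped by the insns limit.
static_assert(scclCalcUnroll(16, 16, 64) == 4, "unroll = bytes/bytePerPack");
static_assert(scclCalcUnroll(8, 4, 64) == 4, "unroll capped by insns");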
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to SCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int scclCollUnroll(int cudaArch = SCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int scclNvlsUnrollBytes(int cudaArch = SCCL_CUDA_ARCH) { return 4 * 16; }
__host__ __device__ constexpr int scclNvlsUnrollInsns(int cudaArch = SCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int scclNvlsUnroll(int bytePerPack, int cudaArch = SCCL_CUDA_ARCH) {
return scclCalcUnroll(bytePerPack, scclNvlsUnrollInsns(cudaArch), scclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int scclShmemScratchWarpSize(int cudaArch = SCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */ 0,
/*LL128 */ (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * WARP_SIZE) * sizeof(uint64_t),
/*SIMPLE*/ (scclCollUnroll(cudaArch) * WARP_SIZE + 1) * 16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */ WARP_SIZE * (cudaArch >= 900 ? scclNvlsUnrollBytes(cudaArch) : 0) + 16) +
15) &
-16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int scclShmemDynamicSize(int cudaArch = SCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : scclShmemScratchWarpSize(cudaArch) * (SCCL_MAX_NTHREADS / WARP_SIZE);
}
} // namespace sccl
#endif
#ifndef SCCL_GRAPH_H_
#define SCCL_GRAPH_H_
// #include "topo_utils.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
namespace sccl {
namespace hardware {
namespace topology {
#define MAX_XGMI_INTER_GPUS 4
struct scclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
int pattern;
int crossNic;
int collNet;
int minChannels;
int maxChannels;
// Output
int nChannels;
float bwIntra;
float bwInter;
float latencyInter;
int typeIntra;
int typeInter;
int sameChannels;
int nHops;
int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES];
int inter[MAXCHANNELS * 2];
int nIntraChannels;
int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
};
struct scclTopoRanks {
int ringRecv[MAXCHANNELS];
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
// struct sccl::hardware::topology::topo::scclTopoSystem;
// Sort the system topology
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
// Print the system topology
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Compute the paths in the system
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
// // Free the system topology
// void scclTopoFree(struct scclTopoSystem* system);
// // Trim the system topology
// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
// // Compute the point-to-point channels
// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
// // Get the NVB GPU info for the given rank
// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
// // Check whether all paths in the system go through NVLink
// int scclTopoPathAllNVLink(struct scclTopoSystem* system);
// // Get network device info
// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
// // Check whether a point-to-point connection exists between two devices
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
// // Check whether GDR is used
// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// // Get intra-node network device info
// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
// // Get the link type between two CUDA devices
// scclResult_t scclTopoGetLinkType(
// struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
// // Check whether a flush is needed
// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
// // Check whether two devices are on the same network
// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
// // Check whether PXN is disabled
// int scclPxnDisable(struct scclComm* comm);
// // Get the intermediate ranks for PXN
// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
// // Get the local rank within the node
// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // Get the CPU affinity
// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// // Get CPU type info
// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// // Get the GPU count
// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// // Get the NVSwitch count
// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// // Get local network device info
// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// // Get the local GPU index
// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// // Initialize the search; must run before calling scclTopoCompute
// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
// // Compute the topology graph
// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Print the topology graph
// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Dump the topology graphs
// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
// // Preset the topology graphs
// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
// // Postset the topology graphs
// scclResult_t scclTopoPostset(
// struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
// // Tree-base postset of the topology graph
// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
// // Tune the model to the compute capability
// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);
// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
// #include "info.h"
// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
#include "channel.h"
#include "xml.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
struct scclTopoNodeList {
struct scclTopoNode* list[SCCL_TOPO_MAX_NODES];
int count;
};
static scclResult_t getPath(struct scclTopoSystem* system, struct scclTopoNode* node, int t, int64_t id, struct scclTopoLinkList** path) {
for(int i = 0; i < system->nodes[t].count; i++) {
if(system->nodes[t].nodes[i].id == id) {
*path = node->paths[t] + i;
return scclSuccess;
}
}
WARN("Could not find node of type %d id %lx", t, id);
return scclInternalError;
}
static scclResult_t scclTopoSetPaths(struct scclTopoNode* baseNode, struct scclTopoSystem* system) {
if(baseNode->paths[baseNode->type] == NULL) {
SCCLCHECK(scclCalloc(baseNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
// breadth-first search to set all paths to that node in the system
struct scclTopoNodeList nodeList;
struct scclTopoNodeList nextNodeList;
nodeList.count = 1;
nodeList.list[0] = baseNode;
nextNodeList.count = 0;
struct scclTopoLinkList* basePath;
SCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->bw = LOC_BW;
basePath->type = PATH_LOC;
while(nodeList.count) {
nextNodeList.count = 0;
for(int n = 0; n < nodeList.count; n++) {
struct scclTopoNode* node = nodeList.list[n];
struct scclTopoLinkList* path;
SCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
for(int l = 0; l < node->nlinks; l++) {
struct scclTopoLink* link = node->links + l;
struct scclTopoNode* remNode = link->remNode;
if(remNode->paths[baseNode->type] == NULL) {
SCCLCHECK(scclCalloc(remNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
struct scclTopoLinkList* remPath;
SCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
float bw = std::min(path->bw, link->bw);
// allow routing through a GPU only as 1 hop
if(node != baseNode && node->type == GPU && (link->type != LINK_NVL || remNode->type != GPU || path->count > 1))
continue;
if((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
// Find reverse link
for(int l = 0; l < remNode->nlinks; l++) {
if(remNode->links[l].remNode == node) {
remPath->list[0] = remNode->links + l;
break;
}
}
if(remPath->list[0] == NULL) {
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
remNode->type,
remNode->id,
remNode->nlinks,
node->type,
node->id);
return scclInternalError;
}
// Copy the rest of the path
for(int i = 0; i < path->count; i++)
remPath->list[i + 1] = path->list[i];
remPath->count = path->count + 1;
remPath->bw = bw;
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Differentiate between one and multiple PCI switches
if(node->type == PCI && remNode->type == PCI)
type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if(link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU))
type = PATH_PHB;
// Set 1 hop NVLink as NVB
// if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
// Disallow GPUs as intermediate steps for now
if(remNode->type != GPU) {
int i;
for(i = 0; i < nextNodeList.count; i++)
if(nextNodeList.list[i] == remNode)
break;
if(i == nextNodeList.count)
nextNodeList.list[nextNodeList.count++] = remNode;
}
}
}
}
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
}
return scclSuccess;
}
/**
 * Print a node's path information.
 *
 * @param system pointer to the topology system
 * @param node   pointer to the node whose paths are printed
 *
 * Outputs the given node's path information, including the path type,
 * target node id, hop count, bandwidth and path-type string, formatted
 * as a single line.
 */
static void printNodePaths(struct scclTopoSystem* system, struct scclTopoNode* node) {
char line[1024];
sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
int offset = strlen(line);
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
if(node->paths[t] == NULL)
continue;
for(int n = 0; n < system->nodes[t].count; n++) {
sprintf(line + offset,
"%s/%lX (%d/%f/%s) ",
topoNodeTypeStr[t],
system->nodes[t].nodes[n].id,
node->paths[t][n].count,
node->paths[t][n].bw,
topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
}
}
INFO(SCCL_LOG_TOPO, "%s", line);
}
static scclResult_t getLocalCpu(struct scclTopoSystem* system, int gpu, int* retCpu) {
// Find the closest CPU to a GPU
int minHops = 0;
int localCpu = -1;
struct scclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
for(int c = 0; c < system->nodes[CPU].count; c++) {
int hops = paths[c].count;
if(minHops == 0 || hops < minHops) {
localCpu = c;
minHops = hops;
}
}
if(localCpu == -1) {
WARN("Error : could not find CPU close to GPU %d", gpu);
return scclInternalError;
}
*retCpu = localCpu;
return scclSuccess;
}
static scclResult_t addInterStep(struct scclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
struct scclTopoNode* cpuNode = system->nodes[tx].nodes + ix;
struct scclTopoNode* srcNode = system->nodes[t1].nodes + i1;
int l = 0;
// Node 1 -> CPU
for(int i = 0; i < srcNode->paths[tx][ix].count; i++)
srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
// CPU -> Node 2
for(int i = 0; i < cpuNode->paths[t2][i2].count; i++)
srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
// Update path characteristics
srcNode->paths[t2][i2].count = l;
srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
if(tx == GPU)
srcNode->paths[t2][i2].type = PATH_PXN;
srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
return scclSuccess;
}
// Remove/free paths for a given type
static void scclTopoRemovePathType(struct scclTopoSystem* system, int nodeType) {
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
// Remove links _to_ the given type
for(int n = 0; n < system->nodes[t].count; n++) {
struct scclTopoNode* node = system->nodes[t].nodes + n;
free(node->paths[nodeType]);
node->paths[nodeType] = NULL;
}
// Remove links _from_ the given type
for(int n = 0; n < system->nodes[nodeType].count; n++) {
struct scclTopoNode* node = system->nodes[nodeType].nodes + n;
free(node->paths[t]);
node->paths[t] = NULL;
}
}
}
static const int levelsOldToNew[] = {PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS};
scclResult_t scclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
if(*level == -1) {
int l = -1;
if(disableEnv) {
char* str = getenv(disableEnv);
if(str) {
int disable = strtol(str, NULL, 0);
if(disable == 1)
l = 0;
}
}
if(l == -1) {
char* str = getenv(levelEnv);
if(str) {
for(int i = 0; i <= PATH_SYS; i++) {
if(strcmp(str, topoPathTypeStr[i]) == 0) {
l = i;
break;
}
}
// Old style numbering
// levelsOldToNew is an array with each index corresponding to the
// "old level" int, and each value mapping to the correct value defined in topo.h
// maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
if(l == -1 && str[0] >= '0' && str[0] <= '9') {
int oldLevel = strtol(str, NULL, 0);
const int maxOldLevel = sizeof(levelsOldToNew) / sizeof(int) - 1;
if(oldLevel > maxOldLevel)
oldLevel = maxOldLevel;
l = levelsOldToNew[oldLevel];
}
}
}
if(l >= 0)
INFO(SCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
*level = l >= 0 ? l : -2;
}
return scclSuccess;
}
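// Usage sketch (illustrative), e.g. for the SCCL_NET_GDR_LEVEL lookup below:
//   SCCL_NET_GDR_LEVEL=PHB -> *level = PATH_PHB (matched against topoPathTypeStr)
//   SCCL_NET_GDR_LEVEL=3   -> *level = levelsOldToNew[3] = PATH_PHB (old-style numbering)
// With neither a disable flag nor a recognized value, *level is left at -2,
// which callers treat as "no user override".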
SCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
int scclTopoUserGdrLevel = -1;
scclResult_t scclTopoCheckGdr(struct scclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
*useGdr = 0;
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
struct scclTopoNode* net = system->nodes[NET].nodes + n;
SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
// Check that both the NIC and GPUs support it
if(net->net.gdrSupport == 0)
return scclSuccess;
if(gpu->gpu.gdrSupport == 0)
return scclSuccess;
if(read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = scclParamNetGdrRead();
if(gdrReadParam == 0)
return scclSuccess;
if(gdrReadParam < 0) {
int nvlink = 0;
// Since we don't know whether there are other communicators,
// it's better to keep things local if we have a single GPU.
if(system->nodes[GPU].count == 1)
nvlink = 1;
for(int i = 0; i < system->nodes[GPU].count; i++) {
if(i == g)
continue;
if(gpu->paths[GPU][i].type == PATH_NVL) {
nvlink = 1;
break;
}
}
if(!nvlink)
return scclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
SCCLCHECK(scclGetLevel(&scclTopoUserGdrLevel, NULL, "SCCL_NET_GDR_LEVEL"));
if(scclTopoUserGdrLevel != -2)
netGdrLevel = scclTopoUserGdrLevel;
else {
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
int i, d1 = -1, d2 = -1;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[GPU].nodes[g].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d1 = system->nodes[CPU].nodes[i].id;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[NET].nodes[n].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d2 = system->nodes[CPU].nodes[i].id;
if(d1 != -1 && d2 != -1 && d1 == d2 && (system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000)) {
netGdrLevel = PATH_PHB;
}
}
}
int distance = gpu->paths[NET][n].type;
if(distance == PATH_PXN) {
// In case of PXN, use the intermediate GPU distance instead
int proxyRank, g;
SCCLCHECK(scclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
SCCLCHECK(scclTopoRankToIndex(system, proxyRank, &g));
struct scclTopoNode* proxyGpu = system->nodes[GPU].nodes + g;
distance = proxyGpu->paths[NET][n].type;
}
if(distance > netGdrLevel) {
INFO(SCCL_NET, "GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
return scclSuccess;
}
*useGdr = 1;
INFO(SCCL_NET, "GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
return scclSuccess;
}
// Set to 0 to disable the flush on Hopper when using GDR
SCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
// Determine whether we need to flush the GDR recv buffers
scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush) {
int g;
SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
// Flush is required on Ampere and earlier
*flush = gpu->gpu.cudaCompCap < 90 ? 1 : scclParamNetForceFlush();
return scclSuccess;
}
SCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
// Check whether going through the network would be faster than going through P2P/SHM.
scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
if(scclParamNetDisableIntra() == 1) {
*net = 0;
return scclSuccess;
}
*net = 1;
// First check the current GPU-to-GPU speed.
int g1, g2;
if(scclTopoIdToIndex(system, GPU, id1, &g1) != scclSuccess || scclTopoIdToIndex(system, GPU, id2, &g2) != scclSuccess) {
return scclSuccess;
}
struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
float speed = gpu1->paths[GPU][g2].bw;
// Now check the speed at which each GPU can reach the network through PXB or better
float netSpeed1 = 0, netSpeed2 = 0;
for(int n = 0; n < system->nodes[NET].count; n++) {
struct scclTopoLinkList* path = gpu1->paths[NET] + n;
if(path->type <= PATH_PXB && path->bw > netSpeed1)
netSpeed1 = path->bw;
path = gpu2->paths[NET] + n;
if(path->type <= PATH_PXB && path->bw > netSpeed2)
netSpeed2 = path->bw;
}
if(netSpeed1 > speed && netSpeed2 > speed)
return scclSuccess;
*net = 0;
return scclSuccess;
}
scclResult_t scclTopoGetIntermediateRank(struct scclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
SCCLCHECK(scclTopoRankToIndex(system, rank, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
struct scclTopoLinkList* path = gpu->paths[NET] + n;
if(path->type == PATH_PXN) {
struct scclTopoNode* node;
int type = NVS;
for(int i = 0; i < path->count && type == NVS; i++) {
node = path->list[i]->remNode;
type = node->type;
}
if(type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return scclInternalError;
}
*intermediateRank = node->gpu.rank;
} else {
*intermediateRank = rank;
}
return scclSuccess;
}
SCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
// Net v4 plugins don't have non-blocking connect/accept, so we can't use
// remote proxies without risking deadlocks.
int scclPxnDisable(struct scclComm* comm) {
static int pxnDisable = -1;
if(pxnDisable == -1) {
if(comm && scclNetVersion(comm) == 4) {
INFO(SCCL_INIT, "PXN Disabled as plugin is v4");
pxnDisable = 1;
} else {
pxnDisable = scclParamPxnDisable();
}
}
return pxnDisable;
}
scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks) {
struct scclTopoSystem* system = comm->topo;
*nranks = 0;
*intermediateRanks = NULL;
if(system->nodes[NET].count == 0)
return scclSuccess;
int nr = 0;
int* ranks = NULL;
for(int rank = 0; rank < comm->nRanks; rank++) {
int netDev, proxyRank;
SCCLCHECK(scclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
if(proxyRank == comm->rank)
continue;
int useGdr;
SCCLCHECK(scclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
if(useGdr == 0)
continue;
int found = 0;
for(int r = 0; r < nr; r++) {
if(ranks[r] == proxyRank)
found = 1;
}
if(!found) {
SCCLCHECK(scclRealloc(&ranks, nr, nr + 1));
ranks[nr++] = proxyRank;
}
}
*nranks = nr;
*intermediateRanks = ranks;
return scclSuccess;
}
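// Check whether every GPU has a NIC at the given bus-id offset (in either
// direction); only if that holds for all GPUs, force those GPU<->NIC paths
// to PATH_PXB and return true.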
static bool rcclPathOverride(struct scclTopoSystem* system, uint64_t distance) {
int i, j;
for(i = 0; i < system->nodes[GPU].count; i++) {
for(j = 0; j < system->nodes[NET].count; j++) {
if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
(system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
break;
}
if(j >= system->nodes[NET].count)
break;
}
if(i >= system->nodes[GPU].count) {
for(i = 0; i < system->nodes[GPU].count; i++) {
for(j = 0; j < system->nodes[NET].count; j++) {
if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
(system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
system->nodes[GPU].nodes[i].paths[NET][j].type = PATH_PXB;
}
}
return true;
} else {
return false;
}
}
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);
scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm) {
int* domains;
int64_t* ids;
SCCLCHECK(scclCalloc(&domains, system->nodes[GPU].count));
SCCLCHECK(scclCalloc(&ids, system->nodes[GPU].count));
int myDomain = 0;
for(int g = 0; g < system->nodes[GPU].count; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
domains[g] = g;
ids[g] = gpu->id;
for(int p = 0; p < g; p++) {
if(gpu->paths[GPU][p].type < PATH_NET) {
domains[g] = std::min(domains[g], domains[p]);
}
}
if(gpu->gpu.rank == comm->rank)
myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
for(int i = 0; i < ngpus; i++) {
if(domains[i] == myDomain)
continue;
struct scclTopoNode* gpu = NULL;
int g;
for(g = 0; g < system->nodes[GPU].count /* This one varies over the loops */; g++) {
gpu = system->nodes[GPU].nodes + g;
if(gpu->id == ids[i])
break;
else
gpu = NULL;
}
if(gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return scclInternalError;
}
SCCLCHECK(scclTopoRemoveNode(system, GPU, g));
}
// Trim lower-speed ports on the same NIC (same ASIC): zero out their
// bandwidth so they are removed below.
for(int i = 0; i < system->nodes[NET].count; i++) {
for(int j = 0; j < system->nodes[NET].count; j++) {
if(i == j)
continue;
if(system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
if(system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw)
system->nodes[NET].nodes[j].net.bw = 0;
}
}
}
do {
int n;
for(n = 0; n < system->nodes[NET].count; n++) {
if(system->nodes[NET].nodes[n].net.bw == 0)
break;
}
if(n < system->nodes[NET].count) {
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
} else
break;
} while(system->nodes[NET].count);
int remove = 1;
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for(int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for(int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if(i == j)
continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
SCCLCHECK(scclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if(allXgmi)
system->type |= RCCL_TOPO_XGMI_ALL;
for(int g = 0; g < system->nodes[GPU].count; g++) {
int net;
SCCLCHECK(scclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if(!gdr)
break;
}
if(gdr && !allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(SCCL_LOG_TOPO, "GDR is available on all GPUs");
}
// Special handling of gfx94x
if(rcclParamEnableIntranet() == 1 || (rcclParamEnableIntranet() == -2 && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
system->nodes[GPU].count == 8 && system->nodes[NET].count == 8)) {
remove = 0;
system->type |= RCCL_TOPO_FORCE_INTRA;
}
comm->localRanks = system->nodes[GPU].count;
if(system->nodes[GPU].count == comm->nRanks && remove) {
for(int n = system->nodes[NET].count - 1; n >= 0; n--)
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return scclSuccess;
}
void scclTopoFree(struct scclTopoSystem* system) {
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
scclTopoRemovePathType(system, t);
free(system);
}
SCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
SCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", 4);
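// Estimate how many channels to use toward peerRank from local GPU g:
// -1 for self, a bandwidth-scaled count for XGMI-connected local peers,
// 2 for other local peers, and NCHANNELS_PER_NET_PEER for remote ranks.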
static scclResult_t scclTopoGetNchannels(struct scclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
int peer;
struct scclTopoLinkList* path = NULL;
if(scclTopoRankToIndex(system, peerRank, &peer) == scclSuccess) {
// Same rank
if(g == peer) {
*nChannels = -1;
return scclSuccess;
}
// Local rank
path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
if(path->type == PATH_NVL) {
float nvlBw = scclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
*nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2) * std::max(1, (int)(path->bw / nvlBw));
} else {
*nChannels = 2;
}
} else {
// Remote rank, use network
*nChannels = scclParamNChannelsPerNetPeer();
}
return scclSuccess;
}
SCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 4);
SCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
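// Round v up to the next power of two, e.g. nextPow2(5) == 8, nextPow2(8) == 8.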
static int nextPow2(int v) {
int pow2 = 1;
while(pow2 < v)
pow2 <<= 1;
return pow2;
}
scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
int MinP2pNchannels = (int)scclParamMinP2pNChannels();
int MaxP2pNchannels = (int)scclParamMaxP2pNChannels();
int NchannelsPerPeer = (int)scclParamNChannelsPerPeer();
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_P2P_NCHANNELS") == NULL)
MinP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_P2P_NCHANNELS") == NULL)
MaxP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_NCHANNELS_PER_PEER") == NULL)
NchannelsPerPeer = 32;
int scclMinP2pNchannels = MinP2pNchannels;
if(comm->sharedRes->owner != comm) {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, scclMinP2pNchannels), comm->sharedRes->tpP2pNChannels);
} else {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::max(comm->p2pnChannels, scclMinP2pNchannels);
}
int minChannels = comm->p2pnChannels;
// We need to loop through all local GPUs to have a global picture
for(int g = 0; g < comm->topo->nodes[GPU].count; g++) {
for(int r = 0; r < comm->nRanks; r++) {
int nChannels;
SCCLCHECK(scclTopoGetNchannels(comm->topo, g, r, &nChannels));
if(nChannels >= 0)
minChannels = std::min(minChannels, nChannels);
}
}
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));
// Round to next pow2 nChannelsPerPeer and nChannels
if(getNumaMaxGpus() == 1 && !scclTopoPathAllNVLink(comm->topo)) {
comm->p2pnChannelsPerPeer = nextPow2(comm->p2pnChannels);
} else {
comm->p2pnChannelsPerPeer = (NchannelsPerPeer == -2 ? nextPow2(minChannels) : NchannelsPerPeer);
}
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
// Init channels that weren't used so far
for(int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++)
SCCLCHECK(initChannel(comm, c));
// We want to spread out the channels we use when there aren't many, and
// progressively fill the whole nChannels space. To do so, we mirror the
// bits in the nChannels space.
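// For example, with p2pnChannels == 8: c=1 (0b001) -> 4 (0b100),
// c=2 (0b010) -> 2 (0b010), c=3 (0b011) -> 6 (0b110).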
for(int c = 0; c < comm->p2pnChannels; c++) {
int mirror = 0;
for(int b = 1, mb = (comm->p2pnChannels >> 1); b < comm->p2pnChannels; b <<= 1, mb >>= 1)
if(c & b)
mirror |= mb;
comm->p2pChannels[c] = mirror;
}
return scclSuccess;
}
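// Collect the ranks of all GPUs that this rank reaches over an NVB path
// (NVLink via an intermediate GPU).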
scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks) {
int ngpus = system->nodes[GPU].count;
SCCLCHECK(scclCalloc(ranks, ngpus));
int nvbGpus = 0;
for(int g = 0; g < ngpus; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
if(gpu->gpu.rank != rank)
continue;
for(int p = 0; p < ngpus; p++) {
if(gpu->paths[GPU][p].type == PATH_NVB) {
(*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
}
}
}
*nranks = nvbGpus;
return scclSuccess;
}
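// Note: this keys off the *minimum* path type across all GPU pairs, so it
// returns 1 as soon as any pair is connected at better than PATH_PIX.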
int scclTopoPathAllNVLink(struct scclTopoSystem* system) {
int minPath = PATH_DIS;
for(int i = 0; i < system->nodes[GPU].count; i++) {
struct scclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
for(int j = 0; j < system->nodes[GPU].count; j++) {
if(i == j)
continue;
minPath = std::min(minPath, paths[j].type);
}
}
return minPath >= PATH_PIX ? 0 : 1;
}
} // namespace graph
scclResult_t scclTopoPrintPaths(struct scclTopoSystem* system) {
for(int i = 0; i < system->nodes[GPU].count; i++) {
graph::printNodePaths(system, system->nodes[GPU].nodes + i);
}
for(int i = 0; i < system->nodes[NET].count; i++) {
graph::printNodePaths(system, system->nodes[NET].nodes + i);
}
return scclSuccess;
}
int scclTopoUserP2pLevel = -1;
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank) {
*p2p = 0;
if(read)
*read = 0;
if(intermediateRank)
*intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
SCCLCHECK(scclTopoIdToIndex(system, GPU, id1, &g1));
struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
if(scclTopoIdToIndex(system, GPU, id2, &g2) == scclInternalError) {
// GPU not found, we can't use p2p.
return scclSuccess;
}
int intermediateIndex = -1;
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct scclTopoLinkList* path = gpu1->paths[GPU] + g2;
if(path->count == 2) {
struct scclTopoNode* intermediateNode = path->list[0]->remNode;
if(intermediateNode->type == GPU) {
intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
if(intermediateRank)
*intermediateRank = intermediateNode->gpu.rank;
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
// User override
if(scclTopoUserP2pLevel == -1)
SCCLCHECK(scclGetLevel(&scclTopoUserP2pLevel, "SCCL_P2P_DISABLE", "SCCL_P2P_LEVEL"));
if(scclTopoUserP2pLevel != -2) {
p2pLevel = scclTopoUserP2pLevel;
goto compare;
}
// Don't use P2P through the CPU on ARM, Intel, or Zhaoxin hosts: cap the
// level at PXB.
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_ARM)
p2pLevel = PATH_PXB;
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
p2pLevel = PATH_PXB;
}
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
if(path->type <= p2pLevel)
*p2p = 1;
if(path->type == PATH_NVL) {
struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
// Enable P2P Read for Ampere/NVLink only
if(read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80))
*read = 1;
}
return scclSuccess;
}
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm) {
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
graph::scclTopoRemovePathType(system, t);
// Set direct paths to CPUs. We need them in many cases.
for(int c = 0; c < system->nodes[CPU].count; c++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[CPU].nodes + c, system));
}
// Set direct paths to GPUs.
for(int g = 0; g < system->nodes[GPU].count; g++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[GPU].nodes + g, system));
}
// Set direct paths to NICs.
for(int n = 0; n < system->nodes[NET].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NET].nodes + n, system));
}
// Set direct paths to NVSwitches.
for(int n = 0; n < system->nodes[NVS].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NVS].nodes + n, system));
}
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
for(int g = 0; g < system->nodes[GPU].count; g++) {
for(int p = 0; p < system->nodes[GPU].count; p++) {
int p2p;
SCCLCHECK(scclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
if(p2p == 0) {
// Divert all traffic through the CPU
int cpu;
SCCLCHECK(getLocalCpu(system, g, &cpu));
SCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
if(comm == NULL)
continue;
// Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
struct scclPeerInfo* dstInfo = comm->peerInfo + system->nodes[GPU].nodes[g].gpu.rank;
for(int p = 0; p < system->nodes[GPU].count; p++) {
if(p == g)
continue;
struct scclPeerInfo* srcInfo = comm->peerInfo + system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
SCCLCHECK(scclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if(p2p == 0) {
int shm;
SCCLCHECK(scclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
if(shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
}
}
}
}
// Special handling of gfx94x: on Hyper-V guests (the BIOS check is skipped
// when TOPO_EXPL is defined), override GPU<->NIC paths to PXB.
#if !defined(TOPO_EXPL)
char strValue[1024];
SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
if(strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
(system->nodes[GPU].count != system->nRanks))) {
if(!rcclPathOverride(system, 0x100000))
rcclPathOverride(system, 0x1000);
}
#if !defined(TOPO_EXPL)
}
#endif
// Update paths for NICs (no GPU Direct, PXN, ...)
for(int n = 0; n < system->nodes[NET].count; n++) {
struct scclTopoNode* netNode = system->nodes[NET].nodes + n;
for(int g = 0; g < system->nodes[GPU].count; g++) {
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
if(scclPxnDisable(comm) != 1) {
int localGpuIndex;
SCCLCHECK(scclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
if(localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct scclTopoNode* peerNode = system->nodes[GPU].nodes + localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if(peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
SCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
// Update the path when we don't want to / can't use GPU Direct RDMA.
int gdr;
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
if(gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
SCCLCHECK(getLocalCpu(system, g, &localCpu));
SCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
SCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
return scclSuccess;
}
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN + 5 * MAXWIDTH)
void dumpLine(int* values, int nranks, const char* prefix) {
int prefixlen = strlen(prefix);
char line[STRLENGTH + 1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
for(int i = 0; i < nranks && i < MAXWIDTH; i++)
sprintf(line + prefixlen + 4 * i, " %3d", values[i]);
INFO(SCCL_INIT, "%s", line);
}
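// Build, for each channel, the full ring order starting from this rank by
// following the next[] chain, then verify that the ring loops back to the
// start and contains every rank.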
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for(int r = 0; r < nrings; r++) {
char prefix[40];
/*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
dumpLine(next+r*nranks, nranks, prefix);*/
int current = rank;
for(int i = 0; i < nranks; i++) {
rings[r * nranks + i] = current;
current = next[r * nranks + current];
}
sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
if(rank == 0)
dumpLine(rings + r * nranks, nranks, prefix);
if(current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
return scclInternalError;
}
// Check that all ranks are there
for(int i = 0; i < nranks; i++) {
int found = 0;
for(int j = 0; j < nranks; j++) {
if(rings[r * nranks + j] == i) {
found = 1;
break;
}
}
if(found == 0) {
WARN("Error : ring %d does not contain rank %d", r, i);
return scclInternalError;
}
}
}
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include <algorithm>
#include <string.h>
#include "rome_models.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
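// Pre-tuned topology models for known Rome-class platforms, matched by GPU/NIC
// bus ids and NUMA placement. Field encodings (as used by the tables below):
// - connMatrix: flattened nGpus x nGpus matrix; entry [i][j] is the number of
//   direct links between GPU i and GPU j (0 = no direct link).
// - gdrLevel:   flattened nGpus x nNics matrix of PATH_* types between each
//   GPU and each NIC.
// - pattern:    two characters per CPU node, <#GPUs><#NICs> attached to it;
//   e.g. "10302120" = four nodes with 1/0, 3/0, 2/1 and 2/0 GPUs/NICs
//   (counts above 9 apparently continue past '9' in ASCII: '@' = 16).
// - ringBase:   '|'-separated rings, each an ordered list of GPU indices;
//   "N<i>" marks where NIC i enters/exits the ring.
// - treeBase:   optional '|'-separated trees in nested-parentheses form.
// - options:    comma-separated key=value tuning flags.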
struct scclRomeModel {
int nGpus;
int nCpus;
int nNics;
int nLinks;
int64_t gpuIds[SCCL_TOPO_MAX_NODES];
int64_t nicIds[SCCL_TOPO_MAX_NODES];
int64_t gpuNuma[SCCL_TOPO_MAX_NODES];
int64_t nicNuma[SCCL_TOPO_MAX_NODES];
uint8_t connMatrix[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES];
uint8_t gdrLevel[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES];
const char* pattern;
const char* ringBase;
const char* options;
const char* treeBase;
};
static struct scclRomeModel rome_model_22 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 2,
.gpuIds =
{
0x3000,
0x43000,
0x26000,
0xc3000,
0x83000,
0x23000,
0xc6000,
0xa3000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
0,
1,
2,
3,
1,
2,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
.options = "",
};
static struct scclRomeModel rome_model_25 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
.options = "",
};
static struct scclRomeModel rome_model_27 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_29 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
.options = "",
};
static struct scclRomeModel rome_model_31 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
.options = "",
};
static struct scclRomeModel rome_model_33 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
.options = "",
};
static struct scclRomeModel rome_model_30 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5",
.options = "",
};
static struct scclRomeModel rome_model_32 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_24 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6",
.options = "",
};
static struct scclRomeModel rome_model_26 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_23 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1",
.options = "",
};
static struct scclRomeModel rome_model_38 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6",
.options = "",
};
static struct scclRomeModel rome_model_28 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
.options = "",
};
static struct scclRomeModel rome_model_40 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
.options = "",
};
static struct scclRomeModel rome_model_42 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
.options = "",
};
static struct scclRomeModel rome_model_44 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds =
{
0xc4000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "20202120",
.ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
.options = "",
};
static struct scclRomeModel rome_model_45 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "",
};
static struct scclRomeModel rome_model_46 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_48 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
.options = "",
};
static struct scclRomeModel rome_model_49 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
0,
1,
2,
3,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
static struct scclRomeModel rome_model_52 = {
.nGpus = 8,
.nCpus = 1,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc5000,
0xc9000,
0xcd000,
0xd1000,
0xd5000,
0xd9000,
0xdd000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "80",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_53 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
1,
1,
3,
3,
5,
5,
7,
7,
},
.nicNuma =
{
1,
3,
5,
7,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
static struct scclRomeModel rome_model_43 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 "
"7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "treeDefined=1",
.treeBase =
"(2(5(6(7(4))))(3(0(1))))|(2(5(7(6(4))))(0(1(3))))|(2(5(7(4(6))))(1(3(0))))|(6(1(0(2(3))))(7(4(5))))|(6(1(2(0(3))))(4(5(7))))|(6(1(0(3(2))))(5(7(4))))|"
"(1(6(7(5(4))))(2(3(0))))|(1(6(4(7(5))))(3(2(0))))|(1(6(5(4(7))))(3(0(2))))|(5(2(3(1(0))))(4(6(7))))|(5(2(0(3(1))))(6(4(7))))|(5(2(1(0(3))))(4(7(6))))",
};
static struct scclRomeModel rome_model_55 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x100000,
0x200000,
0x300000,
0x400000,
0x500000,
0x600000,
0x700000,
0x800000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|2 3 0 1 6 7 4 5|5 4 7 6 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_56 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 0,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel = {},
.pattern = "40404040",
.ringBase = "0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4|0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4|0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1|4 5 13 12 8 9 11 10 14 15 7 "
"6 2 3 1 0|4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0|1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0",
.options = "pivotA2AEnabled=1,pivotA2ANumBiRings=3,tuning=1,mscclEnabled=1,treeDefined=1",
.treeBase = "(0(1(3(2(6(7(15(14(10))))))))(4(5(13(12(8(9(11))))))))|(2(3(7(6(13(12(8(9(10))))))))(1(0(4(5(14(15(11))))))))|(14(15(11(10(8(9(13(12(4))))))))"
"(6(7(3(2(0(1(5))))))))|(10(11(9(8(12(13(5(4(0))))))))(14(15(7(6(2(3(1))))))))|(10(11(15(14(5(4(0(1(2))))))))(9(8(12(13(6(7(3))))))))|(4(5(1(0("
"2(3(7(6(14))))))))(12(13(9(8(10(11(15))))))))|(6(7(15(14(10(11(9(8(12))))))))(2(3(1(0(4(5(13))))))))|(13(12(8(9(10(11(15(14(5))))))))(6(7(3(2("
"1(0(4))))))))|(8(9(13(12(4(5(1(0(2))))))))(10(11(15(14(6(7(3))))))))|(12(13(5(4(0(1(3(2(6))))))))(8(9(11(10(14(15(7))))))))|(5(4(0(1(2(3(7(6("
"13))))))))(14(15(11(10(9(8(12))))))))|(2(3(7(6(14(15(11(10(8))))))))(0(1(5(4(12(13(9))))))))",
};
static struct scclRomeModel rome_model_58 = {
.nGpus = 8,
.nCpus = 3,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "402020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_59 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
static struct scclRomeModel rome_model_62 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_63 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_65 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
static struct scclRomeModel rome_model_66 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds = {},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 6 7 5 4 2 3 1|1 3 2 4 5 7 6 0|0 1 7 6 2 3 5 4|4 5 3 2 6 7 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_67 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa1000,
0xa2000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
3,
3,
},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "4242",
.ringBase = "N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N1 2 3 5 4 0 1 7 6 N3|N2 4 5 3 2 6 7 1 0 N0|N1 3 2 4 5 7 6 0 1 N0|N0 1 0 6 7 5 4 2 3 N1|N0 0 1 7 "
"6 2 3 5 4 N2|N3 6 7 1 0 4 5 3 2 N1",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_68 = {
.nGpus = 16,
.nCpus = 1,
.nNics = 16,
.nLinks = 3,
.gpuIds =
{
0xcf000,
0xd4000,
0xd5000,
0xd6000,
0xd0000,
0xd1000,
0xd2000,
0xd3000,
0xf0000,
0xf1000,
0xf2000,
0xf3000,
0xf4000,
0xf5000,
0xf6000,
0xf7000,
},
.nicIds =
{
0xcd000,
0xc8000,
0xc9000,
0xcb000,
0xcc000,
0xce000,
0xc7000,
0xca000,
0xe8000,
0xe9000,
0xea000,
0xeb000,
0xec000,
0xed000,
0xee000,
0xef000,
},
.gpuNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.nicNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX,
},
.pattern = "@@",
.ringBase = "N0 0 1 2 3 N3 N4 4 5 6 7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 "
"N2 N5 5 7 4 6 N6 N9 9 11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1|N0 0 1 2 3 N3 N4 4 5 6 "
"7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 N2 N5 5 7 4 6 N6 N9 9 "
"11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_71 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 2 3 7 6|6 7 3 2 4 5 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_72 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa0000,
0xa1000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
1,
1,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "4242",
.ringBase = "N0 0 1 3 2 4 5 7 6 N3|N1 2 3 1 0 6 7 5 4 N2|N3 7 6 0 1 5 4 2 3 N1|N0 1 0 6 7 3 2 4 5 N2|N2 4 5 7 6 0 1 3 2 N1|N3 6 7 5 4 2 3 1 0 N0|N2 5 4 2 "
"3 7 6 0 1 N0|N1 3 2 4 5 1 0 6 7 N3",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_73 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_74 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_76 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x26000,
0x2d000,
0x5000,
0xc000,
0xab000,
0xb4000,
0x8b000,
0x94000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 3 2 4 5 7 6 N6|N2 2 3 1 0 6 7 5 4 N4|N5 5 4 2 3 7 6 0 1 N1|N1 1 0 6 7 3 2 4 5 N5|N4 4 5 7 6 0 1 3 2 N2|N2 2 3 1 0 6 7 5 4 N4|N0 0 1 5 "
"4 2 3 7 6 N6|N3 3 2 4 5 1 0 6 7 N7|N4 4 5 7 6 0 1 3 2 N2|N6 6 7 5 4 2 3 1 0 N0|N7 7 6 0 1 5 4 2 3 N3|N6 6 7 3 2 4 5 1 0 N0|N3 3 2 0 1 5 4 6 7 "
"N7|N1 1 0 2 3 7 6 4 5 N5|N5 5 4 6 7 3 2 0 1 N1|N7 7 6 4 5 1 0 2 3 N3",
.options = "disableNumaMatching=1,tuning=3",
};
static struct scclRomeModel rome_model_79 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 7,
.gpuIds =
{
0x1d000,
0x2e000,
0x3f000,
0x61000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 "
"0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
.options = "noCpuCheck=1,mscclEnabled=1",
};
static struct scclRomeModel rome_model_80 = {
.nGpus = 4,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x82000,
0xc2000,
0x2000,
0x42000,
},
.nicIds =
{
0x81000,
0xc1000,
0x1000,
0x41000,
},
.gpuNuma =
{
2,
3,
0,
1,
},
.nicNuma =
{
2,
3,
0,
1,
},
.connMatrix =
{
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11111111",
.ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 "
"N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_81 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 7,
.gpuIds =
{
0xc000,
0x22000,
0x38000,
0x5c000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds =
{
0x7000,
0x1d000,
0x33000,
0x57000,
0x9a000,
0xaa000,
0xba000,
0xda000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 "
"4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5|N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 "
"N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5",
.options = "noCpuCheck=1,mscclEnabled=1",
};
static struct scclRomeModel romeTopoModels[] = {
rome_model_22, rome_model_25, rome_model_27, rome_model_29, rome_model_31, rome_model_33, rome_model_30, rome_model_32, rome_model_24,
rome_model_26, rome_model_23, rome_model_38, rome_model_28, rome_model_40, rome_model_42, rome_model_44, rome_model_45, rome_model_46,
rome_model_48, rome_model_49, rome_model_52, rome_model_53, rome_model_43, rome_model_55, rome_model_56, rome_model_58, rome_model_59,
rome_model_62, rome_model_63, rome_model_65, rome_model_66, rome_model_67, rome_model_68, rome_model_71, rome_model_72, rome_model_73,
rome_model_74, rome_model_76, rome_model_79, rome_model_80, rome_model_81,
};
/* Parse user defined rings. Format is like:
 * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
 * Network interfaces can optionally be specified with an "N" prefix.
 * Rings with a non-matching number of GPUs are ignored so that rings for
 * multiple configurations can be provided in one string.
 */
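/* Illustrative walkthrough (assuming a 4-GPU system): the string
 * "N0 0 2 3 1 N1|1 3 2 0" yields two channels. The first enters on net 0,
 * visits GPUs 0->2->3->1 (after any gpu_map/net_map remapping) and exits on
 * net 1; the second lists no nets, so its inter endpoints are assigned
 * round-robin from the available NICs further below in this function. */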
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map) {
int gpus[SCCL_TOPO_MAX_NODES];
int nChannels = 0;
int gpu = 0;
int offset = 0;
int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET, 3: inside NET
int nets[SCCL_TOPO_MAX_NODES * 2];
int net_offset = 0, net_count = 0;
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
do {
if(str[offset] == 'N') {
if(status == 0) {
status = 2;
}
} else {
int digit = str[offset] - '0';
if(digit >= 0 && digit <= 9) {
switch(status) {
case 0:
gpus[gpu] = digit;
status = 1;
break;
case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
case 2:
nets[net_offset] = digit + 'N';
status = 3;
break;
case 3: nets[net_offset] = (nets[net_offset] - 'N') * 10 + digit + 'N'; break;
}
} else {
if(status == 1) {
gpu++;
net_offset = 2 * gpu - 1;
if(gpu >= SCCL_TOPO_MAX_NODES) // bail out before the next write to gpus[] would overflow
goto end;
} else if(status == 2 || status == 3) {
net_offset++;
net_count++;
if(net_offset > ngpus * 2)
goto end;
}
status = 0;
if(str[offset] == '|' || str[offset] == '\0') {
// Ignore if ngpus doesn't match
if(gpu != ngpus)
goto newchannel;
// Ignore the channel if net_count is odd (it must be 0 or an even number)
if(net_count && net_count % 2)
goto newchannel;
for(int r = 0; r < ngpus; r++) {
int g = gpus[r];
// Ignore if gpus are out of bounds
if(g < 0 || g >= ngpus)
goto newchannel;
// Ignore if gpus are duplicate
for(int i = 0; i < r; i++)
if(gpus[i] == g)
goto newchannel;
// remap if needed
if(gpu_map)
g = gpu_map[g];
// Translate gpu numbers into ranks
int j = 0;
for(j = 0; j < ngpus; j++)
if(g == system->nodes[GPU].nodes[j].gpu.dev)
break;
if(j < ngpus)
graph->intra[nChannels * ngpus + r] = system->nodes[GPU].nodes[j].gpu.rank;
else
return scclInternalError;
}
if(net_count) {
for(int i = 0; net_map && i < ngpus * 2; i++) {
if(nets[i] - 'N' < 0 || nets[i] - 'N' >= nnets)
continue;
nets[i] = net_map[nets[i] - 'N'] + 'N';
}
memcpy(&graph->intraNets[ngpus * nChannels * 2], nets, ngpus * 2 * sizeof(int));
graph->nIntraChannels++;
if(nets[0] - 'N' >= nnets || nets[ngpus * 2 - 1] - 'N' >= nnets)
goto newchannel;
graph->inter[nChannels * 2] = nets[0] - 'N';
graph->inter[nChannels * 2 + 1] = nets[ngpus * 2 - 1] - 'N';
} else if(nnets) {
graph->inter[nChannels * 2] = system->nodes[NET].nodes[nChannels % nnets].id;
graph->inter[nChannels * 2 + 1] = system->nodes[NET].nodes[(nChannels + 1) % nnets].id;
}
nChannels++;
newchannel:
gpu = 0;
net_offset = 0;
net_count = 0;
}
}
}
} while(str[offset++] != 0);
end:
graph->nChannels = nChannels;
graph->bwIntra = graph->bwInter = system->totalBw / nChannels;
if(graph->id == 1) {
for(int i = 0; i < graph->nChannels; i++) {
int net;
scclTopoGetLocalNet(system, graph->intra[i * ngpus + 1], i, &net);
graph->inter[i * 2 + 1] = net;
}
}
#if 0
for (int i=0; i<graph->nChannels; i++) {
printf("%d: ", i);
printf ("NET/%d ", graph->inter[i*2]);
for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
printf ("NET/%d ", graph->inter[i*2+1]);
printf("\n");
}
#endif
return scclSuccess;
}
/* Parse user defined treeBase for complicated trees. Format is like:
 * "(4(2(3)(1))(6(5)))"
 *
 * Trees with a non-matching number of GPUs are ignored so that trees for
 * multiple configurations can be provided in one string.
 */
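/* Illustrative reading of the example above: "(4(2(3)(1))(6(5)))" describes a
 * tree rooted at GPU 4 whose children are GPUs 2 and 6; GPU 2 in turn has
 * children 3 and 1, and GPU 6 has the single child 5. parseGraphLight() only
 * rewrites the GPU numbers through gpu_map; the parentheses are copied into
 * graph->treeBase verbatim. */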
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map) {
int gpus[SCCL_TOPO_MAX_NODES]; // transcribe/change according to gpu_map
int nChannels = 0;
int gpu = 0;
int offset = 0;
int start_offset = offset;
if(str[0] == 0) {
graph->treeBase[0][0] = 0;
return scclSuccess;
}
int status = 0; // 0 : between numbers, 1 : inside number
int ngpus = system->nodes[GPU].count;
int x = 0; // the write cursor y is declared where it is used, inside the per-tree block below
do {
int digit = str[offset] - '0';
if(digit >= 0 && digit <= 9) {
switch(status) {
case 0:
gpus[gpu] = digit;
status = 1;
break;
case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
}
} else {
if(status == 1) {
gpu++;
}
status = 0;
if(str[offset] == '|' || str[offset] == 0) {
int r = 0, y = 0;
while(start_offset < offset) {
// for (int r=0; r<gpu; r++) {
if(str[start_offset] == '(' || str[start_offset] == ')') {
graph->treeBase[x][y] = str[start_offset];
y++;
start_offset++;
} else {
int g = gpus[r];
// remap if needed
if(gpu_map)
g = gpu_map[g];
r++;
int j = 0;
// Translate gpu numbers into ranks
for(j = 0; j < ngpus; j++)
if(g == system->nodes[GPU].nodes[j].gpu.dev)
break;
if(j < ngpus) {
while(str[start_offset] != '(' && str[start_offset] != ')')
start_offset++;
char number_str[10];
sprintf(number_str, "%d", g);
int k = 0;
while(number_str[k] != 0) {
graph->treeBase[x][y] = number_str[k];
y++;
k++;
}
} else
return scclInternalError;
}
}
graph->treeBase[x][y] = 0;
x++;
gpu = 0;
start_offset = offset + 1;
}
}
} while(str[offset++] != 0);
graph->treeBase[x][0] = 0;
return scclSuccess;
}
#define MAX_OPT_TOKENS 10
extern const char* topoPathTypeStr[];
static void parseOptions(struct scclTopoSystem* system, const char* options) {
if(strcmp(options, "")) {
char* str_temp = (char*)malloc(strlen(options) + 1);
strcpy(str_temp, options);
char* tokens[MAX_OPT_TOKENS];
int numTokens = 0;
char* state;
tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
numTokens++;
while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
for(int i = 0; i < numTokens / 2; i++) {
if(strcmp(tokens[i * 2], "netGdrLevel") == 0) {
int j;
for(j = 0; j <= PATH_SYS; j++) {
if(strcmp(tokens[i * 2 + 1], topoPathTypeStr[j]) == 0)
break;
}
if(j <= PATH_SYS)
system->netGdrLevel = j;
else {
system->netGdrLevel = -2;
WARN("invalid netGdrLevel: %s", tokens[i * 2 + 1]);
}
} else if(strcmp(tokens[i * 2], "pivotA2AEnabled") == 0) {
system->pivotA2AEnabled = (bool)atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "pivotA2ANumBiRings") == 0) {
system->pivotA2ANumBiRings = atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "tuning") == 0) {
system->tuning = atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "ll128Enabled") == 0) {
system->ll128Enabled = (bool)atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "baseBw") == 0) {
system->baseBw = std::stof(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "mscclEnabled") == 0) {
system->mscclEnabled = (bool)atol(tokens[i * 2 + 1]);
} else if(strcmp(tokens[i * 2], "treeDefined") == 0) {
system->treeDefined = (bool)atol(tokens[i * 2 + 1]);
}
}
free(str_temp);
}
}
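/* Usage sketch (hypothetical option string): parseOptions(system,
 * "disableNumaMatching=1,tuning=2") tokenizes on '=', ',' and ' ' into
 * key/value pairs, leaving system->tuning == 2; keys it does not handle, such
 * as disableNumaMatching, are simply skipped here and are instead consumed by
 * checkOption() during model matching. */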
static bool checkOption(const char* options, const char* name) {
if(strcmp(options, "")) {
char* str_temp = (char*)malloc(strlen(options) + 1);
strcpy(str_temp, options);
char* tokens[MAX_OPT_TOKENS];
int numTokens = 0;
char* state;
tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
numTokens++;
while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
bool result = false;
for(int i = 0; i < numTokens / 2; i++) {
if(strcmp(tokens[i * 2], name) == 0) {
result = (bool)atol(tokens[i * 2 + 1]);
break;
}
}
free(str_temp); // free on every path; the previous early return leaked str_temp
return result;
}
return false;
}
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
static const char* ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
int id[8], dist[8];
int i;
int ngpus = system->nodes[GPU].count;
if(ngpus != 8)
return scclSuccess;
// validate chordal ring and calculate distance
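// (Arithmetic behind dist[]: device ids 0..ngpus-1 sum to ngpus*(ngpus-1)/2;
// subtracting the local device id and every XGMI-connected peer's id leaves
// the id of the single unconnected peer, which is stored as dist[i].)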
for(i = 0; i < ngpus; i++) {
struct scclTopoNode* node = system->nodes[GPU].nodes + i;
if(node->paths[GPU] == NULL)
continue;
int sum = ngpus * (ngpus - 1) / 2 - node->gpu.dev;
int count = 0;
for(int n = 0; n < ngpus; n++) {
struct scclTopoLink* link;
for(link = node->links; link->remNode; link++) {
if(link->remNode->gpu.dev == n)
break;
}
if(!link->remNode)
continue;
if(link->type != LINK_NVL)
continue;
sum -= system->nodes[GPU].nodes[n].gpu.dev;
count++;
}
if(count != ngpus - 2 || sum < 0 || sum > ngpus - 1) {
return scclSuccess;
}
dist[i] = sum;
}
// remap GPU ids
for(i = 0; i < ngpus; i++)
id[i] = i;
for(i = 0; i < ngpus; i++) {
if(dist[i] == ngpus - 1 - i)
continue;
int j, m, n, temp;
for(j = i + 1; j < ngpus; j++)
if(dist[j] == ngpus - 1 - i)
break;
m = dist[i];
n = dist[j];
dist[i] = n;
dist[j] = m;
temp = id[m];
id[m] = id[n];
id[n] = temp;
temp = dist[m];
dist[m] = dist[n];
dist[n] = temp;
}
// create chordal ring based on reference and remapped ids
system->type |= RCCL_TOPO_CR8G;
SCCLCHECK(parseGraph(ringBase, system, graph, id, NULL));
if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
SCCLCHECK(scclCalloc(&intra, ngpus));
SCCLCHECK(scclCalloc(&used, system->nodes[NET].count));
for(int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n * 2] = graph->inter[n * 2 + 1] = n;
struct scclTopoNode* net = system->nodes[NET].nodes + n;
struct scclTopoLinkList* paths = net->paths[GPU];
// find the first unused GPU that is closest to the NIC
int f, m;
for(f = 0; f < ngpus; f++) {
int j = 0;
for(j = 0; j < n; j++)
if(used[j] == system->nodes[GPU].nodes[f].gpu.rank)
break;
if(j >= n)
break;
}
for(int i = 0; i < ngpus; i++) {
int j = 0;
for(j = 0; j < n; j++)
if(used[j] == system->nodes[GPU].nodes[i].gpu.rank)
break;
if(j < n)
continue;
if(paths[i].count < paths[f].count)
f = i;
}
for(m = 0; m < ngpus; m++)
if(graph->intra[n * ngpus + m] == system->nodes[GPU].nodes[f].gpu.rank)
break;
used[n] = graph->intra[n * ngpus + m];
for(int i = 0; i < ngpus; i++)
intra[i] = graph->intra[n * ngpus + ((i + m) % ngpus)];
for(int i = 0; i < ngpus; i++)
graph->intra[n * ngpus + i] = intra[i];
}
free(used);
free(intra);
}
return scclSuccess;
}
static scclResult_t parseRomeSystem(struct scclTopoSystem* system, struct scclRomeModel* romeTopo, char* pattern) {
pattern[0] = 0; // pattern is left as an empty string for an invalid topology
romeTopo->nGpus = system->nodes[GPU].count;
romeTopo->nCpus = system->nodes[CPU].count;
romeTopo->nNics = system->nodes[NET].count;
romeTopo->nLinks = 0;
struct scclGpuIdHIP {
int g;
int dev;
};
auto cmpIds = [](const void* g1, const void* g2) {
struct scclGpuIdHIP* s1 = (struct scclGpuIdHIP*)g1;
struct scclGpuIdHIP* s2 = (struct scclGpuIdHIP*)g2;
return s1->dev - s2->dev;
};
struct scclCpuNuma {
int c;
uint64_t numa;
};
auto cmpNuma = [](const void* g1, const void* g2) {
struct scclCpuNuma* s1 = (struct scclCpuNuma*)g1;
struct scclCpuNuma* s2 = (struct scclCpuNuma*)g2;
return (int)(s1->numa - s2->numa);
};
struct scclNetId {
int n;
uint64_t id;
};
auto cmpNets = [](const void* g1, const void* g2) {
struct scclNetId* s1 = (struct scclNetId*)g1;
struct scclNetId* s2 = (struct scclNetId*)g2;
return (int)(s1->id - s2->id);
};
// sort GPU devices by HIP device ID
struct scclGpuIdHIP gpu_scores[SCCL_TOPO_MAX_NODES];
for(int i = 0; i < romeTopo->nGpus; i++) {
gpu_scores[i].g = i;
gpu_scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev;
}
qsort(gpu_scores, romeTopo->nGpus, sizeof(struct scclGpuIdHIP), cmpIds);
// sort CPU devices by NUMA id
struct scclCpuNuma cpu_scores[SCCL_TOPO_MAX_NODES];
for(int i = 0; i < romeTopo->nCpus; i++) {
cpu_scores[i].c = i;
cpu_scores[i].numa = system->nodes[CPU].nodes[i].id;
}
qsort(cpu_scores, romeTopo->nCpus, sizeof(struct scclCpuNuma), cmpNuma);
// sort NET devices by id
struct scclNetId net_scores[SCCL_TOPO_MAX_NODES];
for(int i = 0; i < romeTopo->nNics; i++) {
net_scores[i].n = i;
net_scores[i].id = system->nodes[NET].nodes[i].id;
}
qsort(net_scores, romeTopo->nNics, sizeof(struct scclNetId), cmpNets);
for(int i = 0; i < romeTopo->nGpus; i++) {
int gpu, n, m, distance;
gpu = gpu_scores[i].g;
romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
m = 0;
distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
for(n = 1; n < romeTopo->nCpus; n++) {
if(system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
m = n;
}
}
if(m < romeTopo->nCpus)
romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
struct scclTopoNode* node = system->nodes[GPU].nodes + gpu;
if(node->paths[GPU] == NULL)
continue;
int count = 0;
for(n = 0; n < romeTopo->nGpus; n++) {
romeTopo->connMatrix[i * romeTopo->nGpus + n] = 0;
struct scclTopoLink* link;
for(link = node->links; link->remNode; link++) {
if(link->remNode->gpu.dev == n)
break;
}
if(!link->remNode)
continue;
if(link->type != LINK_NVL)
continue;
romeTopo->connMatrix[i * romeTopo->nGpus + n] = link->bw / scclTopoXGMISpeed(node->gpu.gcn);
count++;
}
if(romeTopo->nLinks < count)
romeTopo->nLinks = count;
}
for(int i = 0; i < romeTopo->nNics; i++) {
int n, m, distance;
m = 0;
int net = net_scores[i].n;
romeTopo->nicIds[i] = system->nodes[NET].nodes[net].net.busId;
distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
for(n = 0; n < romeTopo->nCpus; n++)
if(system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
m = n;
}
if(m < romeTopo->nCpus)
romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
else
return scclSuccess;
}
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
for(int i = 0; i < romeTopo->nCpus; i++) {
uint64_t id = system->nodes[CPU].nodes[cpu_scores[i].c].id;
int g = 0, n = 0;
for(int j = 0; j < romeTopo->nGpus; j++)
if(romeTopo->gpuNuma[j] == id)
g++;
for(int j = 0; j < romeTopo->nNics; j++)
if(romeTopo->nicNuma[j] == id)
n++;
pattern[i * 2] = '0' + g;
pattern[i * 2 + 1] = '0' + n;
}
pattern[romeTopo->nCpus * 2] = 0;
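// (Pattern encoding: two characters per NUMA node, '0'+<#GPUs> followed by
// '0'+<#NICs>, so e.g. "4242" means two nodes with 4 GPUs and 2 NICs each.
// Counts above 9 simply continue through ASCII: '0'+16 == '@', which is how
// the "@@" pattern above encodes 16 GPUs and 16 NICs on one node.)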
// compute gdr level matrix
for(int i = 0; i < romeTopo->nNics; i++) {
int n = net_scores[i].n;
for(int j = 0; j < romeTopo->nGpus; j++) {
int g = gpu_scores[j].g;
romeTopo->gdrLevel[i * romeTopo->nGpus + j] = system->nodes[GPU].nodes[g].paths[NET][n].type;
}
}
const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
if(romeModelFile) {
INFO(SCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
FILE* file = fopen(romeModelFile, "w");
if(file == NULL) {
WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
return scclSuccess;
}
fprintf(file, "static struct scclRomeModel rome_model_ = {\n");
fprintf(file, " .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
fprintf(file, " .gpuIds = { ");
for(int i = 0; i < romeTopo->nGpus; i++)
fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
fprintf(file, "},\n");
fprintf(file, " .nicIds = { ");
for(int i = 0; i < romeTopo->nNics; i++)
fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
fprintf(file, "},\n");
fprintf(file, " .gpuNuma = { ");
for(int i = 0; i < romeTopo->nGpus; i++)
fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
fprintf(file, "},\n");
fprintf(file, " .nicNuma = { ");
for(int i = 0; i < romeTopo->nNics; i++)
fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
fprintf(file, "},\n");
fprintf(file, " .connMatrix = { ");
for(int i = 0; i < romeTopo->nGpus; i++)
for(int n = 0; n < romeTopo->nGpus; n++)
fprintf(file, "%d, ", romeTopo->connMatrix[i * romeTopo->nGpus + n]);
fprintf(file, "},\n");
fprintf(file, " .gdrLevel = { ");
for(int i = 0; i < romeTopo->nNics; i++)
for(int n = 0; n < romeTopo->nGpus; n++)
fprintf(file, "PATH_%s, ", topoPathTypeStr[romeTopo->gdrLevel[i * romeTopo->nGpus + n]]);
fprintf(file, "},\n");
fprintf(file, " .pattern = \"%s\",\n", pattern);
fprintf(file, " .ringBase = \"\",\n");
fprintf(file, " .options = \"\",\n");
fprintf(file, "};\n");
fclose(file);
}
return scclSuccess;
}
static bool permuteGpuIds(int* g, int n, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool nbio, bool ignore_numa) {
(*time)++;
if(n == last) {
int i, j;
// match GPU numa
if(!ignore_numa) {
for(i = 0; i < ref->nGpus; i++)
if(ref->gpuNuma[i] != topo->gpuNuma[g[i]])
break;
if(i < ref->nGpus)
return false;
}
// match XGMI connection
for(i = 0; i < ref->nGpus; i++) {
for(j = 0; j < ref->nGpus; j++) {
if(ref->connMatrix[i * ref->nGpus + j] != topo->connMatrix[g[i] * ref->nGpus + g[j]])
break;
if((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0)
break;
}
if(j < ref->nGpus)
break;
}
if(i < ref->nGpus)
return false;
// match NBIO
if(nbio) {
for(i = 0; i < ref->nGpus; i++) {
for(j = 0; j < ref->nGpus; j++) {
if(i == j)
continue;
bool nbio_ref = (ref->gpuIds[i] & 0xf0000) == (ref->gpuIds[j] & 0xf0000);
bool nbio_topo = (topo->gpuIds[g[i]] & 0xf0000) == (topo->gpuIds[g[j]] & 0xf0000);
if(nbio_ref != nbio_topo)
break;
if(nbio_ref && ((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0))
break;
}
if(j < ref->nGpus)
break;
}
if(i < ref->nGpus)
return false;
}
return true;
} else {
for(int i = n; i <= last; i++) {
std::swap(g[n], g[i]);
if(permuteGpuIds(g, n + 1, last, ref, topo, time, nbio, ignore_numa))
return true;
std::swap(g[n], g[i]);
}
}
return false;
}
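// (permuteGpuIds and permuteNetIds below perform a recursive swap-based
// enumeration of permutations, returning at the first mapping that satisfies
// their matching constraints: NUMA/XGMI/NBIO for GPUs, NUMA and GDR level for
// NICs. *time counts the recursion steps for diagnostics.)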
static bool permuteNetIds(int* n, int* g, int s, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool ignore_numa) {
(*time)++;
if(s == last) {
int i, j;
// match NET numa
if(!ignore_numa) {
for(i = 0; i < ref->nNics; i++) {
if(ref->nicNuma[i] != topo->nicNuma[n[i]])
break;
}
if(i < ref->nNics)
return false;
}
// match gdr level
for(i = 0; i < ref->nNics; i++) {
for(j = 0; j < ref->nGpus; j++) {
if(ref->gdrLevel[i * ref->nGpus + j] != topo->gdrLevel[n[i] * ref->nGpus + g[j]])
break;
}
if(j < ref->nGpus)
break;
}
if(i < ref->nNics)
return false;
return true;
} else {
for(int i = s; i <= last; i++) {
std::swap(n[s], n[i]);
if(permuteNetIds(n, g, s + 1, last, ref, topo, time, ignore_numa))
return true;
std::swap(n[s], n[i]);
}
}
return false;
}
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
static char ringRemap[64];
int i;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
int nnets = system->nodes[NET].count;
if(ngpus > 8)
return scclSuccess;
// only valid on Rome
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
struct scclRomeModel romeTopo;
char pattern[256];
SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
// recognize system as Rome 4P2H even if no matching model
if(ngpus > 4 && romeTopo.nLinks)
system->type |= RCCL_TOPO_4P2H_ROME;
int g[SCCL_TOPO_MAX_NODES], n[SCCL_TOPO_MAX_NODES];
int time = 0;
struct timeval tvs, tve;
gettimeofday(&tvs, NULL);
// check if GPUs are directly connected to CPU
bool match_nbio = true;
for(i = 0; i < romeTopo.nGpus; i++) {
int cpu, gpu;
SCCLCHECK(scclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu));
SCCLCHECK(scclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu));
if(system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2)
break;
}
if(i < romeTopo.nGpus)
match_nbio = false;
for(i = 0; i < sizeof(romeTopoModels) / sizeof(romeTopoModels[0]); i++) {
bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
if(!ignore_cpu && (arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME))
continue;
bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
if(!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus)
continue;
if(romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks)
continue;
if(!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern))
continue;
// permute GPU IDs
for(int j = 0; j < ngpus; j++)
g[j] = (j + 2) % ngpus;
if(!permuteGpuIds(g, 0, ngpus - 1, romeTopoModels + i, &romeTopo, &time, ignore_cpu ? false : match_nbio, ignore_numa))
continue;
if(nnets > 1) {
// permute NET IDs
for(int j = 0; j < nnets; j++)
n[j] = (j + 2) % nnets;
if(permuteNetIds(n, g, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, ignore_numa))
break;
} else
break;
}
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
if(i >= sizeof(romeTopoModels) / sizeof(romeTopoModels[0])) {
// printf("No solution in %.2fms (%d iter)\n", t, time);
return scclSuccess;
}
char line[1024];
// sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time);
sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
int offset = strlen(line);
for(int k = 0; k < ngpus; k++) {
sprintf(line + offset, "%d ", g[k]);
offset = strlen(line);
}
if(nnets > 1) {
sprintf(line + offset, "NET mapping: ");
offset = strlen(line);
for(int k = 0; k < nnets; k++) {
sprintf(line + offset, "%d ", n[k]);
offset = strlen(line);
}
}
INFO(SCCL_GRAPH, "%s", line);
parseOptions(system, romeTopoModels[i].options);
// create 4P2H based on reference and remapped ids
SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL));
if(romeTopoModels[i].treeBase != nullptr)
SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
return scclSuccess;
}
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUMA_CPUS 4
#define NUMA_GPUS 4
#define NUMA_PERMUTE_COUNT 24
#define TOTAL_PERMUTE_COUNT (NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT)
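// NUMA_PERMUTE_COUNT is 4! = 24 orderings of the 4 GPUs on one NUMA node, so
// TOTAL_PERMUTE_COUNT enumerates 24^4 = 331776 candidate 16-GPU mappings.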
static char ringRemap[256];
int i;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
int nnets = system->nodes[NET].count;
// only valid on Rome
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
return scclSuccess;
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
struct scclRomeModel romeTopo;
char pattern[256];
SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
// only match for system with 16 GPUs
if(ngpus != 16 || ncpus != NUMA_CPUS)
return scclSuccess;
int gcnt = 0;
int *g16, n[SCCL_TOPO_MAX_NODES];
int* all_gpu_permutations = (int*)malloc(TOTAL_PERMUTE_COUNT * NUMA_CPUS * NUMA_GPUS * sizeof(int));
struct timeval tvs, tve;
gettimeofday(&tvs, NULL);
for(i = 0; i < sizeof(romeTopoModels) / sizeof(romeTopoModels[0]); i++) {
if(romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics ||
romeTopo.nLinks != romeTopoModels[i].nLinks)
continue;
if(strcmp(romeTopoModels[i].pattern, pattern))
continue;
int j, r[ngpus], g[ngpus];
int numa_gpu_permutations[NUMA_CPUS][NUMA_PERMUTE_COUNT][NUMA_GPUS];
// permute GPUs for each CPU NUMA nodes
for(j = 0; j < ncpus; j++) {
int ngpusPerNuma = 0, cnt = 0, npermute = 0;
for(int k = 0; k < ngpus; k++) {
if(romeTopoModels[i].gpuNuma[k] != j)
continue;
r[ngpusPerNuma++] = k;
}
if(ngpusPerNuma == 0)
continue;
if(ngpusPerNuma != NUMA_GPUS)
break;
gcnt++;
// init GPU mapping
for(int k = 0; k < ngpus; k++) {
if(romeTopo.gpuNuma[k] != j)
continue;
g[(2 + cnt++) % ngpusPerNuma] = k;
}
std::sort(g, g + ngpusPerNuma);
do {
for(int n = 0; n < ngpusPerNuma; n++)
numa_gpu_permutations[j][npermute][n] = g[n];
npermute++;
} while(std::next_permutation(g, g + ngpusPerNuma));
if(npermute != NUMA_PERMUTE_COUNT)
break;
}
if(j < ncpus)
continue;
// permute GPUs for all CPU NUMA nodes
for(int a = 0; a < NUMA_PERMUTE_COUNT; a++) {
for(int b = 0; b < NUMA_PERMUTE_COUNT; b++) {
for(int c = 0; c < NUMA_PERMUTE_COUNT; c++) {
for(int d = 0; d < NUMA_PERMUTE_COUNT; d++) {
uint64_t offset = ((a * NUMA_PERMUTE_COUNT + b) * NUMA_PERMUTE_COUNT + c) * NUMA_PERMUTE_COUNT + d;
// offset = (offset+TOTAL_PERMUTE_COUNT/2)%TOTAL_PERMUTE_COUNT;
offset *= (NUMA_CPUS * NUMA_GPUS);
memcpy(all_gpu_permutations + offset, &numa_gpu_permutations[0][a][0], NUMA_GPUS * sizeof(int));
memcpy(all_gpu_permutations + offset + NUMA_GPUS, &numa_gpu_permutations[1][b][0], NUMA_GPUS * sizeof(int));
memcpy(all_gpu_permutations + offset + NUMA_GPUS * 2, &numa_gpu_permutations[2][c][0], NUMA_GPUS * sizeof(int));
memcpy(all_gpu_permutations + offset + NUMA_GPUS * 3, &numa_gpu_permutations[3][d][0], NUMA_GPUS * sizeof(int));
}
}
}
}
// match all GPUs' XGMI connection
int p;
for(p = 0; p < TOTAL_PERMUTE_COUNT; p++) {
g16 = all_gpu_permutations + p * NUMA_CPUS * NUMA_GPUS;
int k;
for(k = 0; k < romeTopoModels[i].nGpus; k++) {
int m;
for(m = 0; m < romeTopoModels[i].nGpus; m++) {
if(romeTopoModels[i].connMatrix[k * romeTopoModels[i].nGpus + m] != romeTopo.connMatrix[g16[k] * romeTopoModels[i].nGpus + g16[m]])
break;
}
if(m < romeTopoModels[i].nGpus)
break;
}
if(k < romeTopoModels[i].nGpus)
continue;
// printf("found match %d: ", p); for (int n = 0; n < NUMA_CPUS*NUMA_GPUS; n++) printf("%d ", g16[n]); printf("\n");
if(nnets > 1) {
// permute NET IDs
int time = 0;
for(int m = 0; m < nnets; m++)
n[m] = (m + 2) % nnets;
if(permuteNetIds(n, g16, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, false))
break;
} else
break;
}
if(p < TOTAL_PERMUTE_COUNT)
break;
}
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
if(i >= sizeof(romeTopoModels) / sizeof(romeTopoModels[0])) {
// printf("No solution in %.2fms\n", t);
free(all_gpu_permutations); // avoid leaking the permutation buffer on the no-match path
return scclSuccess;
}
char line[1024];
// sprintf(line, "Found matching Rome model index %d in %.2fms with GPU mapping: ", i, t);
sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
int offset = strlen(line);
for(int k = 0; k < ngpus; k++) {
sprintf(line + offset, "%d ", g16[k]);
offset = strlen(line);
}
if(nnets > 1) {
sprintf(line + offset, "NET mapping: ");
offset = strlen(line);
for(int k = 0; k < nnets; k++) {
sprintf(line + offset, "%d ", n[k]);
offset = strlen(line);
}
}
INFO(SCCL_GRAPH, "%s", line);
system->type |= RCCL_TOPO_16P1H;
parseOptions(system, romeTopoModels[i].options);
// create 16P1H based on reference and remapped ids
SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g16, nnets > 1 ? n : NULL));
if(romeTopoModels[i].treeBase != nullptr)
SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g16));
// clean up
free(all_gpu_permutations);
return scclSuccess;
}
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUM_HIVES 4
#define HIVE_GPUS 4
static char ringRemap[256];
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
// only valid on Rome
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
return scclSuccess;
// the number of GPUs and NICs on each NUMA node is used as the first screening pattern
struct scclRomeModel romeTopo;
char pattern[256];
SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
// only match for system with 16 GPUs
if(ngpus != NUM_HIVES * HIVE_GPUS || nnets != NUM_HIVES * HIVE_GPUS)
return scclSuccess;
int g_hives[ngpus], n_hives[nnets];
int ng_hives[NUM_HIVES];
// try to sort GPUs into hives
for(int i = 0; i < NUM_HIVES; i++)
ng_hives[i] = 0;
for(int i = 0; i < nnets; i++)
n_hives[i] = -1;
for(int i = 0; i < ngpus; i++)
g_hives[i] = -1;
for(int i = 0; i < ngpus; i++) {
int j, h;
for(j = 0; j < NUM_HIVES; j++) {
if(ng_hives[j]) {
if(romeTopo.connMatrix[i * ngpus + g_hives[j * HIVE_GPUS]]) {
g_hives[j * HIVE_GPUS + ng_hives[j]] = i;
ng_hives[j]++;
break;
}
}
}
if(j >= NUM_HIVES) {
for(h = 0; h < NUM_HIVES; h++) {
if(ng_hives[h] == 0) {
g_hives[h * HIVE_GPUS] = i;
ng_hives[h]++;
break;
}
}
if(h >= NUM_HIVES)
return scclSuccess;
}
}
for(int i = 0; i < NUM_HIVES; i++)
if(ng_hives[i] != 4)
return scclSuccess;
// remap NET ids
for(int i = 0; i < nnets; i++) {
int j;
for(j = 0; j < ngpus; j++) {
if(romeTopo.gdrLevel[i * nnets + g_hives[j]] == 3) {
n_hives[j] = i;
break;
}
}
if(j >= ngpus)
return scclSuccess;
}
// validation
for(int i = 0; i < nnets; i++)
if(n_hives[i] == -1)
return scclSuccess;
for(int i = 0; i < ngpus; i++)
if(g_hives[i] == -1)
return scclSuccess;
char line[1024];
sprintf(line, "Found matching Rome model 4P4H with GPU mapping: ");
int offset = strlen(line);
for(int k = 0; k < ngpus; k++) {
sprintf(line + offset, "%d ", g_hives[k]);
offset = strlen(line);
}
if(nnets > 1) {
sprintf(line + offset, "NET mapping: ");
offset = strlen(line);
for(int k = 0; k < nnets; k++) {
sprintf(line + offset, "%d ", n_hives[k]);
offset = strlen(line);
}
}
INFO(SCCL_GRAPH, "%s", line);
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME)
system->type |= RCCL_TOPO_4P2H_ROME;
parseOptions(system, rome_model_68.options);
// create 4P4H based on reference and remapped ids
SCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef SCCL_ROME_MODELS_H_
#define SCCL_ROME_MODELS_H_
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map);
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map);
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
/**
* MIT License
*
* Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief sccl_bfloat16.h provides struct for sccl_bfloat16 typedef
*/
#ifndef _SCCL_BFLOAT16_H_
#define _SCCL_BFLOAT16_H_
#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of sccl_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number. */
namespace sccl {
typedef struct {
uint16_t data;
} sccl_bfloat16;
} // namespace sccl
#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <ostream>
#include <type_traits>
namespace sccl {
struct sccl_bfloat16 {
uint16_t data;
enum truncate_t {
truncate
};
__host__ __device__ sccl_bfloat16() = default;
// round upper 16 bits of IEEE float to convert to bfloat16
explicit __host__ __device__ sccl_bfloat16(float f) : data(float_to_bfloat16(f)) {}
explicit __host__ __device__ sccl_bfloat16(float f, truncate_t) : data(truncate_float_to_bfloat16(f)) {}
// zero extend lower 16 bits of bfloat16 to convert to IEEE float
__host__ __device__ operator float() const {
union {
uint32_t int32;
float fp32;
} u = {uint32_t(data) << 16};
return u.fp32;
}
private:
static __host__ __device__ uint16_t float_to_bfloat16(float f) {
union {
float fp32;
uint32_t int32;
} u = {f};
if(~u.int32 & 0x7f800000) {
// When the exponent bits are not all 1s, then the value is zero, normal,
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
// least significant bits of the float mantissa are greater than 0x8000,
// or if they are equal to 0x8000 and the least significant bit of the
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
// has the value 0x7f, then incrementing it causes it to become 0x00 and
// the exponent is incremented by one, which is the next higher FP value
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
// incrementing it causes it to become an exponent of 0xFF and a mantissa
// of 0x00, which is Inf, the next higher value to the unrounded value.
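// Worked example (illustrative): if the upper 16 bits are 0x3F81 (odd
// mantissa LSB) and the lower 16 bits are exactly 0x8000, the tie rounds
// up to the even neighbor 0x3F82; with upper bits 0x3F80 (even LSB) the
// same 0x8000 tie leaves the result at 0x3F80.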
u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
} else if(u.int32 & 0xffff) {
// When all of the exponent bits are 1, the value is Inf or NaN.
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
// bit being 1. Signaling NaN is indicated by the most significant
// mantissa bit being 0 but some other bit(s) being 1. If any of the
// lower 16 bits of the mantissa are 1, we set the least significant bit
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
// the bfloat16's mantissa bits are all 0.
u.int32 |= 0x10000; // Preserve signaling NaN
}
return uint16_t(u.int32 >> 16);
}
// Truncate instead of rounding, preserving SNaN
static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f) {
union {
float fp32;
uint32_t int32;
} u = {f};
return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
}
};
typedef struct {
uint16_t data;
} sccl_bfloat16_public;
static_assert(std::is_standard_layout<sccl_bfloat16>{},
"sccl_bfloat16 is not a standard layout type, and thus is "
"incompatible with C.");
static_assert(std::is_trivial<sccl_bfloat16>{},
"sccl_bfloat16 is not a trivial type, and thus is "
"incompatible with C.");
static_assert(sizeof(sccl_bfloat16) == sizeof(sccl_bfloat16_public) && offsetof(sccl_bfloat16, data) == offsetof(sccl_bfloat16_public, data),
"internal sccl_bfloat16 does not match public sccl_bfloat16");
inline std::ostream& operator<<(std::ostream& os, const sccl_bfloat16& bf16) { return os << float(bf16); }
inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a) { return a; }
inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a) {
a.data ^= 0x8000;
return a;
}
inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) + float(b)); }
inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) - float(b)); }
inline __host__ __device__ sccl_bfloat16 operator*(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) * float(b)); }
inline __host__ __device__ sccl_bfloat16 operator/(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) / float(b)); }
inline __host__ __device__ bool operator<(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) < float(b); }
inline __host__ __device__ bool operator==(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) == float(b); }
inline __host__ __device__ bool operator>(sccl_bfloat16 a, sccl_bfloat16 b) { return b < a; }
inline __host__ __device__ bool operator<=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a > b); }
inline __host__ __device__ bool operator!=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a == b); }
inline __host__ __device__ bool operator>=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a < b); }
inline __host__ __device__ sccl_bfloat16& operator+=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a + b; }
inline __host__ __device__ sccl_bfloat16& operator-=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a - b; }
inline __host__ __device__ sccl_bfloat16& operator*=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a * b; }
inline __host__ __device__ sccl_bfloat16& operator/=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a / b; }
inline __host__ __device__ sccl_bfloat16& operator++(sccl_bfloat16& a) { return a += sccl_bfloat16(1.0f); }
inline __host__ __device__ sccl_bfloat16& operator--(sccl_bfloat16& a) { return a -= sccl_bfloat16(1.0f); }
inline __host__ __device__ sccl_bfloat16 operator++(sccl_bfloat16& a, int) {
sccl_bfloat16 orig = a;
++a;
return orig;
}
inline __host__ __device__ sccl_bfloat16 operator--(sccl_bfloat16& a, int) {
sccl_bfloat16 orig = a;
--a;
return orig;
}
namespace std {
constexpr __host__ __device__ bool isinf(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && !(a.data & 0x7f); }
constexpr __host__ __device__ bool isnan(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && +(a.data & 0x7f); }
constexpr __host__ __device__ bool iszero(sccl_bfloat16 a) { return !(a.data & 0x7fff); }
inline sccl_bfloat16 sin(sccl_bfloat16 a) { return sccl_bfloat16(sinf(float(a))); }
inline sccl_bfloat16 cos(sccl_bfloat16 a) { return sccl_bfloat16(cosf(float(a))); }
} // namespace std
} // namespace sccl
#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#endif // _SCCL_BFLOAT16_H_
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include "rome_models.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
SCCL_PARAM(CrossNic, "CROSS_NIC", 2);
// Initialize system->maxBw. This is the per-channel (i.e. per-SM)
// max bw.
static float getMaxBw(struct scclTopoSystem* system, struct scclTopoNode* gpu, int type) {
float maxBw = 0.0;
for(int i = 0; i < system->nodes[type].count; i++) {
struct scclTopoLinkList* path = gpu->paths[type] + i;
float bw = path->bw;
if(path->count == 0)
continue;
maxBw = std::max(maxBw, bw);
}
return maxBw;
}
static float getTotalBw(struct scclTopoSystem* system, struct scclTopoNode* gpu) {
float nvlinkBw = 0.0, pciBw = 0.0;
for(int l = 0; l < gpu->nlinks; l++) {
struct scclTopoLink* link = gpu->links + l;
if(link->type == LINK_NVL)
nvlinkBw += link->bw;
if(link->type == LINK_PCI)
pciBw = link->bw;
}
return std::max(pciBw, nvlinkBw);
}
scclResult_t scclTopoSearchInit(struct scclTopoSystem* system) {
system->maxBw = 0.0;
system->totalBw = 0.0;
int inter = system->nodes[NET].count;
if(inter == 0 && system->nodes[GPU].count == 1) {
system->maxBw = LOC_BW;
return scclSuccess;
}
for(int g = 0; g < system->nodes[GPU].count; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU));
system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu));
}
return scclSuccess;
}
static scclResult_t findRevLink(struct scclTopoNode* node1, struct scclTopoNode* node2, struct scclTopoLink** revLink) {
for(int l = 0; l < node2->nlinks; l++) {
struct scclTopoLink* link = node2->links + l;
if(link->remNode == node1) {
*revLink = link;
return scclSuccess;
}
}
WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
return scclInternalError;
}
// This is unfortunately needed since manipulating floats often results in rounding errors.
#define SUB_ROUND(a, b) (a = roundf((a - b) * 1000) / 1000)
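// (e.g. SUB_ROUND(a, 2.4f) with a == 12.0f leaves a == 9.6f exactly rather
// than drifting toward 9.599999..., since the difference is rounded to three
// decimal places before being stored back.)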
static scclResult_t followPath(struct scclTopoLinkList* path, struct scclTopoNode* start, int maxSteps, float bw, int* steps) {
float pciBw = bw;
for(int step = 0; step < path->count; step++) {
struct scclTopoNode* node = path->list[step]->remNode;
if(node->type == CPU) {
// Account for P2P inefficiency through Intel CPU RC
if(path->type == PATH_PHB && start->type == GPU && node->cpu.arch == SCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
pciBw = INTEL_P2P_OVERHEAD(bw);
}
}
}
struct scclTopoNode* node = start;
for(int step = 0; step < maxSteps; step++) {
struct scclTopoLink* link = path->list[step];
struct scclTopoLink* revLink = NULL;
float fwBw = link->type == LINK_PCI ? pciBw : bw;
float revBw = 0;
if(link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
if(revLink == NULL)
SCCLCHECK(findRevLink(node, link->remNode, &revLink));
revBw += fwBw / 8;
}
if(link->remNode->type == CPU && link->type == LINK_NVL) {
if(revLink == NULL)
SCCLCHECK(findRevLink(node, link->remNode, &revLink));
revBw += fwBw;
}
if(link->bw < fwBw || (revBw && revLink->bw < revBw)) {
*steps = step;
return scclSuccess;
}
SUB_ROUND(link->bw, fwBw);
if(revBw)
SUB_ROUND(revLink->bw, revBw);
node = link->remNode;
}
*steps = maxSteps;
return scclSuccess;
}
// Try to go from node type1/index1 to node type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing it (-1).
static scclResult_t scclTopoFollowPath(
struct scclTopoSystem* system, struct scclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct scclTopoNode** node) {
// First handle easy cases
*node = system->nodes[type2].nodes + index2;
if(type1 == -1)
return scclSuccess;
struct scclTopoNode* node1 = system->nodes[type1].nodes + index1;
struct scclTopoLinkList* path = node1->paths[type2] + index2;
struct scclTopoNode* node2 = system->nodes[type2].nodes + index2;
struct scclTopoLinkList* revPath = node2->paths[type1] + index1;
if(path == NULL) {
WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
return scclInternalError;
}
if(path->count == 0)
return scclSuccess;
// Now check link type
*node = NULL;
int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
float bw = intra ? graph->bwIntra : graph->bwInter;
int type = intra ? graph->typeIntra : graph->typeInter;
if(mult == 1 && (path->type > type))
return scclSuccess;
if(mult == 1 &&
(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == SCCL_TOPO_PATTERN_TREE || graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) &&
(revPath->type > type))
return scclSuccess;
bw *= mult;
// Check there is enough bandwidth on paths.
int step = 0;
SCCLCHECK(followPath(path, node1, path->count, bw, &step));
if(step < path->count)
goto rewind;
// Enough bandwidth : return destination node.
graph->nHops += mult * path->count;
*node = system->nodes[type2].nodes + index2;
return scclSuccess;
rewind:
// Not enough bandwidth : rewind and exit.
SCCLCHECK(followPath(path, node1, step, -bw, &step));
return scclSuccess;
}
static int gpuPciBw(struct scclTopoNode* gpu) {
for(int l = 0; l < gpu->nlinks; l++) {
struct scclTopoLink* gpuLink = gpu->links + l;
if(gpuLink->type != LINK_PCI)
continue;
struct scclTopoNode* pci = gpuLink->remNode;
for(int l = 0; l < pci->nlinks; l++) {
struct scclTopoLink* pciLink = pci->links + l;
if(pciLink->remNode != gpu)
continue;
return std::min(gpuLink->bw, pciLink->bw);
}
}
return -1;
}
/* Choose the order in which we try next GPUs. This is critical for the search
to quickly converge to the best solution even if it eventually times out. */
struct scclGpuScore {
int g; // Retain the index
int startIndex; // Least important
int intraNhops;
int intraBw;
int interNhops;
int interPciBw;
int interBw; // Most important
};
static int cmpScore(const void* g1, const void* g2) {
struct scclGpuScore* s1 = (struct scclGpuScore*)g1;
struct scclGpuScore* s2 = (struct scclGpuScore*)g2;
int d;
if((d = (s2->interBw - s1->interBw)))
return d;
if((d = (s2->interPciBw - s1->interPciBw)))
return d;
if((d = (s1->interNhops - s2->interNhops)))
return d;
if((d = (s2->startIndex - s1->startIndex)))
return d;
if((d = (s2->intraBw - s1->intraBw)))
return d;
if((d = (s1->intraNhops - s2->intraNhops)))
return d;
return s1->startIndex - s2->startIndex;
}
static int cmpIntraScores(struct scclGpuScore* scores, int count) {
int intraBw = scores[0].intraBw;
int intraNhops = scores[0].intraNhops;
for(int i = 1; i < count; i++) {
if(scores[i].intraBw != intraBw || scores[i].intraNhops != intraNhops)
return 1;
}
return 0;
}
static scclResult_t getGpuIndex(struct scclTopoSystem* system, int rank, int* index) {
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(system->nodes[GPU].nodes[g].gpu.rank == rank) {
*index = g;
return scclSuccess;
}
}
WARN("Could not find gpu rank %d", rank);
return scclInternalError;
}
static scclResult_t getNetIndex(struct scclTopoSystem* system, int64_t id, int* index) {
for(int n = 0; n < system->nodes[NET].count; n++) {
if(system->nodes[NET].nodes[n].id == id) {
*index = n;
return scclSuccess;
}
}
WARN("Could not find net id %lx", id);
return scclInternalError;
}
static scclResult_t getNetPaths(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoLinkList** netPaths) {
int netId = graph->inter[graph->nChannels * 2];
int n;
SCCLCHECK(getNetIndex(system, netId, &n));
*netPaths = system->nodes[NET].nodes[n].paths[GPU];
return scclSuccess;
}
scclResult_t
scclTopoSearchNextGpuSort(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
const uint64_t flag = 1ULL << (graph->nChannels);
int ngpus = system->nodes[GPU].count;
struct scclTopoLinkList* paths = gpu->paths[GPU];
struct scclTopoLinkList* netPaths = NULL;
if(sortNet)
SCCLCHECK(getNetPaths(system, graph, &netPaths));
struct scclGpuScore scores[SCCL_TOPO_MAX_NODES];
memset(scores, 0, ngpus * sizeof(struct scclGpuScore));
int start = gpu - system->nodes[GPU].nodes;
int count = 0;
for(int i = 1; i < ngpus; i++) {
int g = (start + i) % ngpus;
if(paths[g].count == 0)
continue; // There is no path to that GPU
if(system->nodes[GPU].nodes[g].used & flag)
continue;
scores[count].g = g;
scores[count].startIndex = i;
scores[count].intraNhops = paths[g].count;
scores[count].intraBw = paths[g].bw;
if(netPaths) {
scores[count].interNhops = netPaths[g].count;
scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes + g);
scores[count].interBw = netPaths[g].bw;
}
count++;
}
// Sort GPUs
qsort(scores, count, sizeof(struct scclGpuScore), cmpScore);
// Check if all have the same intra-node score in which case we go reverse for sortNet = -1
if(sortNet == -1 && cmpIntraScores(scores, count) == 0) {
for(int i = 0; i < count; i++)
next[i] = scores[count - 1 - i].g;
} else {
for(int i = 0; i < count; i++)
next[i] = scores[i].g;
}
*countPtr = count;
return scclSuccess;
}
scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time);
// Try to keep all searches within one second
#define SCCL_SEARCH_GLOBAL_TIMEOUT (5ULL << 16)
#define SCCL_SEARCH_TIMEOUT (1 << 14)
#define SCCL_SEARCH_TIMEOUT_TREE (1 << 14)
#define SCCL_SEARCH_TIMEOUT_SAMECHANNELS (1 << 8)
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
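// Note on the budget (see scclTopoSearchRecGpu below): *time is decremented
// once per recursive step and the search stops expanding once it reaches 0;
// a value of -1 marks a solution considered optimal and unwinds the rest of
// the search.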
scclResult_t scclTopoReplayGetGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, int step, int* g) {
*g = -1;
if(graph->nChannels == 0)
return scclInternalError;
int ngpus = system->nodes[GPU].count;
int nextRank = graph->intra[(graph->nChannels - 1) * ngpus + step + 1];
for(int i = 0; i < ngpus; i++)
if(system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
*g = i;
return scclSuccess;
}
if(*g == -1)
return scclInternalError;
return scclSuccess;
}
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system,
struct scclTopoGraph* graph,
struct scclTopoGraph* saveGraph,
struct scclTopoNode* gpu,
int step,
int backToNet,
int backToFirstRank,
int forcedOrder,
int* time);
scclResult_t scclTopoSearchTryGpu(struct scclTopoSystem* system,
struct scclTopoGraph* graph,
struct scclTopoGraph* saveGraph,
int step,
int backToNet,
int backToFirstRank,
int forcedOrder,
int* time,
int type,
int index,
int g) {
const uint64_t flag = 1ULL << (graph->nChannels);
struct scclTopoNode* gpu;
SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
if(gpu) {
gpu->used ^= flag;
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time));
gpu->used ^= flag;
SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu));
}
return scclSuccess;
}
static int scclTopoCountXGMI(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int count = 0;
for(int c = 0; c < graph->nChannels; c++) {
for(int i = 0; i < ngpus; i++) {
int g = graph->intra[ngpus * c + i];
int n = graph->intra[ngpus * c + ((i + 1) % ngpus)];
struct scclTopoNode* node;
int j;
for(j = 0; j < ngpus; j++)
if(system->nodes[GPU].nodes[j].gpu.rank == g)
break;
if(j < ngpus) {
node = system->nodes[GPU].nodes + j;
for(int k = 0; k < system->nodes[GPU].count; k++) {
if(node->paths[GPU][k].count == 1) {
struct scclTopoLink* link = node->paths[GPU][k].list[0];
struct scclTopoNode* remNode = link->remNode;
if(remNode->gpu.rank == n) {
if(link->type == LINK_NVL)
count++;
}
}
}
}
}
}
return count;
}
scclResult_t scclTopoSearchTryNvls(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int g, int ngpus, int* time) {
struct scclTopoNode* nvs;
struct scclTopoNode* gpu;
int d0 = 0; // See if there is enough bandwidth for NVS->GPU traffic
do {
SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu));
d0++;
} while(gpu && d0 < system->nodes[GPU].count);
if(gpu == NULL) {
d0--;
} else {
int d1 = 0; // See if there is enough bandwidth for GPU->NVS traffic
do {
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs));
d1++;
} while(nvs && d1 < system->nodes[GPU].count);
if(nvs == NULL) {
d1--;
} else { // Both directions worked. Move on to the next path.
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
}
while(d1) {
d1--;
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs));
}
}
while(d0) {
d0--;
SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu));
}
return scclSuccess;
}
scclResult_t scclTopoCompareGraphs(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* refGraph, int* copy) {
// 1. Try to get the same nChannels between Rings and Trees
if(graph->nChannels < graph->minChannels)
return scclSuccess;
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
if(graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count)
*copy = 1;
return scclSuccess;
}
// 2. Try to get better bandwidth
// Give a 15% perf bonus to paths not crossing nics
float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
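// e.g. refGraph->crossNic = 1 vs graph->crossNic = 0 gives target = 0.85, so a
// graph that avoids crossing NICs wins once it reaches 85% of the reference
// aggregate bandwidth (nChannels * bwIntra).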
if(graph->nChannels * graph->bwIntra > refGraph->nChannels * refGraph->bwIntra * target) {
*copy = 1;
return scclSuccess;
}
if(graph->nChannels * graph->bwIntra < refGraph->nChannels * refGraph->bwIntra * target)
return scclSuccess;
// 3. Less hops
if(graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops)
*copy = 1;
// 4. Prefer graph with more XGMI connections
if(graph->nChannels == refGraph->nChannels && scclTopoCountXGMI(system, refGraph) < scclTopoCountXGMI(system, graph))
*copy = 1;
return scclSuccess;
}
// Build a list of the best NETs to try.
//
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
// index when trying to get back to the NIC.
//
// The list is built the following way:
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
// might have been chosen by GPU 0 (case with multiple independent communicators per node)
// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
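// Example of step 2 (illustrative): with three equidistant NICs
// localNets = {0, 1, 2} and gpu.dev = 4, the rotation below runs
// 4 % 3 = 1 time and yields {1, 2, 0}, so this GPU tries NIC 1 first while a
// GPU with dev = 0 keeps NIC 0 as its first choice.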
scclResult_t scclTopoSelectNets(struct scclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
int netCount = 0;
int localNetCount;
int* localNets;
SCCLCHECK(scclCalloc(&localNets, system->nodes[NET].count));
for(int t = 0; t <= typeInter; t++) {
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(gpu != -1 && gpu != g)
continue;
localNetCount = 0;
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
struct scclTopoLinkList* paths = gpu->paths[NET];
for(int n = 0; n < system->nodes[NET].count; n++) {
if(paths[n].type == t)
localNets[localNetCount++] = n;
}
if(localNetCount == 0)
continue;
// Shuffle by gpu NVML device number so that GPUs on the same PCI switch
// with multiple NICs don't use the same one as first choice.
for(int r = 0; r < system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
int net0 = localNets[0];
for(int i = 0; i < localNetCount - 1; i++)
localNets[i] = localNets[i + 1];
localNets[localNetCount - 1] = net0;
}
// Append NICs to list
for(int i = 0; i < localNetCount; i++) {
int n = localNets[i];
int found = 0;
while(found < netCount && nets[found] != n)
found++;
if(found == netCount)
nets[netCount++] = n;
}
}
}
*netCountRet = netCount;
free(localNets);
return scclSuccess;
}
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system,
struct scclTopoGraph* graph,
struct scclTopoGraph* saveGraph,
struct scclTopoNode* gpu,
int step,
int backToNet,
int backToFirstRank,
int forcedOrder,
int* time) {
if((*time) <= 0)
return scclSuccess;
(*time)--;
int ngpus = system->nodes[GPU].count;
if(step == ngpus) {
// Determine whether we found a better solution or not
int copy = 0;
graph->nChannels++;
SCCLCHECK(scclTopoCompareGraphs(system, graph, saveGraph, &copy));
if(copy) {
memcpy(saveGraph, graph, sizeof(struct scclTopoGraph));
if(graph->nChannels == graph->maxChannels)
*time = -1;
}
if(graph->nChannels < graph->maxChannels) {
SCCLCHECK(scclTopoSearchRec(system, graph, saveGraph, time));
}
graph->nChannels--;
return scclSuccess;
}
graph->intra[graph->nChannels * ngpus + step] = gpu->gpu.rank;
int g = gpu - system->nodes[GPU].nodes;
if(step == backToNet) {
// first get back to NIC
if(system->nodes[NET].count) {
int startNetIndex;
SCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels * 2], &startNetIndex));
struct scclTopoNode* startNet = system->nodes[NET].nodes + startNetIndex;
int netcount;
int* nets;
SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
for(int i = 0; i < netcount; i++) {
int n = nets[i];
struct scclTopoNode* net = system->nodes[NET].nodes + n;
if(graph->pattern == SCCL_TOPO_PATTERN_TREE && net->id != startNet->id)
continue; // Trees are symmetric
if(graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port))
continue;
// Balanced Tree : count half of the bandwidth on first two GPUs
int nextBackToNet = -1;
float bwInterSave = graph->bwInter;
if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE) {
// Count half of the bandwidth on each of the first two GPUs
if(step == 0)
nextBackToNet = 1;
else if(net->id != graph->inter[graph->nChannels * 2 + 1])
continue;
graph->bwInter /= 2;
}
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
graph->bwInter = bwInterSave;
if(net) {
graph->inter[graph->nChannels * 2 + 1] = net->id;
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE)
graph->bwInter /= 2;
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
graph->bwInter = bwInterSave;
}
}
free(nets);
}
} else if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
SCCLCHECK(scclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
} else if(step < system->nodes[GPU].count - 1) {
// Go to next GPU
int next[SCCL_TOPO_MAX_NODES];
int count;
if(forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
next[0] = step + 1;
count = 1;
} else if(forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
SCCLCHECK(scclTopoReplayGetGpu(system, graph, step, next));
count = 1;
} else { // Normal search
SCCLCHECK(scclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step + 1 ? 1 : -1));
}
for(int i = 0; i < count; i++) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, step + 1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
}
} else if(step == backToFirstRank) {
// Find first GPU and loop back to it
int p;
SCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels * ngpus], &p));
struct scclTopoNode* firstGpu;
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
if(firstGpu) {
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step + 1, backToNet, -1, forcedOrder, time));
SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
}
} else {
// Next path
SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
}
return scclSuccess;
}
scclResult_t scclTopoSearchRecNet(
struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int bw = graph->bwInter;
int* nets;
SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
int netcount;
SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
for(int i = 0; i < netcount; i++) {
int n = nets[i];
struct scclTopoNode* net = system->nodes[NET].nodes + n;
struct scclTopoNode* gpu;
if(graph->collNet && net->net.collSupport == 0)
continue;
if(net->net.bw < bw)
continue;
graph->inter[graph->nChannels * 2] = net->id;
graph->latencyInter = net->net.latency;
for(int i = 0; i < system->nodes[NET].count; i++) {
if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) {
system->nodes[NET].nodes[i].net.bw -= bw;
}
}
// NVLS needs to balance on all NICs
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
} else {
if(graph->nChannels > 0) {
// Try to replay the last channel
int g;
SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
}
if(graph->nChannels == 0 || graph->sameChannels == 0) {
if(graph->nChannels == 0) {
// Always try the PCI order first to set a reference, but don't count it against the timeout, nor let it run for long
struct scclTopoLinkList* paths = net->paths[GPU];
int f = 0, f_gdr = 0;
// find the GPU closest to the NIC
for(int i = 0; i < system->nodes[GPU].count; i++) {
if(paths[i].count <= paths[f].count) {
// prefer GPU direct RDMA
int gdr;
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
if(paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
f = i;
f_gdr = gdr;
}
}
}
int t = 1 << 10;
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
if(t == -1)
*time = -1;
}
// Then try the most local GPUs
float maxBw = 0;
int minHops = 0xfffffff;
struct scclTopoLinkList* paths = net->paths[GPU];
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(paths[g].bw > maxBw) {
maxBw = paths[g].bw;
minHops = paths[g].count;
} else if(paths[g].bw == maxBw && paths[g].count < minHops) {
minHops = paths[g].count;
}
}
if(maxBw >= bw) {
// In the first loop, avoid using GPUs in both directions between channels (one channel
// sending from that GPU and one channel receiving to that GPU), since that usually leads
// to lower BW.
for(int tryGpuBidir = 0; tryGpuBidir < 2; tryGpuBidir++) {
for(int g = 0; g < system->nodes[GPU].count; g++) {
if(paths[g].bw == maxBw && paths[g].count == minHops) {
gpu = system->nodes[GPU].nodes + g;
int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
if(tryGpuBidir == gpuUsed) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
}
}
}
}
}
}
}
for(int i = 0; i < system->nodes[NET].count; i++) {
if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) {
system->nodes[NET].nodes[i].net.bw += bw;
}
}
}
free(nets);
return scclSuccess;
}
/* Search Patterns
*
* Intra-node
* Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a
* (=Split Tree Loop)
* Tree : GPU a -> GPU b -> .. -> GPU x
* (=Split Tree)
*
* Inter-node
* Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
* Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
* `--> NET n (or m if crossNic)
*/
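// For example, a multi-node ring over 8 local GPUs gets backToNet = 7 and
// backToFirstRank = -1 (the channel ends on a NIC), while the same ring on a
// single node gets backToNet = -1 and backToFirstRank = 7 (loop back to the
// first GPU).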
scclResult_t scclTopoSearchParams(struct scclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
if(pattern == SCCL_TOPO_PATTERN_RING)
*backToNet = system->nodes[GPU].count - 1;
else if(pattern == SCCL_TOPO_PATTERN_SPLIT_TREE)
*backToNet = 1;
else
*backToNet = 0;
*backToFirstRank = -1;
} else {
*backToNet = -1;
if(pattern == SCCL_TOPO_PATTERN_RING)
*backToFirstRank = system->nodes[GPU].count - 1;
else
*backToFirstRank = -1;
}
return scclSuccess;
}
scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time) {
int backToNet, backToFirstRank;
SCCLCHECK(scclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
// Start from NET
scclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
// Intra-node only.
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels));
return scclSuccess;
} else if(graph->nChannels == 0) {
// Try PCI order first
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
} else {
// Also try to replay previous channel
int g;
SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
}
if(graph->sameChannels == 0 || graph->nChannels == 0) {
// Finally, try all other possibilities unless we are forced to use the same channels
for(int g = 0; g < system->nodes[GPU].count; g++) {
SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
}
}
}
return scclSuccess;
}
/************************************/
/* User defined graph from XML file */
/************************************/
struct kvDict kvDictLinkType[] = {{"LOC", PATH_LOC},
{"NVL", PATH_NVL},
{"NVB", PATH_NVB},
{"PIX", PATH_PIX},
{"PXB", PATH_PXB},
{"PXN", PATH_PXN},
{"PHB", PATH_PHB},
{"SYS", PATH_SYS},
{NULL, 0}};
scclResult_t scclTopoGetChannelFromXml(struct scclXmlNode* xmlChannel, int c, struct scclTopoSystem* system, struct scclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter + 2 * c;
int* intra = graph->intra + ngpus * c;
int n = 0, g = 0;
for(int s = 0; s < xmlChannel->nSubs; s++) {
struct scclXmlNode* sub = xmlChannel->subs[s];
int dev;
SCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
if(strcmp(sub->name, "net") == 0) {
inter[n++] = dev;
} else if(strcmp(sub->name, "gpu") == 0) {
int rank = -1;
for(int g = 0; g < ngpus; g++) {
if(system->nodes[GPU].nodes[g].gpu.dev == dev)
rank = system->nodes[GPU].nodes[g].gpu.rank;
}
if(rank == -1) {
WARN("XML Import Channel : dev %d not found.", dev);
return scclSystemError;
}
intra[g++] = rank;
}
}
return scclSuccess;
}
scclResult_t scclTopoGetGraphFromXmlSub(struct scclXmlNode* xmlGraph, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
int id;
SCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
if(graph->id != id)
return scclSuccess;
int crossNic;
SCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
if(scclParamCrossNic() == 0 && crossNic == 1)
return scclSuccess;
graph->crossNic = crossNic;
SCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
SCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
if(xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != scclSuccess)
graph->latencyInter = 0.0;
const char* str;
SCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
SCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
SCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
SCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType));
SCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels));
for(int s = 0; s < xmlGraph->nSubs; s++) {
SCCLCHECK(scclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
}
*nChannels = xmlGraph->nSubs;
return scclSuccess;
}
scclResult_t scclTopoGetGraphFromXml(struct scclXmlNode* xmlGraphs, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
for(int s = 0; s < xmlGraphs->nSubs; s++) {
SCCLCHECK(scclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels));
}
return scclSuccess;
}
/* And the reverse : graph->xml */
scclResult_t scclTopoGetXmlFromChannel(struct scclTopoGraph* graph, int c, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
struct scclXmlNode* xmlChannel;
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter + 2 * c;
int* intra = graph->intra + ngpus * c;
SCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
struct scclXmlNode* node;
if(system->nodes[NET].count) {
SCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
SCCLCHECK(xmlSetAttrInt(node, "dev", inter[0]));
}
for(int g = 0; g < ngpus; g++) {
SCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
int dev = -1;
for(int i = 0; i < ngpus; i++) {
if(system->nodes[GPU].nodes[i].gpu.rank == intra[g])
dev = system->nodes[GPU].nodes[i].gpu.dev;
}
if(dev == -1) {
WARN("XML Export Channel : rank %d not found.", intra[g]);
return scclInternalError;
}
SCCLCHECK(xmlSetAttrInt(node, "dev", dev));
}
if(system->nodes[NET].count) {
SCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
SCCLCHECK(xmlSetAttrInt(node, "dev", inter[1]));
}
return scclSuccess;
}
scclResult_t scclTopoGetXmlFromGraph(struct scclTopoGraph* graph, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
struct scclXmlNode* xmlGraph;
SCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra));
SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter));
SCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter));
const char* str;
SCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
SCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
SCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType));
SCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str));
SCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels));
for(int c = 0; c < graph->nChannels; c++) {
SCCLCHECK(scclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph));
}
return scclSuccess;
}
scclResult_t scclTopoGetXmlFromGraphs(int ngraphs, struct scclTopoGraph** graphs, struct scclTopoSystem* system, struct scclXml* xml) {
xml->maxIndex = 0;
struct scclXmlNode* xmlGraphs;
SCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs));
SCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", SCCL_GRAPH_XML_VERSION));
for(int g = 0; g < ngraphs; g++) {
SCCLCHECK(scclTopoGetXmlFromGraph(graphs[g], system, xml, xmlGraphs));
}
return scclSuccess;
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
float speedArrayIntra[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
float speedArrayInter[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))
#else
float speedArrayIntra[] = {40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0};
float speedArrayInter[] = {48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))
float sm90SpeedArrayIntra[] = {60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0};
float sm90SpeedArrayInter[] = {48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra) / sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter) / sizeof(float))
#endif
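// scclTopoCompute below starts from the largest entry of these arrays that
// fits system->maxBw, then walks down the array until a graph is found
// (pass 1); once a solution exists, pass 2 tries to raise bwIntra again for
// non-ring patterns.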
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
RCCL_PARAM(NChannels, "NCHANNELS", 0);
scclResult_t scclTopoCompute(scclTopoSystem* system, struct scclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
graph->crossNic = scclParamCrossNic();
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
(graph->pattern == SCCL_TOPO_PATTERN_RING || graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE)
? 1
: 0;
graph->bwIntra = graph->bwInter = 0;
graph->latencyInter = 0;
if(graph->crossNic == 2)
graph->crossNic = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
graph->nChannels = 0;
graph->nIntraChannels = 0;
memset(graph->intraNets, 0, MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2 * sizeof(int));
int trySameChannels = graph->pattern == SCCL_TOPO_PATTERN_NVLS ? 0 : 1;
graph->sameChannels = trySameChannels;
char* str = getenv("SCCL_GRAPH_FILE");
if(str) {
INFO(SCCL_ENV, "SCCL_GRAPH_FILE set by environment to %s", str);
struct scclXml* xml;
SCCLCHECK(scclCalloc(&xml, 1));
SCCLCHECK(scclTopoGetXmlGraphFromFile(str, xml));
int nChannels;
SCCLCHECK(scclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
INFO(SCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
free(xml);
if(graph->nChannels > 0)
return scclSuccess;
}
str = getenv("SCCL_RINGS");
char* strTrees = getenv("RCCL_TREES");
if(str || strTrees) {
// user supplied topo
if(strTrees) {
SCCLCHECK(parseGraphLight(strTrees, system, graph, NULL));
system->treeDefined = true;
} else {
SCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(graph->nChannels && arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
system->type |= RCCL_TOPO_4P2H_ROME;
}
}
} else if(!rcclParamModelMatchingDisable() && !graph->collNet) {
// try to match 8P6L
SCCLCHECK(parseChordalRing(system, graph));
if(graph->nChannels)
return scclSuccess;
// try to match Rome 4P2H
SCCLCHECK(parseRome4P2H(system, graph));
if(graph->nChannels)
return scclSuccess;
// try to match 1H16P
SCCLCHECK(parse1H16P(system, graph));
if(graph->nChannels)
return scclSuccess;
// try to match 4H4P
SCCLCHECK(parse4H4P(system, graph));
}
if(graph->nChannels)
return scclSuccess;
if((graph->pattern == SCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) {
// limit single node max channels when searching ring graph on Rome
graph->maxChannels = 2;
}
if(ngpus == 1)
if(graph->pattern != SCCL_TOPO_PATTERN_RING)
graph->pattern = SCCL_TOPO_PATTERN_TREE;
int ccMin;
SCCLCHECK(scclTopoGetCompCap(system, &ccMin, NULL));
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90))
return scclSuccess;
if(system->nodes[NET].count == 0 && graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
// Force intra-node NVLS algorithm to pull evenly from all GPUs.
graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
}
struct scclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct scclTopoGraph));
// First try crossnic, then decrease bw and finally increase bwIntra.
int nspeeds = 0;
float* speedArray = NULL;
if(system->nodes[NET].count == 0) {
nspeeds = NSPEEDSINTRA;
speedArray = speedArrayIntra;
} else {
nspeeds = NSPEEDSINTER;
speedArray = speedArrayInter;
}
int pass = 1;
int speedIndex = 0;
float maxBw = system->maxBw;
float totalBw = system->totalBw;
if(ngpus == 1 || graph->pattern != SCCL_TOPO_PATTERN_RING)
totalBw *= ngpus * 1.0 / (ngpus - 1);
while((speedArray[speedIndex] > maxBw || speedArray[speedIndex] * graph->minChannels > totalBw) && speedIndex < nspeeds - 1)
speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
int64_t globalTimeout = SCCL_SEARCH_GLOBAL_TIMEOUT;
search:
int time = tmpGraph.sameChannels ? SCCL_SEARCH_TIMEOUT_SAMECHANNELS
: tmpGraph.pattern == SCCL_TOPO_PATTERN_TREE ? SCCL_SEARCH_TIMEOUT_TREE
: SCCL_SEARCH_TIMEOUT;
tmpGraph.nChannels = 0;
globalTimeout -= time;
SCCLCHECK(scclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
printf("%d ", graph->intra[c*ngpus+g]);
}
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
printf("\n");
}
#endif
// Optimal solution, stop here
if(time == -1)
goto done;
if(graph->nChannels * graph->bwInter >= system->totalBw)
goto done;
if(pass == 1) {
// First pass, we don't have a solution yet ; try other options
// Try having different channels
if(tmpGraph.sameChannels == 1) {
tmpGraph.sameChannels = 0;
goto search;
}
tmpGraph.sameChannels = trySameChannels;
if(time != -1)
globalTimeout += time;
else
globalTimeout = SCCL_SEARCH_GLOBAL_TIMEOUT;
if(globalTimeout < 0 && graph->nChannels)
goto done;
tmpGraph.pattern = graph->pattern;
int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
if(tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
tmpGraph.typeIntra += 1;
goto search;
}
tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
if(system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS &&
(graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
tmpGraph.typeInter += 1;
goto search;
}
tmpGraph.typeInter = PATH_PIX;
if(crossNic && tmpGraph.crossNic == 0) {
// Try again with crossNic if permitted
tmpGraph.crossNic = crossNic;
goto search;
}
tmpGraph.crossNic = 0;
// Decrease bw until we find a solution
if((speedIndex < nspeeds - 1) && (graph->nChannels == 0 || (speedArray[speedIndex + 1] / graph->bwInter > .49))) {
tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
goto search;
}
speedIndex = 0;
while(speedArray[speedIndex] > maxBw && speedIndex < nspeeds - 1)
speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
}
done:
// We have a solution. Start from that solution and move to pass 2.
if(pass == 1) {
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
speedIndex = 0;
while(speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds - 1)
speedIndex++;
tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
tmpGraph.minChannels = graph->nChannels;
pass = 2;
}
// 3. See if we can increase bwIntra for trees (2 nodes or collnet)
if(pass == 2) {
if(time != 0 && graph->pattern != SCCL_TOPO_PATTERN_RING && tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter * 2 &&
speedIndex > 0) {
tmpGraph.bwIntra = speedArray[--speedIndex];
goto search;
}
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
}
if(graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
for(int i = 0; i < ngpus; i++)
graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
graph->bwIntra = graph->bwInter = 0.1;
graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
if(graph->nChannels == 0)
return scclSuccess;
if(graph->pattern == SCCL_TOPO_PATTERN_NVLS)
return scclSuccess;
if(graph->bwIntra < 25.0)
return scclSuccess;
if(ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4)
return scclSuccess;
int dupChannels = std::min(graph->nChannels * 2, graph->maxChannels);
memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, (dupChannels - graph->nChannels) * ngpus * sizeof(int));
memcpy(graph->inter + graph->nChannels * 2, graph->inter, (dupChannels - graph->nChannels) * 2 * sizeof(int));
graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
graph->nChannels = dupChannels;
int nc = rcclParamNChannels();
if(graph->nChannels > 0 && nc > 0 && nc <= MAXCHANNELS / 2 && nc > graph->nChannels) {
int nChannels = nc - graph->nChannels;
int nnets = system->nodes[NET].count;
if(nnets <= 2) {
for(int i = 0; i < nChannels; ++i) {
memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
memcpy(graph->inter + graph->nChannels * 2, graph->inter, 2 * sizeof(int));
memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
graph->nChannels++;
}
} else {
typedef struct {
int id;
int used;
} Net;
Net nets[nnets];
auto sortFunc = [](const void* a, const void* b) -> int { return ((Net*)a)->used - ((Net*)b)->used; };
memset(nets, 0, nnets * sizeof(Net));
for(int i = 0; i < nnets; ++i) {
nets[i].id = system->nodes[NET].nodes[i].id;
}
for(int i = 0; i < graph->nChannels; ++i) {
for(int j = 0; j < nnets; ++j) {
if(nets[j].id == *(graph->inter + i * 2) || nets[j].id == *(graph->inter + i * 2 + 1)) {
nets[j].used++;
}
}
}
for(int i = 0; i < nChannels; ++i) {
memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
qsort(nets, nnets, sizeof(Net), sortFunc);
*(graph->inter + graph->nChannels * 2) = nets[0].id;
nets[0].used++;
qsort(nets, nnets, sizeof(Net), sortFunc);
if(graph->crossNic == 0 || graph->crossNic == 2) {
*(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
nets[0].used++;
qsort(nets, nnets, sizeof(Net), sortFunc);
} else {
nets[0].used++;
qsort(nets, nnets, sizeof(Net), sortFunc);
*(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
}
nets[0].used++;
memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
graph->nChannels++;
}
}
graph->bwIntra /= DIVUP(nc, graph->nChannels);
graph->bwInter /= DIVUP(nc, graph->nChannels);
}
return scclSuccess;
}
scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
INFO(SCCL_GRAPH,
"Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d",
graph->pattern,
graph->crossNic,
graph->nChannels,
graph->bwIntra,
graph->bwInter,
topoPathTypeStr[graph->typeIntra],
topoPathTypeStr[graph->typeInter],
graph->sameChannels);
int ngpus = system->nodes[GPU].count;
char line[1024];
for(int c = 0; c < graph->nChannels; c++) {
sprintf(line, "%2d :", c);
int offset = strlen(line);
if(system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c]);
offset = strlen(line);
}
for(int i = 0; i < ngpus; i++) {
int n = graph->intraNets[(ngpus * c + i) * 2] - 'N';
if(n >= 0 && n < system->nodes[NET].count) {
sprintf(line + offset, " NET/%d", n);
offset = strlen(line);
}
sprintf(line + offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus * c + i]);
offset = strlen(line);
n = graph->intraNets[(ngpus * c + i) * 2 + 1] - 'N';
if(n >= 0 && n < system->nodes[NET].count) {
sprintf(line + offset, " NET/%d", n);
offset = strlen(line);
}
}
if(system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c + 1]);
offset = strlen(line);
}
INFO(SCCL_GRAPH, "%s", line);
}
return scclSuccess;
}
scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs) {
char* str = getenv("SCCL_GRAPH_DUMP_FILE");
if(str) {
INFO(SCCL_ENV, "SCCL_GRAPH_DUMP_FILE set by environment to %s", str);
struct scclXml* xml;
SCCLCHECK(scclCalloc(&xml, 1));
SCCLCHECK(scclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
SCCLCHECK(scclTopoDumpXmlToFile(str, xml));
free(xml);
}
return scclSuccess;
}
#include "comm.h"
// NVLS channels aren't compute channels. Find the NIC of the channel whose head is our rank
scclResult_t getNvlsNetDev(struct scclComm* comm, struct scclTopoGraph* graph, int* dev) {
int localRanks = comm->topo->nodes[GPU].count;
for(int c = 0; c < graph->nChannels; c++) {
if(graph->intra[c * localRanks] == comm->rank) {
*dev = graph->inter[c * 2];
return scclSuccess;
}
}
WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
return scclInternalError;
}
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
SCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
if(graph) {
// Honor the net device in the graph
int channel = channelId % graph->nChannels;
int ngpus = comm->topo->nodes[GPU].count;
int index = graph->intra[channel * ngpus] == rank ? 0 : 1;
if(graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
*dev = graph->inter[channel * 2 + index];
} else {
SCCLCHECK(getNvlsNetDev(comm, graph, dev));
}
SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
} else if(peerRank == -1) {
return scclInternalError;
} else {
// Start with our local NIC and local Rank
SCCLCHECK(scclTopoGetLocalNet(comm->topo, rank, channelId, dev));
*proxyRank = rank;
int pxnLevel = scclPxnDisable(comm) == 1 ? 0 : scclParamP2pPxnLevel();
// See whether we can use the remote rank preferred device.
if(scclParamCrossNic() == 0 || (pxnLevel != 0)) {
// Find local NIC number close to local cudaDev
int cudaDev = comm->peerInfo[peerRank].cudaDev;
int localRank;
if(scclTopoDevToRank(comm->topo, cudaDev, &localRank) != scclSuccess)
return scclSuccess;
int netDev;
SCCLCHECK(scclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
int n;
// Check that device exists on our node
if(scclParamCrossNic() == 0) {
if(scclTopoIdToIndex(comm->topo, NET, netDev, &n) != scclSuccess) {
WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
return scclInvalidUsage;
}
*dev = netDev;
}
if(pxnLevel == 1) {
int g, n;
SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g));
SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
struct scclTopoNode* gpu = comm->topo->nodes[GPU].nodes + g;
if(gpu->paths[NET][n].type <= PATH_PXN) {
*dev = netDev;
SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
}
} else if(pxnLevel == 2) {
// Check which local GPU corresponds to that NIC and see if we can use PXN.
int n, g1, g2;
SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g1));
SCCLCHECK(scclTopoGetLocalGpu(comm->topo, netDev, &g2));
if(g2 != -1) {
struct scclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes + g2;
if(peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
*proxyRank = peerGpu->gpu.rank;
*dev = netDev;
return scclSuccess;
}
}
}
}
}
return scclSuccess;
}
scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev) {
*dev = -1;
if(graph && graph->nIntraChannels) {
int n1 = -1;
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
int chan = channelId % graph->nIntraChannels;
for(int i = 0; i < ngpus; i++) {
if(graph->intra[ngpus * chan + i] == rank) {
n1 = graph->intraNets[(ngpus * chan + i) * 2 + type] - 'N';
break;
}
}
if(n1 >= 0 && n1 < nnets) {
*dev = n1;
}
}
return scclSuccess;
}
scclResult_t scclTopoGetLinkType(struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int* inter) {
int interGpus[MAX_XGMI_INTER_GPUS + 1];
int ngpus = system->nodes[GPU].count;
*isXGMI = false;
// check for direct XGMI connection
for(int i = 0; i < ngpus; i++) {
if(system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
struct scclTopoNode* node = system->nodes[GPU].nodes + i;
for(int k = 0; k < system->nodes[GPU].count; k++) {
if(node->paths[GPU][k].count == 1) {
struct scclTopoLink* link = node->paths[GPU][k].list[0];
struct scclTopoNode* remNode = link->remNode;
if(remNode->gpu.dev == cudaDev2) {
*isXGMI = (link->type == LINK_NVL);
if(*isXGMI)
return scclSuccess;
}
}
}
}
}
// try intermediate GPUs
if(maxInter) {
// check if there are intermediate GPUs that are connected to both
bool res1, res2, res3;
int j;
for(j = 0; j < nInter; j++) {
scclTopoGetLinkType(system, inter[j], inter[j + 1], &res1, 0);
if(!res1)
break;
}
if(j < nInter)
return scclSuccess;
if(nInter > 0 && inter != nullptr) {
scclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
if(res2) {
*isXGMI = true;
return scclSuccess;
}
memcpy(interGpus + 1, inter + 1, sizeof(int) * nInter);
}
interGpus[0] = cudaDev1;
// add one more intermediate GPU recursively until reaching max depth
nInter++;
if(nInter + 2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter)
return scclSuccess;
for(int i = 0; i < ngpus; i++) {
int dev = system->nodes[GPU].nodes[i].gpu.dev;
// skip duplicated GPU
if(dev == cudaDev2)
continue;
for(j = 0; j < nInter; j++)
if(dev == interGpus[j])
break;
if(j < nInter)
continue;
// check connectivity with intermediate GPUs
interGpus[nInter] = dev;
scclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
if(res3) {
*isXGMI = true;
return scclSuccess;
}
}
}
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "sccl.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
#define RANK_TO_INDEX(r) ((r) > root ? (r) - 1 : (r))
/* Btree which alternates leaves and nodes.
* Assumes root is 0, which conveniently builds a tree on powers of two,
* (because we have pow2-1 ranks) which lets us manipulate bits.
* Find first non-zero bit, then :
* Find the parent :
* xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
* xx11[0] -> xx10[0] (3,7,11 below)
* Find the children :
* xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
* xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
*
* Illustration :
* 0---------------8
* ______/ \______
* 4 12
* / \ / \
* 2 6 10 \
* / \ / \ / \ \
* 1 3 5 7 9 11 13
*/
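// Worked example for nranks = 14, rank = 6 (binary 110): the first set bit is
// bit = 2, so up = (6 ^ 2) | (2 << 1) = 4; lowbit = 1 then gives the children
// down0 = 6 - 1 = 5 and down1 = 6 + 1 = 7, matching the illustration above.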
scclResult_t scclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
int up, down0, down1;
int bit;
for(bit = 1; bit < nranks; bit <<= 1) {
if(bit & rank)
break;
}
if(rank == 0) {
*u = -1;
*d0 = -1;
// Child rank is > 0 so it has to be our child 1, not 0.
*d1 = nranks > 1 ? bit >> 1 : -1;
return scclSuccess;
}
up = (rank ^ bit) | (bit << 1);
// If smaller than the parent, we are its first child; otherwise we are its second
if(up >= nranks)
up = (rank ^ bit);
*parentChildType = (rank < up) ? 0 : 1;
*u = up;
int lowbit = bit >> 1;
// down0 is always within bounds
down0 = lowbit == 0 ? -1 : rank - lowbit;
down1 = lowbit == 0 ? -1 : rank + lowbit;
// Make sure down1 is within bounds
while(down1 >= nranks) {
down1 = lowbit == 0 ? -1 : rank + lowbit;
lowbit >>= 1;
}
*d0 = down0;
*d1 = down1;
return scclSuccess;
}
/* Build a double binary tree. Take the previous tree for the first tree.
* For the second tree, we use a mirror tree (if nranks is even)
*
* 0---------------8 3----------------11
* ______/ \ / \______
* 4 \ / 7
* / \ \ / / \
* 2 6 10 1 5 9
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 0 2 4 6 8 10
*
* or shift it by one rank (if nranks is odd).
*
* 0---------------8 1---------------9
* ______/ \______ ______/ \______
* 4 12 5 0
* / \ / / \ /
* 2 6 10 3 7 11
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 10 12
*/
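// Example: with nranks = 12 (even) the second tree mirrors ranks to
// nranks - 1 - rank, so rank 0 takes rank 11's position; with nranks = 13
// (odd) ranks shift by one, so rank 0 reuses rank 12's position from the
// first tree with parent and children shifted accordingly.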
scclResult_t scclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
// First tree ... use a btree
scclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
// Second tree ... mirror or shift
if(nranks % 2 == 1) {
// shift
int shiftrank = (rank - 1 + nranks) % nranks;
int u, d0, d1;
scclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : (u + 1) % nranks;
*d1_0 = d0 == -1 ? -1 : (d0 + 1) % nranks;
*d1_1 = d1 == -1 ? -1 : (d1 + 1) % nranks;
} else {
// mirror
int u, d0, d1;
scclGetBtree(nranks, nranks - 1 - rank, &u, &d0, &d1, parentChildType1);
*s1 = u == -1 ? -1 : nranks - 1 - u;
*d1_0 = d0 == -1 ? -1 : nranks - 1 - d0;
*d1_1 = d1 == -1 ? -1 : nranks - 1 - d1;
}
return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "core.h"
#include "devcomm.h"
#include "comm.h"
#include "topo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
SCCL_PARAM(Nthreads, "NTHREADS", -2);
SCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
static int getNthreads(const char* name, int env, int min, int max, int def, int WarpSize) {
int nt = env;
if(nt > 0) {
if(nt % WarpSize != 0) {
WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WarpSize);
nt = max;
} else if(nt > max) {
WARN("Invalid %s %d (maximum %d).", name, nt, max);
nt = max;
} else if(nt < min) {
WARN("Invalid %s %d (minimum %d).", name, nt, min);
nt = min;
}
} else {
nt = def;
}
return nt;
}
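// For instance (illustrative values, WarpSize = 64): getNthreads("SCCL_NTHREADS",
// 96, 256, 1024, 512, 64) warns because 96 is not a multiple of the warp size
// and falls back to the maximum, 1024; a negative env value simply returns the
// default, 512.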
scclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
int def, set;
if(str[0] == '^') {
def = 1;
set = 0;
str++;
} else {
def = 0;
set = 1;
}
for(int i = 0; i < nelems; i++)
list[i] = def;
char* tokStr = strdup(str);
char* tmpStr;
char* token = strtok_r(tokStr, ",", &tmpStr);
while(token) {
for(int i = 0; i < nelems; i++)
if(strcasecmp(token, elems[i]) == 0)
list[i] = set;
token = strtok_r(NULL, ",", &tmpStr);
}
free(tokStr);
return scclSuccess;
}
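// Usage sketch (hypothetical element list): with elems = {"LL", "LL128",
// "Simple"}, parseList("LL,Simple", elems, 3, list) sets list = {1, 0, 1},
// while a leading '^' inverts the default: parseList("^LL", elems, 3, list)
// yields {0, 1, 1}.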
// Latencies in us, Bandwidths in GB/s
// Each row is { LL, LL128, Simple } for one algorithm.
static const float baseLat[SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS] = {{12.0, 12.0, 17.0}, // Tree
{12.0, 12.0, 17.0}, // Ring
{12.0, 12.0, 17.0}, // CollNet Direct
{12.0, 12.0, 17.0}, // CollNet Chain
{0, 0, 0}, // NVLS
{0, 0, 0}}; // NVLS Tree
// NVLink, PCI, Network
#define SCCL_HW_NVLINK 0
#define SCCL_HW_PCI 1
#define SCCL_HW_NET 2
struct tuningModel {
float hwLat[3][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
float bwRatio[2][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
float treeCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
float ringCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
};
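// Indexing sketch: hwLat[SCCL_HW_NVLINK][SCCL_ALGO_RING][SCCL_PROTO_SIMPLE] is
// the per-step NVLink latency of the Simple ring protocol; bwRatio[0] holds the
// 2-node ratios and bwRatio[1] those for larger jobs. The 27-entry correction
// tables appear to be indexed by a message-size bucket.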
static struct tuningModel tuning_model_0{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
/* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 1.4},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {11.8, 18.2, 20.8},
/* Ring (LL/LL128/Simple)*/ {9.5, 19.8, 15.1},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 11.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 18.2},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.28, 0.22, 0.91},
/* Ring (LL/LL128/Simple)*/ {0.31, 0.34, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.04, 0.22, 0.95},
/* Ring (LL/LL128/Simple)*/ {0.04, 0.34, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.2, 0.1, 0.1, 0.9, 0.3, 0.4, 0.1, 0.2, 0.4, 0.2, 0.1, 0.3, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.3, 1.0, 0.1, 0.5, 1.0, 0.9, 1.0, 1.0, 1.0, 0.3, 0.1, 0.4, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
// { 0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, },
{
0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4,
},
},
.ringCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.2, 0.3, 0.5, 0.3, 0.1, 0.5, 0.5, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3,
},
{
1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_1{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
/* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {33.0, 33.0, 15.8},
/* Ring (LL/LL128/Simple)*/ {5.1, 5.1, 68.8},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
/* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.15, 1.00, 0.42},
/* Ring (LL/LL128/Simple)*/ {0.20, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1,
},
{
0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1,
},
// { 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.3, 0.3, },
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 0.4, 0.4, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1,
},
},
.ringCorrectionFactor =
{
{
1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1,
},
{
1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1,
},
{
0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_2{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
/* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {27.9, 27.9, 15.8},
/* Ring (LL/LL128/Simple)*/ {12.1, 12.1, 68.8},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
/* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.07, 1.00, 0.42},
/* Ring (LL/LL128/Simple)*/ {0.08, 1.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
{
0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
// { 1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.9, 0.9, 1.0, 1.0, 1.0, },
{
1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4,
},
},
.ringCorrectionFactor =
{
{
0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_3{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {0.8, 0.0, 2.5},
/* Ring (LL/LL128/Simple)*/ {0.8, 0.0, 3.6},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
/* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {12.5, 0.0, 22.4},
/* Ring (LL/LL128/Simple)*/ {9.5, 0.0, 19.8},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 12.5},
/* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 1.75},
/* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 0.96},
/* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.2, 1.0, 0.9, 1.0, 0.6, 0.4, 0.6, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
{
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
},
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.8, 0.9, 0.7, 0.7, },
{
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.3, 0.3, 0.4, 0.3, 0.3,
},
},
.ringCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.2, 0.1, 0.4, 0.4, 0.2, 0.2, 0.3, 0.7, 0.5, 0.4, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
{
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel tuning_model_4{
.hwLat =
{
/* NVLINK */
{/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
/* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
/* CollNetDirect (Simple)*/ {0.8, 1.4, 2.5},
/* CollNetChain (Simple)*/ {0.8, 1.4, 2.5},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* PCI */
{/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
/* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* NET */
{/* Tree (LL/LL128/Simple)*/ {32.2, 34.4, 47.6},
/* Ring (LL/LL128/Simple)*/ {35.4, 87.8, 209.2},
/* CollNetDirect (Simple)*/ {0.0, 0.0, 47.6},
/* CollNetChain (Simple)*/ {0.0, 0.0, 47.6},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.bwRatio =
{
/* 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.61},
/* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
/* more than 2 nodes */
{/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.08},
/* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
/* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
/* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
/* NVLS */ {0, 0, 0},
/* NVLS Tree */ {0, 0, 0}},
},
.treeCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.1, 0.2, 0.4, 0.6, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.2, 1.0, 0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
},
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.5, 0.6, 0.6, 0.5, 0.6, 0.6, 0.6, 0.7, },
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, },
},
.ringCorrectionFactor =
{
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
},
{
0.4, 0.5, 0.5, 0.4, 0.4, 0.4, 0.4, 0.2, 0.2, 0.1, 0.3, 1.0, 1.0, 0.7, 0.8, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.5, 0.4, 0.3, 0.3,
},
{
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
},
},
};
static struct tuningModel rcclTuningModel[] = {
tuning_model_0,
tuning_model_1,
tuning_model_2,
tuning_model_3,
tuning_model_4,
};
/* Array indexes used below */
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2
// LL128 max BW per channel
static const double llMaxBws[3][3] = {
    /* Volta-N1/Intel-N2/Intel-N4 */ {39.0, 39.0, 20.4},
    /* Ampere-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
    /* Hopper-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}};
static const double perChMaxRingLL128Bws[3][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
};
static const double perChMaxTreeLL128Bws[3][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
};
static const double perChMaxTreeBws[3][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
};
// Network post overhead in ns (1000 = 1 us)
SCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
static float getNetOverhead(struct scclComm* comm) {
if(scclParamNetOverhead() != -2)
return scclParamNetOverhead() * .001;
int cpuArch, cpuVendor, cpuModel;
SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
if(cpuArch == SCCL_TOPO_CPU_ARCH_X86 && cpuVendor == SCCL_TOPO_CPU_VENDOR_INTEL)
return 1.0;
if(cpuArch == SCCL_TOPO_CPU_ARCH_X86 && cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD)
return 2.0;
else
return 1.0;
}
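// Worked example (illustrative): with SCCL_NET_OVERHEAD unset, the parameter stays at its
// -2 sentinel and the CPU-based defaults above apply (e.g. 2.0 us on AMD x86 hosts).
// Setting SCCL_NET_OVERHEAD=1500 makes scclParamNetOverhead() return 1500 (ns), so
// getNetOverhead() yields 1500 * .001 = 1.5 us.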
scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs) {
int simpleDefaultThreads = (graphs[SCCL_ALGO_RING]->bwIntra * graphs[SCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : SCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE] =
getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] =
getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL] =
comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_LL] =
getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL128] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL128] =
getNthreads("SCCL_LL128_NTHREADS", scclParamLl128Nthreads(), 4 * comm->WarpSize, SCCL_LL128_MAX_NTHREADS, SCCL_LL128_MAX_NTHREADS, comm->WarpSize);
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
if(nRanks <= 1)
return scclSuccess;
int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
int cpuArch, cpuVendor, cpuModel;
SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
int index2 = nNodes <= 2 ? nNodes - 1 : 2;
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
int index1 = nNodes == 1 ? compCapIndex : cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
double llMaxBw = llMaxBws[index1][index2];
double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
    // De-penalize Tree/Simple latency on Power systems to favor Tree over Ring
// if (cpuArch == SCCL_TOPO_CPU_ARCH_POWER) hwLat[SCCL_HW_PCI][SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = hwLat[SCCL_HW_PCI][SCCL_ALGO_RING][SCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
int intraHw[SCCL_NUM_ALGORITHMS], hw[SCCL_NUM_ALGORITHMS];
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? SCCL_HW_NVLINK : SCCL_HW_PCI;
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
hw[a] = nNodes == 1 ? intraHw[a] : SCCL_HW_NET;
for(int coll = 0; coll < SCCL_NUM_FUNCTIONS; coll++) {
int nsteps = coll == scclFuncAllReduce ? 2 * (nRanks - 1) : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nRanks - 1 : nRanks;
int nInterSteps = coll == scclFuncAllReduce ? (nNodes > 1 ? 2 * nNodes : 0)
: coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nNodes - 1
: nNodes;
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
if(coll == scclFuncBroadcast && a != SCCL_ALGO_RING)
continue;
if(coll == scclFuncReduce && a != SCCL_ALGO_RING)
continue;
if(coll == scclFuncReduceScatter && a != SCCL_ALGO_RING)
continue;
if(coll == scclFuncAllGather && a != SCCL_ALGO_RING)
continue;
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
if((a == SCCL_ALGO_NVLS || a == SCCL_ALGO_NVLS_TREE) && p != SCCL_PROTO_SIMPLE)
continue;
int collnet = (a == SCCL_ALGO_COLLNET_DIRECT || a == SCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
// INFO(SCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", scclAlgoStr[a], scclProtoStr[p], busBw,
// comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);
// Various model refinements
if(nNodes <= 2)
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
else
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
if(a == SCCL_ALGO_COLLNET_DIRECT && p == SCCL_PROTO_SIMPLE && minCompCap >= 90)
busBw *= .85;
// Convert bus BW to algorithm BW
float ratio;
if(a == SCCL_ALGO_RING)
ratio = (1.0 * nRanks) / nsteps;
else if(a == SCCL_ALGO_NVLS)
ratio = 5.0 / 6.0;
else if(a == SCCL_ALGO_NVLS_TREE)
ratio = .70 * nNodes / (2 * (nNodes - 1));
else
ratio = .5;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
// if (nNodes > 1 && p == SCCL_PROTO_LL) intraLat *= 1.8;
if(p == SCCL_PROTO_SIMPLE)
interLat += graphs[a]->latencyInter;
if(a == SCCL_ALGO_RING) {
float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
if((coll == scclFuncReduce || coll == scclFuncBroadcast)) {
if(graphs[a]->sameChannels) {
comm->latencies[coll][a][p] += lat;
} else {
if(p == SCCL_PROTO_SIMPLE)
lat = rcclTuningModel[comm->topo->tuning]
.hwLat[hw[a]][SCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
comm->latencies[coll][a][p] += nsteps * lat;
}
} else {
// Inter-node rings still have to launch nsteps * net overhead.
float netOverhead = 0.0;
if(nNodes > 1) {
netOverhead = getNetOverhead(comm);
if(p == SCCL_PROTO_SIMPLE)
netOverhead *= 3;
}
intraLat = std::max(intraLat, netOverhead);
comm->latencies[coll][a][p] += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat;
}
} else if(a == SCCL_ALGO_TREE) {
comm->latencies[coll][a][p] += 2 * ((nRanks / nNodes - 1) * intraLat + log2i(nNodes) * interLat);
} else if(a == SCCL_ALGO_COLLNET_DIRECT) {
comm->latencies[coll][a][p] +=
2 * (std::min(1, (nRanks / nNodes - 1)) * intraLat + (nRanks / nNodes - 1) * 0.5) + interLat; // Add 0.5 arity serialization latency
} else if(a == SCCL_ALGO_COLLNET_CHAIN) {
comm->latencies[coll][a][p] += 2 * (nRanks / nNodes - 1) * intraLat + interLat;
} else if(a == SCCL_ALGO_NVLS) {
if(nNodes > 1)
comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
} else if(a == SCCL_ALGO_NVLS_TREE) {
comm->latencies[coll][a][p] += 2 * (nNodes - 1) * rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
}
}
}
}
// Protocols/Algorithms enable/disable, and user overrides.
    // All are enabled except LL128, which is enabled by default only in certain cases.
int protoEnable[SCCL_NUM_PROTOCOLS] = {1, 2, 1};
int algoEnable[SCCL_NUM_ALGORITHMS] = {1, 1, 1, 1, 1, 1};
const char* protoStr = getenv("SCCL_PROTO");
if(protoStr) {
INFO(SCCL_ENV, "SCCL_PROTO set by environment to %s", protoStr);
SCCLCHECK(parseList(protoStr, scclProtoStr, SCCL_NUM_PROTOCOLS, protoEnable));
}
const char* algoStr = getenv("SCCL_ALGO");
if(algoStr) {
INFO(SCCL_ENV, "SCCL_ALGO set by environment to %s", algoStr);
SCCLCHECK(parseList(algoStr, scclAlgoStr, SCCL_NUM_ALGORITHMS, algoEnable));
}
if(comm->nNodes == 1)
algoEnable[SCCL_ALGO_NVLS_TREE] = 0;
// Disable CollNet if it is not supported
if(comm->collNetSupport == 0) {
algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
algoEnable[SCCL_ALGO_COLLNET_CHAIN] = 0;
if(comm->nNodes > 1)
algoEnable[SCCL_ALGO_NVLS] = 0;
// If user has hard set SCCL_ALGO=COLLNET, ignore it
if(algoEnable[SCCL_ALGO_RING] == 0 && algoEnable[SCCL_ALGO_TREE] == 0 && algoEnable[SCCL_ALGO_NVLS] == 0 && algoEnable[SCCL_ALGO_NVLS_TREE] == 0) {
algoEnable[SCCL_ALGO_RING] = algoEnable[SCCL_ALGO_TREE] = 1;
if(comm->rank == 0)
WARN("CollNet is not supported or fails to initialize, ignoring SCCL_ALGO=COLLNET");
}
} else {
// Disable CollNet+Direct if not on an NVSwitch system
int nvsCount = 0;
SCCLCHECK(scclTopoGetNvsCount(comm->topo, &nvsCount));
if(nvsCount == 0)
algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
}
for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++)
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
// Disable LL protocol on gfx11xx
int pEnable = protoEnable[p];
if(pEnable == 2 && p == SCCL_PROTO_LL128) {
#if defined(ENABLE_LL128)
// Enable LL128 by default only on gfx90a with available tuning table
pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
(IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled)
? 1
: 0;
#else
pEnable = 0;
#endif
}
if(pEnable == 0)
comm->bandwidths[c][a][p] = 0;
                // Never disable ring for non-allreduce operations. That allows running real apps with SCCL_ALGO=TREE.
if(a == SCCL_ALGO_RING && c != scclFuncAllReduce)
continue;
if(algoEnable[a] == 0)
comm->bandwidths[c][a][p] = 0;
}
if(comm->rank == 0) {
char line[1024];
for(int block = 0; block < 2; block++) {
sprintf(line, " Algorithm |");
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
sprintf(line + strlen(line), " %14s %14s %14s |", "", scclAlgoStr[a], "");
}
INFO(SCCL_TUNING, "%s", line);
sprintf(line, " Protocol |");
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
sprintf(line + strlen(line), " %14s |", scclProtoStr[p]);
}
}
INFO(SCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
sprintf(line + strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(SCCL_TUNING, "%s", line);
for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", scclFuncStr[c]);
for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
sprintf(line + strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
}
INFO(SCCL_TUNING, "%s", line);
}
}
}
// Set per-thread amount of work before we increase nThreads and nChannels
for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
comm->threadThresholds[a][SCCL_PROTO_LL] = SCCL_LL_THREAD_THRESHOLD;
comm->threadThresholds[a][SCCL_PROTO_LL128] = SCCL_LL128_THREAD_THRESHOLD;
comm->threadThresholds[a][SCCL_PROTO_SIMPLE] = SCCL_SIMPLE_THREAD_THRESHOLD;
}
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL] *= nRanks;
comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] = 256;
comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE] = 256;
// Override defaults with user env
char* str = getenv("SCCL_THREAD_THRESHOLDS");
if(str) {
INFO(SCCL_ENV, "SCCL_THREAD_THRESHOLDS set by environment to %s", str);
ssize_t t[2][SCCL_NUM_PROTOCOLS] = {{-2, -2, -2}, {-2, -2, -2}};
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0] + 1, t[0] + 2, t[1], t[1] + 1, t[1] + 2);
for(int a = 0; a < 2; a++) {
for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
if(t[a][p] >= 0)
comm->threadThresholds[a][p] = t[a][p];
}
}
}
INFO(SCCL_INIT,
"threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL],
comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL128],
comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE],
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL],
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL128],
comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE],
comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE],
comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE]);
return scclSuccess;
}
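// Usage note (illustrative): SCCL_THREAD_THRESHOLDS takes up to six values, the Tree
// thresholds followed by the Ring thresholds, each ordered LL/LL128/Simple. For example
//   SCCL_THREAD_THRESHOLDS="65536 65536 524288 65536 65536 524288"
// overrides all six; any negative value leaves the corresponding default computed above
// in place.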
scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
float lat = info->comm->latencies[info->coll][algorithm][protocol];
if(bw == 0) {
*time = -1.0;
return scclSuccess;
}
int logSize = log2i(info->nBytes >> 6);
if(algorithm == SCCL_ALGO_TREE) {
if(logSize < 27)
bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][logSize];
else
bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][26];
} else if(algorithm == SCCL_ALGO_RING && info->comm->nNodes > 1) {
if(logSize < 27)
bw *= rcclTuningModel[info->comm->topo->tuning].ringCorrectionFactor[protocol][logSize];
else
bw *= rcclTuningModel[info->comm->topo->tuning].ringCorrectionFactor[protocol][26];
}
// Tree pipelining saves latency in aggregation cases
int latCount = algorithm == SCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, SCCL_MAX_WORK_ELEMENTS);
*time = lat * latCount + (info->nBytes) / (1000 * bw);
return scclSuccess;
}
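// Illustrative sketch (not part of this file) of how a caller could use
// scclTopoGetAlgoTime to pick the fastest algorithm/protocol pair; it follows the
// convention above that a negative time means the pair is disabled:
//
//   float minTime = 3600000000.0; // start from a huge value
//   int bestAlgo = -1, bestProto = -1;
//   for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
//       for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
//           float time;
//           SCCLCHECK(scclTopoGetAlgoTime(info, a, p, 1, &time));
//           if(time >= 0 && time < minTime) { minTime = time; bestAlgo = a; bestProto = p; }
//       }
//
// For the correction-factor lookup above: a 1 MiB message gives
// logSize = log2i(1048576 >> 6) = 14, so index 14 of the ring/tree correction table
// scales the bandwidth; messages of 8 GiB and beyond clamp to index 26.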
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <stdint.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include "base.h"
#include "hardware_utils.h"
namespace sccl {
namespace hardware {} // namespace hardware
} // namespace sccl
@@ -2,7 +2,13 @@
 #include <stdint.h>
 #include "base.h"
+#include "comm.h"
 namespace sccl {
-namespace hardware {} // namespace hardware
+namespace hardware {
+namespace ops {
+////
+} // namespace ops
+} // namespace hardware
 } // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "ibvwrap.h"
#include "net_utils.h"
namespace sccl {
namespace hardware {
namespace net {
namespace device {
//////////////////////////////////
extern scclNet_t scclNetIb;
} // namespace device
} // namespace net
} // namespace hardware
} // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "base.h"
#include "net_utils.h"
namespace sccl {
namespace hardware {
namespace net {
namespace host {
//////////////////////////////////
extern scclNet_t scclNetSocket;
} // namespace host
} // namespace net
} // namespace hardware
} // namespace sccl
#include <pthread.h>
#include <stdlib.h>
#include <poll.h>
#include <limits.h>
#include <fcntl.h>
#include <thread> // for std::this_thread::sleep_for
#include "ipc_socket.h"
namespace sccl {
namespace hardware {
namespace net {
namespace ipc_socket {
//////////////////////////////////////// scclIpcSocket member functions ////////////////////////////////////////
scclIpcSocket::scclIpcSocket(int localRank, int localRanks, uint64_t hash, volatile uint32_t* abortFlag)
: localRank(localRank), localRanks(localRanks), ipc_hash(hash) {
scclResult_t res;
handle = new struct scclIpcSocketHandle();
if(localRanks > 0) {
        pthread_pool = new ThreadPool(localRanks * 2); // half of the threads send, the other half receive
}
SCCLCHECKGOTO(scclIpcSocketInit(abortFlag), res, failure);
return;
failure:
WARN("scclIpcSocket init failed");
return;
}
scclIpcSocket::~scclIpcSocket() {
    // Release the thread pool
if(pthread_pool) {
delete(pthread_pool);
}
    // Release the handle
if(handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
if(handle->fd >= 0) {
close(handle->fd);
}
delete(handle);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
scclResult_t scclIpcSocket::scclIpcSocketInit(volatile uint32_t* abortFlag) {
    // Temporaries
    int fd = -1;
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    // Initialize the handle members
    handle->fd = -1;
    handle->socketName[0] = '\0';
    // Create a Unix domain socket. AF_UNIX selects the local (IPC) address family;
    // SOCK_DGRAM gives a connectionless, message-oriented socket, as opposed to the
    // stream-oriented SOCK_STREAM.
    if((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
        WARN("UDS: Socket creation error : %d", errno);
        return scclSystemError;
    }
    // Zero the address structure so no stale data remains
    bzero(&my_cliaddr, sizeof(my_cliaddr));
    my_cliaddr.sun_family = AF_UNIX;
    // Create a unique name for the socket
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, localRank, ipc_hash);
    if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot bind provided name to socket. Name too large");
        return scclInternalError;
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Creating socket %s", temp_addr);
    // Set the socket path
    strncpy(my_cliaddr.sun_path, temp_addr, len);
    my_cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
    // Bind the socket
    if(bind(fd, (struct sockaddr*)&my_cliaddr, sizeof(my_cliaddr)) < 0) {
        WARN("UDS: Binding to socket %s failed : %d", temp_addr, errno);
        close(fd);
        return scclSystemError;
    }
    // Fill in the handle members
    handle->fd = fd;
    strcpy(handle->socketName, temp_addr);
    // Store the abort flag
    handle->abortFlag = abortFlag;
    // Mark the socket as non-blocking
    if(handle->abortFlag) {
        int flags;
        EQCHECK(flags = fcntl(fd, F_GETFL), -1);
        SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
    }
    return scclSuccess;
}
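// Note on the abstract-namespace trick used above: overwriting sun_path[0] with '\0'
// turns "/tmp/sccl-socket-<rank>-<hash>" into an abstract socket name, so nothing is
// created on the filesystem and the address vanishes when the descriptor is closed.
// For example (illustrative), local rank 3 with hash 0xdeadbeef binds the abstract
// name "\0tmp/sccl-socket-3-deadbeef".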
/**
 * Set the abort flag and update the socket's non-blocking mode.
 *
 * @param flag pointer to the abort flag. If non-null, the socket is put into
 *             non-blocking mode; if null, it is restored to blocking mode.
 * @note The function only acts when the handle is valid.
 */
scclResult_t scclIpcSocket::setAbortFlag(volatile uint32_t* flag) {
if(handle) {
handle->abortFlag = flag;
if(flag) {
int flags;
EQCHECK(flags = fcntl(handle->fd, F_GETFL), -1);
SYSCHECK(fcntl(handle->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
} else {
int flags;
EQCHECK(flags = fcntl(handle->fd, F_GETFL), -1);
SYSCHECK(fcntl(handle->fd, F_SETFL, flags & ~O_NONBLOCK), "fcntl");
}
}
return scclSuccess;
}
// Return the abort flag
volatile uint32_t* scclIpcSocket::getAbortFlag() const { return handle ? handle->abortFlag : nullptr; }
/**
 * Set the IPC socket timeout.
 *
 * @param timeout_ms timeout in milliseconds
 * @return scclSuccess on success
 */
scclResult_t scclIpcSocket::setTimeout(int timeout_ms) {
timeoutMs = timeout_ms;
return scclSuccess;
}
ThreadPool* scclIpcSocket::getPthreadPool() { return pthread_pool; }
//////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Send a file descriptor over the Unix domain socket.
 *
 * @param sendFd file descriptor to send
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (e.g. address too long or abort flag set)
 *         - scclSystemError: system call error
 *
 * @note Uses the Linux abstract socket namespace trick (sun_path[0] set to '\0').
 *       The descriptor is passed via the SCM_RIGHTS ancillary mechanism.
 *       The send is retried in a loop until it succeeds or fails hard.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendFd(const int sendFd, int dst_rank) {
    // Temporary string holding the destination address
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    // Format the destination address
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
    // Make sure the address string fits
    if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot connect to provided name for socket. Name too large");
        return scclInternalError;
    }
    // Log the descriptor transfer
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending fd %d to UDS socket %s/fd:%d", sendFd, temp_addr, handle->fd);
    // Message header and iovec
    struct msghdr msg;
    struct iovec iov[1];
    // Union guarantees the alignment required for the control buffer
    union {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    struct sockaddr_un cliaddr;
    // Construct the client address used to send the shared handle
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
    // Fill in the control (ancillary) part of the message header
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    cmptr = CMSG_FIRSTHDR(&msg);
    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
    cmptr->cmsg_level = SOL_SOCKET;
    cmptr->cmsg_type = SCM_RIGHTS;
    // Copy the file descriptor to send into the control message
    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
    // Fill in the address part of the message header
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(struct sockaddr_un);
    // The iovec carries one dummy payload byte
    iov[0].iov_base = (void*)"";
    iov[0].iov_len = 1;
    // Attach the iovec to the message header
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    // Initialize the message flags
    msg.msg_flags = 0;
    ssize_t sendResult;
    // Retry sending until the message goes out
    while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Sending data over socket %s failed : %d", temp_addr, errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
    }
    // Success
    return scclSuccess;
}
/**
 * @brief Receive a file descriptor over the IPC socket.
 *
 * Uses the recvmsg system call to receive a file descriptor from the socket,
 * retrying in a loop until it succeeds or a hard error occurs. The received
 * descriptor is returned through the recvFd parameter.
 *
 * @param recvFd pointer receiving the incoming file descriptor
 * @return scclResult_t operation result:
 *         - scclSuccess: descriptor received
 *         - scclSystemError: system call failure
 *         - scclInternalError: operation aborted
 *
 * @note EAGAIN, EWOULDBLOCK and EINTR are retried; any other error fails the call.
 *       The received control message must be of level SOL_SOCKET and type SCM_RIGHTS.
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvFd(int* recvFd) {
    // Set up the message header and iovec
    struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
    struct iovec iov[1];
    // Union guarantees the alignment required for the control buffer
    union {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    char dummy_buffer[1];
    int ret;
    // Fill in the control (ancillary) part of the message header
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    // The iovec receives the one dummy payload byte
    iov[0].iov_base = (void*)dummy_buffer;
    iov[0].iov_len = sizeof(dummy_buffer);
    // Attach the iovec to the message header
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    // Retry receiving until a message arrives
    while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Receiving data over socket failed : %d", errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
    }
    // Validate the received control message
    if(((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
        // Wrong level or type means this is not an SCM_RIGHTS transfer
        if((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
            WARN("UDS: Receiving data over socket failed");
            return scclSystemError;
        }
        // Copy the received file descriptor into recvFd
        memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
    } else {
        // No control message was received
        WARN("UDS: Receiving data over socket %s failed", handle->socketName);
        return scclSystemError;
    }
    // Log the received file descriptor
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
    // Success
    return scclSuccess;
}
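// Round-trip sketch (illustrative; the ranks, the file and the scclIpcSocket instances
// are hypothetical): rank 0 passes an open descriptor to rank 1. The kernel duplicates
// it through SCM_RIGHTS, so the number received may differ from the one sent while
// referring to the same open file description.
//
//   // on local rank 0:
//   int fd = open("/dev/shm/sccl-buf", O_RDWR);  // hypothetical resource
//   SCCLCHECK(sock.scclIpcSocketSendFd(fd, /*dst_rank=*/1));
//   // on local rank 1:
//   int recvFd = -1;
//   SCCLCHECK(sock.scclIpcSocketRecvFd(&recvFd));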
/**
 * @brief Send data to the given destination rank over the IPC socket.
 *
 * @param data pointer to the data to send
 * @param dataLen length of the data to send
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (e.g. socket name too long or abort flag set)
 *         - scclSystemError: system call error (e.g. poll timeout or sendmsg failure)
 *
 * @note Uses the Linux abstract socket namespace; poll is used to make sure the socket
 *       is writable before sending. EAGAIN/EWOULDBLOCK/EINTR errors are retried.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendData(const void* data, size_t dataLen, int dst_rank) {
    // Build the destination address string
char temp_addr[SCCL_IPC_SOCKNAME_LEN];
int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
WARN("UDS: Unable to connect to the provided socket name. Name too long");
return scclInternalError;
}
    // Set up the message structures
struct msghdr msg;
struct iovec iov[1];
struct sockaddr_un cliaddr;
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
iov[0].iov_base = (void*)data;
iov[0].iov_len = dataLen;
msg.msg_name = (void*)&cliaddr;
msg.msg_namelen = sizeof(cliaddr);
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
    // Wait with poll until the socket is writable
struct pollfd pfd;
pfd.fd = handle->fd;
pfd.events = POLLOUT;
int pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to send data to socket %s", temp_addr);
} else {
WARN("UDS: Error occurred while polling socket %s for writability : %d", temp_addr, errno);
}
return scclSystemError;
}
ssize_t sendResult;
while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Error occurred while sending data through socket %s : %d", temp_addr, errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
        // If sendmsg failed with EAGAIN or EWOULDBLOCK, poll again
pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to send data to socket %s", temp_addr);
} else {
WARN("UDS: Error occurred while polling socket %s for writability : %d", temp_addr, errno);
}
return scclSystemError;
}
}
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Successfully sent %zu bytes of data through UDS socket %s", dataLen, temp_addr);
return scclSuccess;
}
/**
 * @brief Receive data over the IPC socket.
 *
 * Waits with poll until the socket is readable, then receives the data with recvmsg.
 * Supports timeouts and abort handling; returns the matching error code on failure
 * or timeout.
 *
 * @param buffer pointer to the receive buffer
 * @param bufferLen length of the buffer
 * @param receivedLen number of bytes actually received (output parameter)
 * @return scclResult_t operation result:
 *         - scclSuccess: data received
 *         - scclSystemError: system call error
 *         - scclInternalError: aborted via the abort flag
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvData(void* buffer, size_t bufferLen, size_t* receivedLen) {
    // Set up the message structures
struct msghdr msg = {0};
struct iovec iov[1];
iov[0].iov_base = buffer;
iov[0].iov_len = bufferLen;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
    // Wait with poll until the socket is readable
struct pollfd pfd;
pfd.fd = handle->fd;
pfd.events = POLLIN;
int pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to receive data from socket %s", handle->socketName);
} else {
WARN("UDS: Error occurred while polling socket %s for readability : %d", handle->socketName, errno);
}
return scclSystemError;
}
int ret;
while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Error occurred while receiving data through socket %s : %d", handle->socketName, errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
        // If recvmsg failed with EAGAIN or EWOULDBLOCK, poll again
pollResult = poll(&pfd, 1, timeoutMs);
if(pollResult <= 0) {
if(pollResult == 0) {
WARN("UDS: Timeout occurred while waiting to receive data from socket %s", handle->socketName);
} else {
WARN("UDS: Error occurred while polling socket %s for readability : %d", handle->socketName, errno);
}
return scclSystemError;
}
}
if(ret > 0) {
*receivedLen = ret;
        INFO(SCCL_LOG_BOOTSTRAP, "UDS: Successfully received %zu bytes of data from socket %s", *receivedLen, handle->socketName);
return scclSuccess;
} else {
WARN("UDS: Error occurred while receiving data through socket %s", handle->socketName);
return scclSystemError;
}
}
/**
 * @brief Send data to the given rank over the Unix domain socket, non-blocking variant.
 *
 * @param data pointer to the data to send
 * @param dataLen length of the data in bytes
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (address too long or abort flag set)
 *         - scclSystemError: system call error
 *
 * @note Uses the Linux abstract socket namespace.
 *       Keeps retrying until the send succeeds or an error occurs, waiting with poll
 *       for the socket to become writable between attempts.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendDataNonBlocking(const void* data, size_t dataLen, int dst_rank) {
    // Temporary string holding the destination socket address
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    // Format the destination address
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
    // Fail if the address string is too long
    if(len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot connect to provided name for socket. Name too large");
        return scclInternalError;
    }
    // Log the outgoing transfer
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending %zu bytes of data to UDS socket %s", dataLen, temp_addr);
    // Set up the message header
    struct msghdr msg;
    struct iovec iov[1];
    struct sockaddr_un cliaddr;
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket namespace trick
    iov[0].iov_base = (void*)data;
    iov[0].iov_len = dataLen;
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(cliaddr);
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_flags = 0;
    ssize_t sendResult;
    // Try to send; on failure wait until the socket becomes writable and retry
    while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Sending data over socket %s failed : %d", temp_addr, errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
        // Wait with poll until the socket becomes writable
        struct pollfd pfd;
        pfd.fd = handle->fd;
        pfd.events = POLLOUT;
        int pollResult = poll(&pfd, 1, -1); // wait indefinitely
        if(pollResult <= 0) {
            WARN("UDS: Polling for socket %s to become writable failed : %d", temp_addr, errno);
            return scclSystemError;
        }
    }
    return scclSuccess;
}
/**
 * @brief Receive data over the IPC socket, non-blocking variant.
 *
 * Receives data over the UDS socket; when no data is readable it waits until the
 * socket becomes readable or an error occurs.
 *
 * @param buffer pointer to the receive buffer
 * @param bufferLen length of the buffer
 * @param receivedLen number of bytes actually received (output parameter)
 * @return scclResult_t operation result:
 *         - scclSuccess: data received
 *         - scclSystemError: system call error
 *         - scclInternalError: interrupted via the abort flag
 *
 * @note Implemented with the recvmsg and poll system calls.
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvDataNonBlocking(void* buffer, size_t bufferLen, size_t* receivedLen) {
    // Set up the message header and iovec
    struct msghdr msg = {0};
    struct iovec iov[1];
    iov[0].iov_base = buffer;
    iov[0].iov_len = bufferLen;
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    int ret;
    // Try to receive; on failure wait until the socket becomes readable and retry
    while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
        // On errors other than EAGAIN, EWOULDBLOCK or EINTR, warn and fail
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Receiving data over socket failed : %d", errno);
            return scclSystemError;
        }
        // If the abort flag has been set, return an internal error
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
        // Wait with poll until the socket becomes readable
        struct pollfd pfd;
        pfd.fd = handle->fd;
        pfd.events = POLLIN;
        int pollResult = poll(&pfd, 1, -1); // wait indefinitely
        if(pollResult <= 0) {
            WARN("UDS: Polling for socket %s to become readable failed : %d", handle->socketName, errno);
            return scclSystemError;
        }
    }
    // Record the number of received bytes and return success
    if(ret > 0) {
        *receivedLen = ret;
        INFO(SCCL_LOG_BOOTSTRAP, "UDS: Received %zu bytes of data from socket %s", *receivedLen, handle->socketName);
        return scclSuccess;
    } else {
        WARN("UDS: Receiving data over socket %s failed", handle->socketName);
        return scclSystemError;
    }
}
/**
 * @brief Allgather implemented over the IPC sockets.
 *
 * Uses the thread pool to send and receive in parallel, implementing an allgather
 * collective among the local ranks.
 *
 * @param sendData pointer to the send buffer
 * @param recvData pointer to the receive buffer
 * @param dataLen per-rank data length in bytes
 * @param wait whether to wait for all transfers to complete
 * @return scclResult_t operation result (scclSuccess on success)
 *
 * @note 1. The local rank's own data is copied directly, not transferred.
 *       2. Packet layout: [sender rank (int)][payload].
 *       3. The receive buffer must be preallocated with size localRanks * dataLen.
 */
scclResult_t scclIpcSocket::scclIpcSocketAllgather(const void* sendData, void* recvData, size_t dataLen, bool wait) {
    if(pthread_pool == nullptr || localRanks <= 0) {
        WARN("scclIpcSocket init error!");
        return scclInternalError;
    }
    std::vector<std::future<void>> futures;
    // Use the thread pool to send and receive in parallel
    for(int i = 0; i < localRanks; ++i) {
        if(i != localRank) {
            auto sendTask = [this, sendData, dataLen, i]() {
                // Compute the total size of the DataPackage
                size_t packageSize = sizeof(int) + dataLen;
                char* buffer = new char[packageSize];
                // Copy the rank information and the payload into the buffer
                int* rankPtr = reinterpret_cast<int*>(buffer);
                *rankPtr = localRank;
                char* dataPtr = buffer + sizeof(int);
                memcpy(dataPtr, sendData, dataLen);
                // Send the rank information and the payload in one message
                scclIpcSocketSendData(buffer, packageSize, i);
                delete[] buffer;
            };
            futures.push_back(pthread_pool->enqueue(sendTask));
            auto recvTask = [this, recvData, dataLen, i]() {
                // Prepare the receive buffer
                size_t packageSize = sizeof(int) + dataLen;
                char* buffer = new char[packageSize];
                size_t receivedLen;
                // Receive the rank information and the payload in one message
                scclIpcSocketRecvData(buffer, packageSize, &receivedLen);
                // Extract the rank information and the payload from the buffer
                int* rankPtr = reinterpret_cast<int*>(buffer);
                int senderRank = *rankPtr;
                char* dataPtr = buffer + sizeof(int);
                memcpy(static_cast<char*>(recvData) + senderRank * dataLen, dataPtr, dataLen);
                delete[] buffer;
            };
            futures.push_back(pthread_pool->enqueue(recvTask));
        } else {
            // Copy our own data straight to its slot
            memcpy(static_cast<char*>(recvData) + localRank * dataLen, sendData, dataLen);
        }
    }
    if(wait) {
        // Wait for all tasks to complete
        for(auto& fut : futures) {
            fut.get();
        }
    }
    return scclSuccess;
}
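// Usage sketch (illustrative; localRank, localRanks and hash stand in for the values
// produced by bootstrap): every local rank contributes one 8-byte handle and gets back
// the handles of all peers, placed by rank thanks to the rank field embedded in each
// packet.
//
//   scclIpcSocket sock(localRank, localRanks, hash);
//   uint64_t myHandle = 0x1234;                  // hypothetical per-rank payload
//   std::vector<uint64_t> all(localRanks);
//   SCCLCHECK(sock.scclIpcSocketAllgather(&myHandle, all.data(), sizeof(uint64_t)));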
/**
 * @brief Allgather over the IPC sockets, in-order variant.
 *
 * Implements an allgather that collects each process's data into every process's
 * receive buffer.
 *
 * @param sendData pointer to the send buffer
 * @param recvData pointer to the receive buffer
 * @param dataLen per-process send/receive length
 * @param wait whether to wait for all transfers to complete
 * @return scclResult_t scclSuccess on success, an error code otherwise
 *
 * @note 1. The local data is first copied to its slot in the receive buffer.
 *       2. Communication with the other processes runs in parallel on the thread pool.
 *       3. With wait == true the call blocks until all transfers complete.
 */
scclResult_t scclIpcSocket::scclIpcSocketAllgatherSync(const void* sendData, void* recvData, size_t dataLen, bool wait) {
if(pthread_pool == nullptr || localRanks <= 0) {
WARN("scclIpcSocket init error!");
return scclInternalError;
}
    // Copy the current process's data to its slot in the receive buffer
memcpy(static_cast<char*>(recvData) + localRank * dataLen, sendData, dataLen);
std::vector<std::future<void>> futures;
    // Use the thread pool to send and receive in parallel
for(int i = 0; i < localRanks; ++i) {
if(i != localRank) {
auto sendTask = [this, sendData, dataLen, i]() { scclIpcSocketSendData(sendData, dataLen, i); };
futures.push_back(pthread_pool->enqueue(sendTask));
auto recvTask = [this, recvData, dataLen, i]() {
size_t receivedLen;
scclIpcSocketRecvData(reinterpret_cast<char*>(recvData) + i * dataLen, dataLen, &receivedLen);
};
futures.push_back(pthread_pool->enqueue(recvTask));
}
}
if(wait) {
        // Wait for all tasks to complete
for(auto& fut : futures) {
fut.get();
}
}
return scclSuccess;
}
/**
 * @brief Broadcast over the IPC sockets.
 *
 * The root process sends its data to every other process, and the non-root processes
 * receive it from the root. The call can optionally wait for all transfers to complete.
 *
 * @param sendData pointer to the send buffer (used by the root)
 * @param recvData pointer to the receive buffer (used by non-root processes)
 * @param dataLen data length in bytes
 * @param root rank of the root process
 * @param wait whether to wait for all transfers to complete
 *
 * @return scclResult_t operation result:
 *         - scclSuccess: operation succeeded
 *         - scclInternalError: the IPC socket is not initialized or the local rank count is invalid
 *         - scclInvalidArgument: the root rank is invalid
 */
scclResult_t scclIpcSocket::scclIpcSocketBroadcast(const void* sendData, void* recvData, size_t dataLen, int root, bool wait) {
if(pthread_pool == nullptr || localRanks <= 0) {
WARN("scclIpcSocket init error!");
return scclInternalError;
}
if(root < 0 || root >= localRanks) {
WARN("scclIpcSocketBroadcast: Invalid root rank %d", root);
return scclInvalidArgument;
}
    std::vector<std::future<scclResult_t>> futures; // collect each task's return value through a future
if(localRank == root) {
        // Root process: send the data to every other process
for(int i = 0; i < localRanks; ++i) {
if(i != root) {
auto sendTask = [this, sendData, dataLen, i]() -> scclResult_t { return scclIpcSocketSendData(sendData, dataLen, i); };
futures.push_back(pthread_pool->enqueue(sendTask));
}
}
} else {
        // Non-root process: receive the data from the root
auto recvTask = [this, recvData, dataLen, root]() -> scclResult_t {
size_t receivedLen;
return scclIpcSocketRecvData(recvData, dataLen, &receivedLen);
};
futures.push_back(pthread_pool->enqueue(recvTask));
}
if(wait) {
        // Wait for all tasks to complete and check the results
for(auto& fut : futures) {
scclResult_t result = fut.get();
if(result != scclSuccess) {
WARN("scclIpcSocketBroadcast: Task failed with error %d", result);
return scclInternalError;
}
}
}
return scclSuccess;
}
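// Usage sketch (illustrative): the root fills a small blob and every local rank calls
// the same function; non-root ranks receive into the same buffer.
//
//   char blob[128] = {0};
//   if(localRank == 0) { /* fill blob on the root */ }
//   SCCLCHECK(sock.scclIpcSocketBroadcast(blob, blob, sizeof(blob), /*root=*/0));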
} // namespace ipc_socket
} // namespace net
} // namespace hardware
} // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/un.h>
#include "base.h"
#include "net_utils.h"
#include "socket.h"
#include "thread_pool.h"
namespace sccl {
namespace hardware {
namespace net {
namespace ipc_socket {
#define SCCL_IPC_SOCKNAME_LEN 64
#define SCCL_IPC_SOCKNAME_STR "/tmp/sccl-socket-%d-%lx"
// IPC socket handle
struct scclIpcSocketHandle {
    int fd;                                 // file descriptor
    char socketName[SCCL_IPC_SOCKNAME_LEN]; // socket name
    volatile uint32_t* abortFlag;           // flag used to abort operations
};
// Wraps the data to send: the sender's rank plus the actual payload
struct DataPackage {
    int rank;
    char data[]; // flexible array member holding the actual payload
};
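// Illustrative use of the flexible array member (an allocation pattern, not code from
// this file): a DataPackage with dataLen payload bytes is allocated as one block, e.g.
//   DataPackage* pkg = (DataPackage*)malloc(sizeof(DataPackage) + dataLen);
//   pkg->rank = localRank;
//   memcpy(pkg->data, payload, dataLen);
// The Allgather implementation builds the equivalent [rank][payload] layout by hand in
// a char buffer.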
//////////////////////////////////////////////////////////////////////////////////////////////////////
class scclIpcSocket {
public:
    // Constructor and destructor
    scclIpcSocket(int localRank, int localRanks, uint64_t hash, volatile uint32_t* abortFlag = nullptr);
    virtual ~scclIpcSocket();
    // Initialize the IPC socket
    scclResult_t scclIpcSocketInit(volatile uint32_t* abortFlag);
    // Set the abort flag
    scclResult_t setAbortFlag(volatile uint32_t* flag);
    // Get the abort flag
    volatile uint32_t* getAbortFlag() const;
    // Set the IPC socket timeout
    scclResult_t setTimeout(int timeout_ms);
    // Get the thread pool pointer
    ThreadPool* getPthreadPool();
//////////////////////////////////////////////////////////////////////////////////////////////////////
    /*
      In parallel computing, different processes may need to access the same file or
      network resource. Passing file descriptors avoids having several processes
      re-open the same file or re-establish the same network connection, saving
      resources and time.
    */
    // Send a file descriptor
    scclResult_t scclIpcSocketSendFd(const int sendFd, int dst_rank);
    // Receive a file descriptor
    scclResult_t scclIpcSocketRecvFd(int* fd);
    // Send data to a given destination over the Unix domain socket, blocking
    scclResult_t scclIpcSocketSendData(const void* data, size_t dataLen, int dst_rank);
    // Receive data over the Unix domain socket, blocking
    scclResult_t scclIpcSocketRecvData(void* buffer, size_t bufferLen, size_t* receivedLen);
    // Send data to a given destination over the Unix domain socket, non-blocking
    scclResult_t scclIpcSocketSendDataNonBlocking(const void* data, size_t dataLen, int dst_rank);
    // Receive data over the Unix domain socket, non-blocking
    scclResult_t scclIpcSocketRecvDataNonBlocking(void* buffer, size_t bufferLen, size_t* receivedLen);
    // Allgather across the local ranks. Each message carries the sender's rank, so
    // correctness does not depend on the order in which messages arrive
    scclResult_t scclIpcSocketAllgather(const void* sendData, void* recvData, size_t dataLen, bool wait = true);
    // Allgather across the local ranks without per-message rank tags (less overhead,
    // but receive slots are filled in posting order)
    scclResult_t scclIpcSocketAllgatherSync(const void* sendData, void* recvData, size_t dataLen, bool wait = true);
    // Broadcast across the local ranks
    scclResult_t scclIpcSocketBroadcast(const void* sendData, void* recvData, size_t dataLen, int root, bool wait = true);
private:
    // Handle describing the IPC socket connection
    struct scclIpcSocketHandle* handle = nullptr;
    // sockaddr_un structure holding the client address information
    struct sockaddr_un my_cliaddr;
    // Hash used to generate a unique socket name
    const uint64_t ipc_hash;
    // Non-blocking socket setting
    const volatile uint32_t* my_abortFlag;
    // Process rank information
    int localRank = -1;
    int localRanks = 0;
    // Thread pool pointer
    ThreadPool* pthread_pool = nullptr;
    // Timeout, 10000 milliseconds by default
    int timeoutMs = 10000;
};
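// thread_pool.h is not part of this excerpt; a minimal sketch of the interface the
// class above relies on (an assumption, not the actual implementation):
//
//   class ThreadPool {
//   public:
//       explicit ThreadPool(size_t nThreads); // spawns nThreads worker threads
//       ~ThreadPool();                        // drains the queue and joins the workers
//       // Schedules f on a worker; the returned future yields f's result.
//       template <class F>
//       auto enqueue(F&& f) -> std::future<typename std::result_of<F()>::type>;
//   };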
} // namespace ipc_socket
} // namespace net
} // namespace hardware
} // namespace sccl
#include <stdint.h>
#include "net.h"
namespace sccl {
namespace hardware {
namespace net {
/**
 * Print socket address information.
 *
 * @param sock_addr pointer to the socket address union
 * @param prefix prefix string for the output
 * @return scclResult_t, scclSuccess on success
 *
 * @note Formats the socket address and prints it between separator lines for readability.
 */
scclResult_t printSocketAddr(union net_socket::scclSocketAddress* sock_addr, const char* prefix) {
char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
net::net_socket::scclSocketToString(sock_addr, line);
printf("\n==========================================\n%s addr: %s"
"\n==========================================\n",
prefix,
line);
return scclSuccess;
}
/**
 * Print socket information.
 *
 * @param sock pointer to the scclSocket structure describing the socket
 * @param prefix prefix string for the output
 * @return scclResult_t, scclSuccess on success
 *
 * Formats and prints the socket's debug details: file descriptor, retry counters,
 * address, state flags, and so on, between separator lines for readability.
 */
scclResult_t printSocketInfo(struct net_socket::scclSocket* sock, const char* prefix) {
char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
net::net_socket::scclSocketToString(&sock->addr, line);
printf("\n==========================================\n%s: fd: %d, acceptFd: %d, timedOutRetries: %d, refusedRetries: %d, \naddr: %s, abortFlag=%u, "
"asyncFlag=%d, state=%d, salen=%d, magic=%lu, type=%d"
"\n==========================================\n",
prefix,
sock->fd,
sock->acceptFd,
sock->timedOutRetries,
sock->refusedRetries,
line,
sock->abortFlag != NULL ? *sock->abortFlag : 0,
sock->asyncFlag,
int(sock->state),
sock->salen,
sock->magic,
int(sock->type));
return scclSuccess;
}
////////////////////////////////////////////////////////////////////////////////////////
// Network state enumeration
typedef enum scclNetState {
    scclNetStateInit = 0,    // initial state
    scclNetStateEnabled = 1, // enabled
    scclNetStateDisabled = 2 // disabled
} scclNetState_t;
// One state per network type, all starting out uninitialized
scclNetState_t scclNetStates[scclNetTypeNum] = {scclNetStateInit, scclNetStateInit, scclNetStateInit};
/**
 * Get the state of the given network backend.
 *
 * @param i network backend index
 * @param state output parameter receiving the network state
 * @return scclResult_t, scclSuccess on success
 *
 * @note Thread-safe; the shared state is protected by a mutex.
 * @note If the network has not been initialized yet, it is initialized here and the
 *       state updated accordingly.
 */
scclResult_t netGetState(int i, scclNetState_t* state) {
pthread_mutex_lock(&netLock);
if(scclNetStates[i] == scclNetStateInit) {
int ndev;
if(scclNets[i]->init() != scclSuccess)
scclNetStates[i] = scclNetStateDisabled;
else if(scclNets[i]->devices(&ndev) != scclSuccess || ndev <= 0)
scclNetStates[i] = scclNetStateDisabled;
else
scclNetStates[i] = scclNetStateEnabled;
}
*state = scclNetStates[i];
pthread_mutex_unlock(&netLock);
return scclSuccess;
}
/**
 * @brief Initialize the network with the given name.
 *
 * Iterates over all available network types looking for an enabled network whose name
 * matches. If one is found it is assigned to the scclNet parameter.
 *
 * @param netName network name to look for; NULL matches any name
 * @param scclNet output parameter receiving the selected network instance
 *
 * @return scclResult_t operation result:
 *         - scclSuccess: a matching network was found
 *         - scclInvalidUsage: no matching network was found
 */
scclResult_t scclNetInit(const char* netName, scclNet_t*& scclNet) {
// Initialize main communication network
bool ok = false;
for(int i = 0; i < scclNetTypeNum; i++) {
if(scclNets[i] == nullptr)
continue;
enum scclNetState state;
SCCLCHECK(netGetState(i, &state));
if(state != scclNetStateEnabled)
continue;
if(netName && strcasecmp(netName, scclNets[i]->name) != 0)
continue;
scclNet = scclNets[i];
ok = true;
// if(scclCollNets[i]) {
// SCCLCHECK(collNetGetState(i, &state));
// if(state == scclNetStateEnabled) {
// comm->scclCollNet = scclCollNets[i];
// }
// }
break;
}
if(!ok) {
WARN("Error: network %s not found.", netName ? netName : "");
return scclInvalidUsage;
}
return scclSuccess;
}
} // namespace net
} // namespace hardware
} // namespace sccl
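// Usage sketch (illustrative; reading the name from an SCCL_NET environment variable is
// an assumption, not something this file does): pick the backend matching the given
// name, or the first enabled one when the name is NULL.
//
//   scclNet_t* net = nullptr;
//   SCCLCHECK(scclNetInit(getenv("SCCL_NET"), net));
//   INFO(SCCL_INIT, "Using network %s", net->name);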
@@ ... @@
 #pragma once
 #include <stdint.h>
+#include <memory>
 #include "base.h"
 #include "net_utils.h"
-#include "device/net_ib.h"
-#include "host/net_socket.h"
+#include "net_socket/socket.h"
+#include "net_ib/net_ib.h"
+#include "net_socket/net_socket.h"
 namespace sccl {
 namespace hardware {
 namespace net {
-//////////////////////////////////
-typedef enum net_type : uint8_t {
-    NET_IB = 0,
-    NET_SOCKET = 1
-} net_type_t;
-//////////////////////////////////
-inline scclResult_t initNetSpecial(scclNet_t* net) {
-    int ndev;
-    // Initialize the network; return an internal error on failure
-    if(net->init() != scclSuccess)
-        return scclInternalError;
-    // Query the device count; return an internal error on failure
-    if(net->devices(&ndev) != scclSuccess)
-        return scclInternalError;
-    // No devices available: system error
-    if(ndev <= 0)
-        return scclSystemError;
-    return scclSuccess;
-}
-/**
- * Initialize a network device
- *
- * @param net pointer to the scclNet_t structure to initialize
- * @return scclResult_t:
- *         - scclSuccess: initialization succeeded
- *         - scclInternalError: network init or device query failed
- *         - scclSystemError: no usable device in the system
- */
-inline scclNet_t* initNet(net_type_t t) {
-    scclNet_t* scclNet = NULL;
-    if(t == NET_IB) {
-        if(initNetSpecial(&(device::scclNetIb)) == scclSuccess) {
-            scclNet = &(device::scclNetIb);
-        }
-    } else if(t == NET_SOCKET) {
-        if(initNetSpecial(&(host::scclNetSocket)) == scclSuccess) {
-            scclNet = &(host::scclNetSocket);
-        }
-    } else {
-        WARN("Unsupported network type.");
-    }
-    return scclNet;
-}
-////////////////////////////////////
-inline scclNet_t* scclNets[3] = {nullptr, &device::scclNetIb, &host::scclNetSocket};
+// Static pthread mutex used for thread synchronization
+static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
+//////////////////////////////////// Utility functions ////////////////////////////////////
+// Print socket information
+scclResult_t printSocketAddr(union net_socket::scclSocketAddress* sock_addr, const char* prefix);
+scclResult_t printSocketInfo(struct net_socket::scclSocket* sock, const char* prefix);
+//////////////////////////////////// Network interface ////////////////////////////////////
+// Number of network types
+constexpr int scclNetTypeNum = 3;
+// Inline array holding the available sccl network implementations
+inline scclNetBase* scclNets[] = {nullptr, new net_ib::scclNetIb(), new net_socket::scclNetSocket()};
+// Entry point that initializes the selected sccl network
+scclResult_t scclNetInit(const char* netName, scclNet_t*& scclNet);
 } // namespace net
 } // namespace hardware