paths.cc

#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
#include "channel.h"
#include "xml.h"

namespace sccl {
namespace hardware {
namespace topology {
namespace graph {

// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths

// Fixed-capacity list of node pointers. scclTopoSetPaths uses two of these as the
// current and next frontier of its breadth-first search.
struct scclTopoNodeList {
    // Node pointers; only the first `count` entries are meaningful.
    struct scclTopoNode* list[SCCL_TOPO_MAX_NODES];
    // Number of valid entries in `list`.
    int count;
};

/**
 * Look up the path entry that `node` holds towards the node of type `t` with id `id`.
 *
 * The target's index inside system->nodes[t] is found by a linear scan on the id; that
 * same index is then applied to node->paths[t].
 *
 * @param system  topology holding the per-type node arrays
 * @param node    node whose path table is indexed
 * @param t       node type of the target
 * @param id      id of the target node
 * @param path    out: set to node->paths[t] + index when the id is found
 * @return scclSuccess when found; scclInternalError (after a WARN) otherwise
 */
static scclResult_t getPath(struct scclTopoSystem* system, struct scclTopoNode* node, int t, int64_t id, struct scclTopoLinkList** path) {
    int idx = 0;
    while(idx < system->nodes[t].count && system->nodes[t].nodes[idx].id != id)
        idx++;

    if(idx >= system->nodes[t].count) {
        WARN("Could not find node of type %d id %lx", t, id);
        return scclInternalError;
    }

    *path = node->paths[t] + idx;
    return scclSuccess;
}

/**
 * Compute, for every node reachable from `baseNode`, the path leading back to `baseNode`.
 *
 * A breadth-first search starts at `baseNode`. For each link node -> remNode that is
 * explored, the entry remNode->paths[baseNode->type][index of baseNode] is (re)written
 * when it is still empty (bw == 0) or longer than the candidate, and the candidate
 * offers strictly more bandwidth. The stored path starts with the reverse link
 * (remNode -> node) followed by the links of node's own path to `baseNode`.
 *
 * Per-node path arrays for baseNode->type are allocated on demand with scclCalloc.
 *
 * @param baseNode  node every computed path leads to
 * @param system    topology the node belongs to
 * @return scclSuccess, or scclInternalError when a node or a reverse link cannot be
 *         found; other errors are whatever SCCLCHECK propagates.
 */
static scclResult_t scclTopoSetPaths(struct scclTopoNode* baseNode, struct scclTopoSystem* system) {
    if(baseNode->paths[baseNode->type] == NULL) {
        SCCLCHECK(scclCalloc(baseNode->paths + baseNode->type, system->nodes[baseNode->type].count));
    }

    // breadth-first search to set all paths to that node in the system
    struct scclTopoNodeList nodeList;
    struct scclTopoNodeList nextNodeList;
    nodeList.count     = 1;
    nodeList.list[0]   = baseNode;
    nextNodeList.count = 0;

    // The path from the base node to itself: zero hops, local bandwidth.
    struct scclTopoLinkList* basePath;
    SCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
    basePath->count = 0;
    basePath->bw    = LOC_BW;
    basePath->type  = PATH_LOC;

    while(nodeList.count) {
        nextNodeList.count = 0;
        for(int n = 0; n < nodeList.count; n++) {
            struct scclTopoNode* node = nodeList.list[n];
            struct scclTopoLinkList* path;
            SCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
            for(int l = 0; l < node->nlinks; l++) {
                struct scclTopoLink* link    = node->links + l;
                struct scclTopoNode* remNode = link->remNode;
                if(remNode->paths[baseNode->type] == NULL) {
                    SCCLCHECK(scclCalloc(remNode->paths + baseNode->type, system->nodes[baseNode->type].count));
                }
                struct scclTopoLinkList* remPath;
                SCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
                // Bandwidth of the candidate path is limited by its narrowest link.
                float bw = std::min(path->bw, link->bw);

                // allow routing through a GPU only as 1 hop
                if(node != baseNode && node->type == GPU && (link->type != LINK_NVL || remNode->type != GPU || path->count > 1))
                    continue;

                if((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
                    // Find reverse link. The result is tracked in a local pointer rather than
                    // by testing remPath->list[0]: when an existing path is being replaced,
                    // list[0] still holds the old first hop and would hide a failed lookup.
                    struct scclTopoLink* reverseLink = NULL;
                    for(int r = 0; r < remNode->nlinks; r++) {
                        if(remNode->links[r].remNode == node) {
                            reverseLink = remNode->links + r;
                            break;
                        }
                    }
                    if(reverseLink == NULL) {
                        WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
                             remNode->type,
                             remNode->id,
                             remNode->nlinks,
                             node->type,
                             node->id);
                        return scclInternalError;
                    }
                    remPath->list[0] = reverseLink;
                    // Copy the rest of the path
                    for(int i = 0; i < path->count; i++)
                        remPath->list[i + 1] = path->list[i];
                    remPath->count = path->count + 1;
                    remPath->bw    = bw;

                    // Start with path type = link type. PATH and LINK types are supposed to match.
                    // Don't consider LINK_NET as we only care about the NIC->GPU path.
                    int type = link->type == LINK_NET ? LINK_LOC : link->type;
                    // Differentiate between one and multiple PCI switches
                    if(node->type == PCI && remNode->type == PCI)
                        type = PATH_PXB;
                    // Consider a path going through the CPU as PATH_PHB
                    if(link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU))
                        type = PATH_PHB;

                    // The overall path type is the worst type seen along the way.
                    remPath->type = std::max(path->type, type);

                    // Add to the list for the next iteration if not already in the list
                    // Disallow GPUs as intermediate steps for now
                    if(remNode->type != GPU) {
                        int i;
                        for(i = 0; i < nextNodeList.count; i++)
                            if(nextNodeList.list[i] == remNode)
                                break;
                        if(i == nextNodeList.count)
                            nextNodeList.list[nextNodeList.count++] = remNode;
                    }
                }
            }
        }
        // The next frontier becomes the current one.
        memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
    }
    return scclSuccess;
}

/**
 * Format the path information of one node into a single text line.
 *
 * The line starts with "<node type>/<node id> :" and is followed, for every target node
 * the given node has a path table for, by "<type>/<id> (<hops>/<bandwidth>/<path type>) ".
 *
 * All writes are bounded by the buffer size, so a large topology truncates the line
 * instead of overrunning the stack buffer.
 *
 * NOTE(review): the assembled line is not passed to any logging call in this function,
 * so nothing is actually emitted — confirm whether an INFO(...) call was dropped.
 *
 * @param system  topology system
 * @param node    node whose paths are formatted
 */
static void printNodePaths(struct scclTopoSystem* system, struct scclTopoNode* node) {
    char line[1024];
    const size_t capacity = sizeof(line);

    int written = snprintf(line, capacity, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
    if(written < 0)
        return;
    // snprintf reports the length it wanted to write; clamp to what actually fits.
    size_t offset = (size_t)written < capacity ? (size_t)written : capacity - 1;

    for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
        if(node->paths[t] == NULL)
            continue;
        for(int n = 0; n < system->nodes[t].count; n++) {
            // Buffer is full: nothing more can be appended.
            if(offset + 1 >= capacity)
                return;
            const size_t remaining = capacity - offset;
            written                = snprintf(line + offset,
                               remaining,
                               "%s/%lX (%d/%f/%s) ",
                               topoNodeTypeStr[t],
                               system->nodes[t].nodes[n].id,
                               node->paths[t][n].count,
                               node->paths[t][n].bw,
                               topoPathTypeStr[node->paths[t][n].type]);
            if(written < 0)
                return;
            offset = (size_t)written < remaining ? offset + (size_t)written : capacity - 1;
        }
    }
}

/**
 * Pick the CPU node with the fewest hops from the GPU at index `gpu`.
 *
 * @param system  topology system
 * @param gpu     index of the GPU inside system->nodes[GPU]
 * @param retCpu  out: index of the selected CPU inside system->nodes[CPU]
 * @return scclSuccess, or scclInternalError (after a WARN) when no CPU was selected
 */
static scclResult_t getLocalCpu(struct scclTopoSystem* system, int gpu, int* retCpu) {
    struct scclTopoLinkList* cpuPaths = system->nodes[GPU].nodes[gpu].paths[CPU];
    int bestCpu                       = -1;
    int bestHops                      = 0; // 0 doubles as "nothing selected yet"

    for(int cpu = 0; cpu < system->nodes[CPU].count; cpu++) {
        const int hops = cpuPaths[cpu].count;
        if(bestHops != 0 && hops >= bestHops)
            continue;
        bestCpu  = cpu;
        bestHops = hops;
    }

    if(bestCpu != -1) {
        *retCpu = bestCpu;
        return scclSuccess;
    }
    WARN("Error : could not find CPU close to GPU %d", gpu);
    return scclInternalError;
}

/**
 * Rewrite the path (t1,i1) -> (t2,i2) so that it goes through the node (tx,ix).
 *
 * The new link list is the source node's path to the intermediate node followed by the
 * intermediate node's path to the destination. The path type becomes the worse of the
 * two legs (or PATH_PXN when the intermediate node is a GPU) and the bandwidth the
 * smaller of the two legs.
 *
 * @param system  topology system
 * @param tx, ix  type and index of the intermediate node
 * @param t1, i1  type and index of the source node
 * @param t2, i2  type and index of the destination node
 * @return always scclSuccess
 */
static scclResult_t addInterStep(struct scclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
    struct scclTopoNode* viaNode = system->nodes[tx].nodes + ix;
    struct scclTopoNode* srcNode = system->nodes[t1].nodes + i1;

    struct scclTopoLinkList* firstLeg  = &srcNode->paths[tx][ix];
    struct scclTopoLinkList* secondLeg = &viaNode->paths[t2][i2];
    struct scclTopoLinkList* merged    = &srcNode->paths[t2][i2];

    int hops = 0;
    // Source -> intermediate node
    for(int k = 0; k < firstLeg->count; k++) {
        merged->list[hops] = firstLeg->list[k];
        hops++;
    }
    // Intermediate node -> destination
    for(int k = 0; k < secondLeg->count; k++) {
        merged->list[hops] = secondLeg->list[k];
        hops++;
    }

    // Refresh the summary fields of the merged path
    merged->count = hops;
    merged->type  = std::max(firstLeg->type, secondLeg->type);
    if(tx == GPU)
        merged->type = PATH_PXN;
    merged->bw = std::min(firstLeg->bw, secondLeg->bw);
    return scclSuccess;
}

// Remove/free paths for a given type
static void scclTopoRemovePathType(struct scclTopoSystem* system, int nodeType) {
    for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
        // Remove links _to_ the given type
        for(int n = 0; n < system->nodes[t].count; n++) {
            struct scclTopoNode* node = system->nodes[t].nodes + n;
            free(node->paths[nodeType]);
            node->paths[nodeType] = NULL;
        }
        // Remove links _from_ the given type
        for(int n = 0; n < system->nodes[nodeType].count; n++) {
            struct scclTopoNode* node = system->nodes[nodeType].nodes + n;
            free(node->paths[t]);
            node->paths[t] = NULL;
        }
    }
}

// Maps the legacy numeric level (array index) to the corresponding PATH_* value.
static const int levelsOldToNew[] = {PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS};

/**
 * Resolve a path level from the environment, once.
 *
 * Nothing happens unless *level is -1. Otherwise:
 *  - if `disableEnv` is non-NULL and that variable parses to 1, the level is 0;
 *  - else `levelEnv` is read: it is first compared against topoPathTypeStr[0..PATH_SYS];
 *    when no name matches and the value starts with a digit, it is treated as a legacy
 *    number and mapped through levelsOldToNew (values above the table are clamped to
 *    its last entry).
 * *level receives the resolved value, or -2 when nothing was set.
 *
 * @param level       in/out: -1 on entry means "not resolved yet"
 * @param disableEnv  name of the disable variable, may be NULL
 * @param levelEnv    name of the level variable
 * @return always scclSuccess
 */
scclResult_t scclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
    if(*level != -1)
        return scclSuccess;

    int l = -1;
    if(disableEnv) {
        const char* str = getenv(disableEnv);
        // Compare as long so that an oversized value cannot wrap around to 1.
        if(str && strtol(str, NULL, 0) == 1)
            l = 0;
    }
    if(l == -1) {
        const char* str = getenv(levelEnv);
        if(str) {
            for(int i = 0; i <= PATH_SYS; i++) {
                if(strcmp(str, topoPathTypeStr[i]) == 0) {
                    l = i;
                    break;
                }
            }
            // Old style numbering
            // levelsOldToNew is an array with each index corresponding to the
            // "old level" int, and each value mapping to the correct value defined in topo.h
            if(l == -1 && str[0] >= '0' && str[0] <= '9') {
                // Clamp while still a long: narrowing to int first could turn a huge
                // value negative and index the table out of bounds.
                long oldLevel          = strtol(str, NULL, 0);
                const long maxOldLevel = (long)(sizeof(levelsOldToNew) / sizeof(levelsOldToNew[0])) - 1;
                if(oldLevel < 0)
                    oldLevel = 0;
                if(oldLevel > maxOldLevel)
                    oldLevel = maxOldLevel;
                l = levelsOldToNew[oldLevel];
            }
        }
    }
    if(l >= 0)
        INFO(SCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
    *level = l >= 0 ? l : -2;
    return scclSuccess;
}

SCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
// Cached result of scclGetLevel for "SCCL_NET_GDR_LEVEL"; -1 means not resolved yet.
int scclTopoUserGdrLevel = -1;

/**
 * Decide whether GPU Direct RDMA should be used between a GPU and a NIC.
 *
 * *useGdr is set to 0 up front and only switched to 1 when all checks pass:
 *  1. both the NIC and the GPU report gdrSupport;
 *  2. for reads, the NetGdrRead parameter is not 0, and when it is negative the GPU is
 *     either alone in the system or has a PATH_NVL path to another GPU;
 *  3. the GPU<->NIC path type (taken from the proxy GPU when the path is PATH_PXN) does
 *     not exceed the allowed level.
 *
 * The allowed level is system->netGdrLevel (PATH_PXB when that is -2), overridden by the
 * user level when one is set. Without a user level, an AMD Rome x86 CPU raises it to
 * PATH_PHB when the GPU and the NIC both have a 2-hop path to the same CPU and agree on
 * the bits selected by mask 0xf0000 of the GPU id / NIC bus id.
 *
 * @param system  topology system
 * @param busId   id of the GPU
 * @param netDev  id of the NIC
 * @param read    non-zero for the read (send) direction
 * @param useGdr  out: 1 when GDR should be used, 0 otherwise
 * @return scclSuccess, or the error propagated by SCCLCHECK
 */
scclResult_t scclTopoCheckGdr(struct scclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
    *useGdr = 0;

    // Resolve the NIC first, then the GPU
    int netIdx, gpuIdx;
    SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &netIdx));
    struct scclTopoNode* nic = system->nodes[NET].nodes + netIdx;
    SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &gpuIdx));
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + gpuIdx;

    // Both ends have to support GDR
    if(nic->net.gdrSupport == 0)
        return scclSuccess;
    if(gpu->gpu.gdrSupport == 0)
        return scclSuccess;

    if(read) {
        // Reads (sends) are only enabled under certain conditions
        int gdrReadParam = scclParamNetGdrRead();
        if(gdrReadParam == 0)
            return scclSuccess;
        if(gdrReadParam < 0) {
            const int gpuCount = system->nodes[GPU].count;
            // Since we don't know whether there are other communicators,
            // it's better to keep things local if we have a single GPU.
            bool hasNvlinkPeer = (gpuCount == 1);
            for(int peer = 0; peer < gpuCount && !hasNvlinkPeer; peer++) {
                if(peer != gpuIdx && gpu->paths[GPU][peer].type == PATH_NVL)
                    hasNvlinkPeer = true;
            }
            if(!hasNvlinkPeer)
                return scclSuccess;
        }
    }

    // Work out how far apart GPU and NIC may be for GDR to be worthwhile
    int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
    SCCLCHECK(scclGetLevel(&scclTopoUserGdrLevel, NULL, "SCCL_NET_GDR_LEVEL"));
    if(scclTopoUserGdrLevel != -2) {
        netGdrLevel = scclTopoUserGdrLevel;
    } else {
        int arch, vendor, model;
        SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
        if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
            const int cpuCount = system->nodes[CPU].count;
            int gpuCpuId       = -1;
            int nicCpuId       = -1;
            // First CPU that is exactly two hops away from the GPU
            for(int c = 0; c < cpuCount; c++) {
                if(gpu->paths[CPU][c].count == 2) {
                    gpuCpuId = system->nodes[CPU].nodes[c].id;
                    break;
                }
            }
            // First CPU that is exactly two hops away from the NIC
            for(int c = 0; c < cpuCount; c++) {
                if(nic->paths[CPU][c].count == 2) {
                    nicCpuId = system->nodes[CPU].nodes[c].id;
                    break;
                }
            }
            const bool sameCpu = gpuCpuId != -1 && nicCpuId != -1 && gpuCpuId == nicCpuId;
            if(sameCpu && (gpu->id & 0xf0000) == (nic->net.busId & 0xf0000)) {
                netGdrLevel = PATH_PHB;
            }
        }
    }

    int distance = gpu->paths[NET][netIdx].type;
    if(distance == PATH_PXN) {
        // In case of PXN, use the intermediate GPU distance instead
        int proxyRank, proxyIdx;
        SCCLCHECK(scclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
        SCCLCHECK(scclTopoRankToIndex(system, proxyRank, &proxyIdx));
        distance = system->nodes[GPU].nodes[proxyIdx].paths[NET][netIdx].type;
    }

    if(distance <= netGdrLevel) {
        *useGdr = 1;
        INFO(SCCL_NET, "GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
        return scclSuccess;
    }

    INFO(SCCL_NET, "GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
    return scclSuccess;
}

// Set to 0 to disable the flush on Hopper when using GDR
SCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);

/**
 * Determine whether the GDR receive buffers need to be flushed for a GPU.
 *
 * GPUs whose cudaCompCap is below 90 always flush; for the others the NetForceFlush
 * parameter decides.
 *
 * @param system  topology system
 * @param busId   id of the GPU
 * @param flush   out: flush setting for that GPU
 * @return scclSuccess, or the error propagated by SCCLCHECK when the GPU is not found
 */
scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush) {
    int gpuIdx;
    SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &gpuIdx));

    if(system->nodes[GPU].nodes[gpuIdx].gpu.cudaCompCap < 90) {
        // Flush is required on Ampere and earlier
        *flush = 1;
    } else {
        *flush = scclParamNetForceFlush();
    }
    return scclSuccess;
}

SCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);

/**
 * Check whether going through the network would be faster than going through P2P/SHM.
 *
 * *net is 0 when the NetDisableIntra parameter equals 1. Otherwise it is 1 when either
 * GPU id cannot be resolved, or when both GPUs reach some NIC over a path of type
 * PATH_PXB or better with more bandwidth than the direct GPU-to-GPU path; else 0.
 *
 * @param system  topology system
 * @param id1     id of the first GPU
 * @param id2     id of the second GPU
 * @param net     out: 1 to prefer the network, 0 otherwise
 * @return always scclSuccess
 */
scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
    if(scclParamNetDisableIntra() == 1) {
        *net = 0;
        return scclSuccess;
    }

    *net = 1;
    int idxA, idxB;
    if(scclTopoIdToIndex(system, GPU, id1, &idxA) != scclSuccess || scclTopoIdToIndex(system, GPU, id2, &idxB) != scclSuccess) {
        return scclSuccess;
    }

    struct scclTopoNode* gpuA = system->nodes[GPU].nodes + idxA;
    struct scclTopoNode* gpuB = system->nodes[GPU].nodes + idxB;
    // Current GPU-to-GPU bandwidth
    const float directBw = gpuA->paths[GPU][idxB].bw;

    // Best bandwidth each GPU gets to any NIC through PXB or better
    float bestNetA = 0;
    float bestNetB = 0;
    for(int nic = 0; nic < system->nodes[NET].count; nic++) {
        struct scclTopoLinkList* toNicA = gpuA->paths[NET] + nic;
        if(toNicA->type <= PATH_PXB && toNicA->bw > bestNetA)
            bestNetA = toNicA->bw;

        struct scclTopoLinkList* toNicB = gpuB->paths[NET] + nic;
        if(toNicB->type <= PATH_PXB && toNicB->bw > bestNetB)
            bestNetB = toNicB->bw;
    }

    const bool networkWins = bestNetA > directBw && bestNetB > directBw;
    if(!networkWins)
        *net = 0;
    return scclSuccess;
}

/**
 * Find the rank of the GPU used as relay between a GPU rank and a NIC.
 *
 * When the GPU->NIC path is not PATH_PXN the rank itself is returned. Otherwise the
 * path is walked, skipping NVS nodes, and the first non-NVS node must be a GPU; its
 * rank is returned.
 *
 * @param system            topology system
 * @param rank              rank of the source GPU
 * @param netDev            id of the NIC
 * @param intermediateRank  out: rank of the relay GPU, or `rank` when there is none
 * @return scclSuccess; scclInternalError (after a WARN) when no relay GPU is found on a
 *         PXN path; or the error propagated by SCCLCHECK
 */
scclResult_t scclTopoGetIntermediateRank(struct scclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
    int netIdx, gpuIdx;
    SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &netIdx));
    SCCLCHECK(scclTopoRankToIndex(system, rank, &gpuIdx));

    struct scclTopoLinkList* path = system->nodes[GPU].nodes[gpuIdx].paths[NET] + netIdx;
    if(path->type != PATH_PXN) {
        *intermediateRank = rank;
        return scclSuccess;
    }

    // Walk the path until something other than an NVS node shows up
    struct scclTopoNode* hop = NULL;
    int hopType              = NVS;
    int step                 = 0;
    while(step < path->count && hopType == NVS) {
        hop     = path->list[step]->remNode;
        hopType = hop->type;
        step++;
    }

    if(hopType != GPU) {
        WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
        return scclInternalError;
    }
    *intermediateRank = hop->gpu.rank;
    return scclSuccess;
}

SCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);

/**
 * Tell whether PXN is disabled. The answer is computed on the first call and then
 * returned from a function-local static on every later call.
 *
 * Net v4 plugins don't have non-blocking connect/accept, so remote proxies can't be
 * used without risking deadlocks: PXN is forced off when the plugin version is 4.
 * Otherwise the PxnDisable parameter decides.
 *
 * @param comm  communicator, may be NULL (the plugin version check is then skipped)
 * @return the cached disable value
 */
int scclPxnDisable(struct scclComm* comm) {
    static int cached = -1;
    if(cached != -1)
        return cached;

    const bool pluginIsV4 = comm && scclNetVersion(comm) == 4;
    if(pluginIsV4) {
        INFO(SCCL_INIT, "PXN Disabled as plugin is v4");
        cached = 1;
    } else {
        cached = scclParamPxnDisable();
    }
    return cached;
}

/**
 * Collect the distinct proxy ranks this communicator's rank goes through to reach peers.
 *
 * For every rank of the communicator, scclTopoGetNetDev provides a NIC and a proxy rank.
 * Proxy ranks equal to comm->rank, and NICs for which scclTopoCheckGdr (read direction)
 * reports no GDR, are skipped. The remaining proxy ranks are appended once each to an
 * array grown with scclRealloc.
 *
 * @param comm               communicator
 * @param intermediateRanks  out: array of proxy ranks (NULL when empty)
 * @param nranks             out: number of entries in *intermediateRanks
 * @return scclSuccess, or the error propagated by SCCLCHECK
 */
scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks) {
    struct scclTopoSystem* system = comm->topo;
    *nranks                       = 0;
    *intermediateRanks            = NULL;
    if(system->nodes[NET].count == 0)
        return scclSuccess;

    int collected = 0;
    int* proxies  = NULL;
    for(int peer = 0; peer < comm->nRanks; peer++) {
        int netDev, proxyRank;
        SCCLCHECK(scclTopoGetNetDev(comm, comm->rank, NULL, 0, peer, &netDev, &proxyRank));
        if(proxyRank == comm->rank)
            continue;

        int useGdr;
        SCCLCHECK(scclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
        if(useGdr == 0)
            continue;

        // Skip proxy ranks that were already recorded
        bool known = false;
        for(int k = 0; k < collected && !known; k++)
            known = (proxies[k] == proxyRank);
        if(known)
            continue;

        SCCLCHECK(scclRealloc(&proxies, collected, collected + 1));
        proxies[collected] = proxyRank;
        collected++;
    }

    *nranks            = collected;
    *intermediateRanks = proxies;
    return scclSuccess;
}

static bool rcclPathOverride(struct scclTopoSystem* system, uint64_t distance) {
    int i, j;

    for(i = 0; i < system->nodes[GPU].count; i++) {
        for(j = 0; j < system->nodes[NET].count; j++) {
            if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
               (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
                break;
        }
        if(j >= system->nodes[NET].count)
            break;
    }
    if(i >= system->nodes[GPU].count) {
        for(i = 0; i < system->nodes[GPU].count; i++) {
            for(j = 0; j < system->nodes[NET].count; j++) {
                if((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
                   (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
                    system->nodes[GPU].nodes[i].paths[NET][j].type = PATH_PXB;
            }
        }
        return true;
    } else {
        return false;
    }
}

// Parameter read below through rcclParamEnableIntranet(): 1 forces the "intranet" mode,
// -2 lets the gfx94 8-GPU/8-NIC detection decide.
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);

/**
 * Trim the topology down to what this communicator's rank can actually use.
 *
 * Steps, in order:
 *  1. GPUs are grouped into domains: a GPU takes the smallest domain among the
 *     lower-indexed GPUs it reaches with a path type below PATH_NET. GPUs outside the
 *     domain of comm->rank are removed.
 *  2. Among NICs sharing the same `asic`, the one with lower bandwidth gets bw = 0; all
 *     NICs with bw == 0 are then removed.
 *  3. system->type gets RCCL_TOPO_XGMI_ALL when every GPU pair is reported as XGMI,
 *     RCCL_TOPO_GDR_ALL when GDR is available for every GPU and not all links are XGMI,
 *     and RCCL_TOPO_FORCE_INTRA for the gfx94 / EnableIntranet case.
 *  4. comm->localRanks is set to the remaining GPU count; when all ranks are local and
 *     none of the cases above asked to keep them, every NIC is removed.
 *
 * NOTE(review): if SCCLCHECK returns from the function on error (as its use here
 * suggests), `domains` and `ids` are not freed on those paths — confirm.
 *
 * @param system  topology system to trim (modified in place)
 * @param comm    communicator; rank and nRanks are read, localRanks is written
 * @return scclSuccess; scclInternalError when a GPU id recorded earlier can no longer be
 *         found; or the error propagated by SCCLCHECK
 */
scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm) {
    int* domains;
    int64_t* ids;
    SCCLCHECK(scclCalloc(&domains, system->nodes[GPU].count));
    SCCLCHECK(scclCalloc(&ids, system->nodes[GPU].count));
    int myDomain = 0;
    // Assign each GPU a domain and remember its id (indices shift once nodes are removed).
    for(int g = 0; g < system->nodes[GPU].count; g++) {
        struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
        domains[g]               = g;
        ids[g]                   = gpu->id;
        for(int p = 0; p < g; p++) {
            if(gpu->paths[GPU][p].type < PATH_NET) {
                domains[g] = std::min(domains[g], domains[p]);
            }
        }
        if(gpu->gpu.rank == comm->rank)
            myDomain = domains[g];
    }

    // Remove every GPU that is not in our domain; GPUs are looked up again by id each time.
    int ngpus = system->nodes[GPU].count;
    for(int i = 0; i < ngpus; i++) {
        if(domains[i] == myDomain)
            continue;
        struct scclTopoNode* gpu = NULL;
        int g;
        for(g = 0; g < system->nodes[GPU].count /* This one varies over the loops */; g++) {
            gpu = system->nodes[GPU].nodes + g;
            if(gpu->id == ids[i])
                break;
            else
                gpu = NULL;
        }
        if(gpu == NULL) {
            WARN("Could not find id %lx", ids[i]);
            free(domains);
            free(ids);
            return scclInternalError;
        }
        SCCLCHECK(scclTopoRemoveNode(system, GPU, g));
    }

    // trim low speed port on same NIC
    for(int i = 0; i < system->nodes[NET].count; i++) {
        for(int j = 0; j < system->nodes[NET].count; j++) {
            if(i == j)
                continue;
            if(system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
                if(system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw)
                    system->nodes[NET].nodes[j].net.bw = 0;
            }
        }
    }
    // Remove NICs marked with bw == 0, one per pass, restarting the scan after each removal.
    do {
        int n;
        for(n = 0; n < system->nodes[NET].count; n++) {
            if(system->nodes[NET].nodes[n].net.bw == 0)
                break;
        }
        if(n < system->nodes[NET].count) {
            SCCLCHECK(scclTopoRemoveNode(system, NET, n));
        } else
            break;
    } while(system->nodes[NET].count);

    // `remove` stays 1 unless one of the checks below decides the NICs must be kept.
    int remove   = 1;
    int gdr      = 1;
    bool allXgmi = true;
    // detect if all GPUs are connected by XGMI
    for(int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
        int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
        for(int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
            if(i == j)
                continue;
            int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
            bool isXGMI;
            SCCLCHECK(scclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
            allXgmi &= isXGMI;
        }
    }
    if(allXgmi)
        system->type |= RCCL_TOPO_XGMI_ALL;
    // Check GDR (read direction) between each GPU and its local NIC; stop at the first failure.
    for(int g = 0; g < system->nodes[GPU].count; g++) {
        int net;
        SCCLCHECK(scclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
        SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
        if(!gdr)
            break;
    }
    if(gdr && !allXgmi) {
        remove = 0;
        system->type |= RCCL_TOPO_GDR_ALL;
        INFO(SCCL_LOG_TOPO, "GDR is available on all GPUs");
    }

    // Special handling of gfx94x
    if(rcclParamEnableIntranet() == 1 || (rcclParamEnableIntranet() == -2 && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
                                          system->nodes[GPU].count == 8 && system->nodes[NET].count == 8)) {
        remove = 0;
        system->type |= RCCL_TOPO_FORCE_INTRA;
    }
    comm->localRanks = system->nodes[GPU].count;
    // All ranks are local and nothing asked to keep the NICs: drop them, last index first.
    if(system->nodes[GPU].count == comm->nRanks && remove) {
        for(int n = system->nodes[NET].count - 1; n >= 0; n--)
            SCCLCHECK(scclTopoRemoveNode(system, NET, n));
    }

    free(domains);
    free(ids);
    return scclSuccess;
}

/**
 * Release the path tables of every node type, then the topology system itself.
 *
 * @param system  topology system to free; must not be used afterwards
 */
void scclTopoFree(struct scclTopoSystem* system) {
    int type = 0;
    while(type < SCCL_TOPO_NODE_TYPES) {
        scclTopoRemovePathType(system, type);
        type++;
    }
    free(system);
}

SCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
SCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", 4);

/**
 * Number of channels to use between local GPU `g` and the GPU of rank `peerRank`.
 *
 *  - peer rank not in the local topology: the NChannelsPerNetPeer parameter;
 *  - peer is `g` itself: -1;
 *  - peer reached over PATH_NVL: (4 on gfx94, else 2) times the number of XGMI-speed
 *    multiples that fit in the path bandwidth (at least 1);
 *  - any other local path: 2.
 *
 * @param system     topology system
 * @param g          index of the local GPU
 * @param peerRank   rank of the peer
 * @param nChannels  out: channel count as described above
 * @return always scclSuccess
 */
static scclResult_t scclTopoGetNchannels(struct scclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
    int peer;
    if(scclTopoRankToIndex(system, peerRank, &peer) != scclSuccess) {
        // Remote rank, use network
        *nChannels = scclParamNChannelsPerNetPeer();
        return scclSuccess;
    }

    if(g == peer) {
        // Same rank
        *nChannels = -1;
        return scclSuccess;
    }

    // Local rank
    struct scclTopoLinkList* path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
    if(path->type != PATH_NVL) {
        *nChannels = 2;
        return scclSuccess;
    }

    float nvlBw       = scclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
    const int perLink = IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2;
    *nChannels        = perLink * std::max(1, (int)(path->bw / nvlBw));
    return scclSuccess;
}

// Lower bound applied to comm->p2pnChannels in scclTopoComputeP2pChannels, read through
// scclParamMinP2pNChannels(). The third macro argument (4) is presumably the default.
SCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 4);
// Upper bound applied to comm->p2pnChannels in scclTopoComputeP2pChannels, read through
// scclParamMaxP2pNChannels(). The third macro argument (MAXCHANNELS) is presumably the default.
SCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);

/**
 * Smallest power of two that is greater than or equal to `v`.
 * Values of `v` below 1 yield 1.
 */
static int nextPow2(int v) {
    int result;
    for(result = 1; result < v; result *= 2) {
    }
    return result;
}

/**
 * Compute the point-to-point channel configuration of a communicator.
 *
 * Sets comm->p2pnChannels (clamped between the Min/Max parameters, additionally capped by
 * sharedRes->tpP2pNChannels when the communicator does not own the shared resources, then
 * rounded up to a power of two), comm->p2pnChannelsPerPeer, initializes the channels
 * beyond comm->nChannels, and fills comm->p2pChannels with the bit-mirrored channel order.
 *
 * When scclTopoPathAllNVLink reports 1, each of the three parameters whose SCCL_*
 * environment variable is not set is replaced by 32.
 *
 * @param comm  communicator to configure
 * @return scclSuccess, or the error propagated by SCCLCHECK
 */
scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm) {
    /* here we already honor comm->max/minCTAs for p2pnChannels. */
    int minP2p  = (int)scclParamMinP2pNChannels();
    int maxP2p  = (int)scclParamMaxP2pNChannels();
    int perPeer = (int)scclParamNChannelsPerPeer();

    const int allNvlink = scclTopoPathAllNVLink(comm->topo);
    if(allNvlink == 1) {
        if(getenv("SCCL_MIN_P2P_NCHANNELS") == NULL)
            minP2p = 32;
        if(getenv("SCCL_MAX_P2P_NCHANNELS") == NULL)
            maxP2p = 32;
        if(getenv("SCCL_NCHANNELS_PER_PEER") == NULL)
            perPeer = 32;
    }

    // Clamp the channel count; a non-owner is additionally capped by the shared resources.
    int clamped = std::min(comm->nChannels, maxP2p);
    clamped     = std::max(clamped, minP2p);
    if(comm->sharedRes->owner != comm)
        clamped = std::min(clamped, comm->sharedRes->tpP2pNChannels);
    comm->p2pnChannels = clamped;

    // We need to loop through all local GPUs to have a global picture
    int fewest = comm->p2pnChannels;
    for(int gpu = 0; gpu < comm->topo->nodes[GPU].count; gpu++) {
        for(int rank = 0; rank < comm->nRanks; rank++) {
            int pairChannels;
            SCCLCHECK(scclTopoGetNchannels(comm->topo, gpu, rank, &pairChannels));
            if(pairChannels >= 0)
                fewest = std::min(fewest, pairChannels);
        }
    }

    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));

    // Round to next pow2 nChannelsPerPeer and nChannels
    if(getNumaMaxGpus() == 1 && !allNvlink) {
        comm->p2pnChannelsPerPeer = nextPow2(comm->p2pnChannels);
    } else if(perPeer == -2) {
        comm->p2pnChannelsPerPeer = nextPow2(fewest);
    } else {
        comm->p2pnChannelsPerPeer = perPeer;
    }
    comm->p2pnChannels = nextPow2(comm->p2pnChannels);

    // Init channels that weren't used so far
    for(int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++)
        SCCLCHECK(initChannel(comm, c));

    // We want to spread channels used when there aren't many and progressively
    // fill the whole space of nChannels. To do so we mirror the bits in the
    // nChannels space.
    for(int c = 0; c < comm->p2pnChannels; c++) {
        int mirrored = 0;
        int lowBit   = 1;
        int highBit  = comm->p2pnChannels >> 1;
        while(lowBit < comm->p2pnChannels) {
            if(c & lowBit)
                mirrored |= highBit;
            lowBit <<= 1;
            highBit >>= 1;
        }
        comm->p2pChannels[c] = mirrored;
    }
    return scclSuccess;
}

/**
 * List the ranks of the GPUs that the GPU of rank `rank` reaches over a PATH_NVB path.
 *
 * *ranks is allocated with scclCalloc with room for one entry per GPU in the system.
 *
 * @param system  topology system
 * @param rank    rank of the source GPU
 * @param nranks  out: number of entries written to *ranks
 * @param ranks   out: newly allocated array of peer ranks
 * @return scclSuccess, or the error propagated by SCCLCHECK
 */
scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks) {
    const int gpuCount = system->nodes[GPU].count;
    SCCLCHECK(scclCalloc(ranks, gpuCount));

    int found = 0;
    for(int g = 0; g < gpuCount; g++) {
        struct scclTopoNode* self = system->nodes[GPU].nodes + g;
        if(self->gpu.rank == rank) {
            for(int peer = 0; peer < gpuCount; peer++) {
                if(self->paths[GPU][peer].type != PATH_NVB)
                    continue;
                (*ranks)[found] = system->nodes[GPU].nodes[peer].gpu.rank;
                found++;
            }
        }
    }
    *nranks = found;
    return scclSuccess;
}

/**
 * Take the smallest path type over all ordered pairs of distinct GPUs, starting from
 * PATH_DIS, and report whether it lies below PATH_PIX.
 *
 * @param system  topology system
 * @return 1 when the smallest GPU-to-GPU path type is below PATH_PIX, 0 otherwise
 */
int scclTopoPathAllNVLink(struct scclTopoSystem* system) {
    int best = PATH_DIS;
    for(int src = 0; src < system->nodes[GPU].count; src++) {
        struct scclTopoLinkList* row = system->nodes[GPU].nodes[src].paths[GPU];
        for(int dst = 0; dst < system->nodes[GPU].count; dst++) {
            if(dst != src && row[dst].type < best)
                best = row[dst].type;
        }
    }
    if(best >= PATH_PIX)
        return 0;
    return 1;
}

} // namespace graph

/**
 * Run graph::printNodePaths on every GPU node, then on every NET node of the system.
 *
 * @param system  topology system
 * @return always scclSuccess
 */
scclResult_t scclTopoPrintPaths(struct scclTopoSystem* system) {
    int idx;
    for(idx = 0; idx < system->nodes[GPU].count; idx++) {
        struct scclTopoNode* gpuNode = system->nodes[GPU].nodes + idx;
        graph::printNodePaths(system, gpuNode);
    }
    for(idx = 0; idx < system->nodes[NET].count; idx++) {
        struct scclTopoNode* netNode = system->nodes[NET].nodes + idx;
        graph::printNodePaths(system, netNode);
    }
    return scclSuccess;
}

// Cached result of scclGetLevel for "SCCL_P2P_DISABLE" / "SCCL_P2P_LEVEL"; -1 means not resolved yet.
int scclTopoUserP2pLevel = -1;

/**
 * Decide whether two GPUs can talk to each other through P2P.
 *
 * *p2p is 1 when the path type between the two GPUs does not exceed the allowed level.
 * That level is the user level when one is set; otherwise PATH_SYS, lowered to PATH_PXB
 * on ARM CPUs and on x86 CPUs from Intel or Zhaoxin.
 *
 * @param system            topology system
 * @param id1               id of the first GPU (must exist)
 * @param id2               id of the second GPU; when the lookup returns
 *                          scclInternalError, *p2p stays 0
 * @param p2p               out: 1 when P2P can be used, 0 otherwise
 * @param read              out, optional: 1 when the path is PATH_NVL and both GPUs have
 *                          cudaCompCap 80, else 0
 * @param intermediateRank  out, optional: rank of the GPU relaying a 2-hop path, or -1
 * @return scclSuccess, or the error propagated by SCCLCHECK
 */
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank) {
    *p2p = 0;
    if(read)
        *read = 0;
    if(intermediateRank)
        *intermediateRank = -1;

    // Resolve both GPUs; an unknown second GPU simply means "no P2P"
    int idxA, idxB;
    SCCLCHECK(scclTopoIdToIndex(system, GPU, id1, &idxA));
    struct scclTopoNode* gpuA = system->nodes[GPU].nodes + idxA;
    if(scclTopoIdToIndex(system, GPU, id2, &idxB) == scclInternalError) {
        return scclSuccess;
    }

    // Report the relay GPU when the path goes through exactly one intermediate GPU
    struct scclTopoLinkList* path = gpuA->paths[GPU] + idxB;
    if(path->count == 2) {
        struct scclTopoNode* relay = path->list[0]->remNode;
        if(relay->type == GPU && intermediateRank)
            *intermediateRank = relay->gpu.rank;
    }

    // In general, use P2P whenever we can.
    int p2pLevel = PATH_SYS;

    if(scclTopoUserP2pLevel == -1) {
        SCCLCHECK(scclGetLevel(&scclTopoUserP2pLevel, "SCCL_P2P_DISABLE", "SCCL_P2P_LEVEL"));
    }
    if(scclTopoUserP2pLevel != -2) {
        // User override
        p2pLevel = scclTopoUserP2pLevel;
    } else {
        int arch, vendor, model;
        SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
        // Don't use P2P through ARM CPUs, nor through Intel / Zhaoxin x86 CPUs
        const bool restrictedX86 =
            arch == SCCL_TOPO_CPU_ARCH_X86 && (vendor == SCCL_TOPO_CPU_VENDOR_INTEL || vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN);
        if(arch == SCCL_TOPO_CPU_ARCH_ARM || restrictedX86)
            p2pLevel = PATH_PXB;
    }

    // Compute the PCI distance and compare with the p2pLevel.
    if(path->type <= p2pLevel)
        *p2p = 1;

    if(path->type == PATH_NVL && read) {
        struct scclTopoNode* gpuB = system->nodes[GPU].nodes + idxB;
        // Enable P2P Read for Ampere/NVLink only
        if(gpuA->gpu.cudaCompCap == gpuB->gpu.cudaCompCap && gpuA->gpu.cudaCompCap == 80)
            *read = 1;
    }

    return scclSuccess;
}

scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm) {
    // Precompute paths between GPUs/NICs.

    // Remove everything in case we're re-computing
    for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
        graph::scclTopoRemovePathType(system, t);

    // Set direct paths to CPUs. We need them in many cases.
    for(int c = 0; c < system->nodes[CPU].count; c++) {
        SCCLCHECK(graph::scclTopoSetPaths(system->nodes[CPU].nodes + c, system));
    }

    // Set direct paths to GPUs.
    for(int g = 0; g < system->nodes[GPU].count; g++) {
        SCCLCHECK(graph::scclTopoSetPaths(system->nodes[GPU].nodes + g, system));
    }

    // Set direct paths to NICs.
    for(int n = 0; n < system->nodes[NET].count; n++) {
        SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NET].nodes + n, system));
    }

    // Set direct paths to NVSwitches.
    for(int n = 0; n < system->nodes[NVS].count; n++) {
        SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NVS].nodes + n, system));
    }

    // Update path for GPUs when we don't want to / can't use GPU Direct P2P
    for(int g = 0; g < system->nodes[GPU].count; g++) {
        for(int p = 0; p < system->nodes[GPU].count; p++) {
            int p2p;
            SCCLCHECK(scclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
            if(p2p == 0) {
                // Divert all traffic through the CPU
                int cpu;
                SCCLCHECK(getLocalCpu(system, g, &cpu));
                SCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
            }
        }

        if(comm == NULL)
            continue;
        // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
        struct scclPeerInfo* dstInfo = comm->peerInfo + system->nodes[GPU].nodes[g].gpu.rank;
        for(int p = 0; p < system->nodes[GPU].count; p++) {
            if(p == g)
                continue;
            struct scclPeerInfo* srcInfo = comm->peerInfo + system->nodes[GPU].nodes[p].gpu.rank;
            int p2p;
            SCCLCHECK(scclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
            if(p2p == 0) {
                int shm;
                SCCLCHECK(scclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
                if(shm == 0) {
                    // Mark this peer as inaccessible. We'll trim it later.
                    system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
                }
            }
        }
    }

    // Special handling of gfx94x

#if !defined(TOPO_EXPL)
    char strValue[1024];
    SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
    if(strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
        int arch, vendor, model;
        SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
        if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
           ((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
            (system->nodes[GPU].count != system->nRanks))) {
            if(!rcclPathOverride(system, 0x100000))
                rcclPathOverride(system, 0x1000);
        }
#if !defined(TOPO_EXPL)
    }
#endif

    // Update paths for NICs (no GPU Direct, PXN, ...)
    for(int n = 0; n < system->nodes[NET].count; n++) {
        struct scclTopoNode* netNode = system->nodes[NET].nodes + n;

        for(int g = 0; g < system->nodes[GPU].count; g++) {
            // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
            struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
            if(scclPxnDisable(comm) != 1) {
                int localGpuIndex;
                SCCLCHECK(scclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
                if(localGpuIndex != g && localGpuIndex != -1) {
                    // PXN = PCI + NVLink.
                    struct scclTopoNode* peerNode = system->nodes[GPU].nodes + localGpuIndex;
                    // Only use PXN for NIC n if the peer GPU (localGpuIndex) ...
                    if(peerNode->paths[NET][n].type <= PATH_PXB &&            // Is connected to the NIC through PCI
                       peerNode->paths[GPU][g].type <= PATH_NVL &&            // Is connected to us through NVLink
                       (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
                        gpu->paths[NET][n].type > PATH_PXB))                  // or avoids going through a CPU
                        // We can use that GPU as relay to communicate with that NIC.
                        // Only enabling it in the GPU->NIC direction for now to favor
                        // receiving locally and sending remotely (consistent with net.cc)
                        SCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
                }
            }
            // Update path when we don't want to / can't use GPU Direct RDMA.
            int gdr;
            SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
            if(gdr == 0) {
                // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
                int localCpu;
                SCCLCHECK(getLocalCpu(system, g, &localCpu));
                SCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
                SCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
            }
        }
    }
    return scclSuccess;
}

} // namespace topology
} // namespace hardware
} // namespace sccl