search.cc

#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include "rome_models.h"

namespace sccl {
namespace hardware {
namespace topology {
namespace detect {

// Declares the "CrossNic" tunable with key "CROSS_NIC" and default value 2.
// scclParamCrossNic(), called further down in this file, is presumably the accessor
// generated by this macro — confirm against the SCCL_PARAM definition.
SCCL_PARAM(CrossNic, "CROSS_NIC", 2);

// Highest path bandwidth from `gpu` to any node of kind `type`.
// Targets whose path is empty (count == 0) are ignored; 0.0 is returned when
// no target has a usable path. scclTopoSearchInit uses the result to
// initialize system->maxBw, the per-channel (i.e. per-SM) max bw.
static float getMaxBw(struct scclTopoSystem* system, struct scclTopoNode* gpu, int type) {
    const int nTargets = system->nodes[type].count;
    float best         = 0.0;
    for(int idx = 0; idx < nTargets; idx++) {
        struct scclTopoLinkList* candidate = &gpu->paths[type][idx];
        float bw                           = candidate->bw;
        if(candidate->count != 0)
            best = std::max(best, bw);
    }
    return best;
}
// Aggregate bandwidth available to `gpu` over its direct links: the larger of
// (a) the sum of all LINK_NVL link bandwidths and (b) the bandwidth of the
// last LINK_PCI link encountered (PCI bandwidth is overwritten, not summed).
static float getTotalBw(struct scclTopoSystem* system, struct scclTopoNode* gpu) {
    float pciBw  = 0.0;
    float nvlSum = 0.0;
    for(int idx = 0; idx < gpu->nlinks; idx++) {
        struct scclTopoLink* cur = &gpu->links[idx];
        if(cur->type == LINK_PCI)
            pciBw = cur->bw;
        if(cur->type == LINK_NVL)
            nvlSum += cur->bw;
    }
    return std::max(pciBw, nvlSum);
}
// Compute system->maxBw and system->totalBw before a graph search.
//  - Single GPU and no NET node: maxBw is LOC_BW and totalBw stays 0.
//  - Otherwise: maxBw is the best GPU->NET path bandwidth when NET nodes exist,
//    else the best GPU->GPU path bandwidth; totalBw is the best getTotalBw()
//    over all GPUs.
// Always returns scclSuccess.
scclResult_t scclTopoSearchInit(struct scclTopoSystem* system) {
    const int nNets = system->nodes[NET].count;
    const int nGpus = system->nodes[GPU].count;

    system->totalBw = 0.0;
    system->maxBw   = 0.0;

    if(nNets == 0 && nGpus == 1) {
        system->maxBw = LOC_BW;
        return scclSuccess;
    }

    // Paths towards NICs matter when there are any, GPU peers otherwise.
    const int peerType = nNets ? NET : GPU;
    for(int idx = 0; idx < system->nodes[GPU].count; idx++) {
        struct scclTopoNode* cur = &system->nodes[GPU].nodes[idx];
        system->maxBw            = std::max(system->maxBw, getMaxBw(system, cur, peerType));
        system->totalBw          = std::max(system->totalBw, getTotalBw(system, cur));
    }
    return scclSuccess;
}

// Find the link that goes back from node2 to node1.
// On success *revLink points at the first link of node2 whose remote node is
// node1. Returns scclInternalError (after a warning) when node2 has no such link.
static scclResult_t findRevLink(struct scclTopoNode* node1, struct scclTopoNode* node2, struct scclTopoLink** revLink) {
    struct scclTopoLink* candidate = node2->links;
    for(int remaining = node2->nlinks; remaining > 0; remaining--, candidate++) {
        if(candidate->remNode != node1)
            continue;
        *revLink = candidate;
        return scclSuccess;
    }
    WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
    return scclInternalError;
}

// Subtract b from a in place, rounding the result to 3 decimal places.
// This is unfortunately needed since manipulating floats often results in rounding errors.
// Both arguments are parenthesized so that compound expressions (e.g. `x + y`)
// are subtracted as a whole. Note that `a` is evaluated twice and must be an lvalue.
#define SUB_ROUND(a, b) ((a) = roundf(((a) - (b)) * 1000) / 1000)

// Walk the first `maxSteps` links of `path`, starting from `start`, and subtract
// `bw` from the remaining bandwidth of every link traversed. Passing a negative
// `bw` gives bandwidth back, which is how scclTopoFollowPath rewinds.
//  - path     : list of links to follow.
//  - start    : node the path starts from.
//  - maxSteps : number of links of `path` to process.
//  - bw       : bandwidth to take from (or, if negative, return to) each link.
//  - steps    : out; number of links processed. A value lower than `maxSteps`
//               is the index of the first link lacking bandwidth; links before
//               it have already been debited.
// Running out of bandwidth is NOT an error (scclSuccess is returned); the only
// failures come from findRevLink.
static scclResult_t followPath(struct scclTopoLinkList* path, struct scclTopoNode* start, int maxSteps, float bw, int* steps) {
    // Bandwidth charged on PCI links; may be adjusted below for Intel CPUs.
    float pciBw = bw;
    // Scan the whole path (not only the first maxSteps links) for a CPU hop.
    for(int step = 0; step < path->count; step++) {
        struct scclTopoNode* node = path->list[step]->remNode;
        if(node->type == CPU) {
            // Account for P2P inefficiency through Intel CPU RC
            if(path->type == PATH_PHB && start->type == GPU && node->cpu.arch == SCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
                pciBw = INTEL_P2P_OVERHEAD(bw);
            }
        }
    }

    // `node` tracks the node we are currently leaving; it is the far end of the reverse link.
    struct scclTopoNode* node = start;
    for(int step = 0; step < maxSteps; step++) {
        struct scclTopoLink* link    = path->list[step];
        struct scclTopoLink* revLink = NULL;
        float fwBw                   = link->type == LINK_PCI ? pciBw : bw;
        // Extra bandwidth charged on the reverse link (remote node -> current node), if any.
        float revBw                  = 0;
        // Remote GPU with compute capability below 80 reached from a non-GPU start:
        // charge 1/8 of the forward bandwidth on the reverse link.
        if(link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
            if(revLink == NULL)
                SCCLCHECK(findRevLink(node, link->remNode, &revLink));
            revBw += fwBw / 8;
        }
        // NVL link into a CPU: charge the full forward bandwidth on the reverse link as well.
        if(link->remNode->type == CPU && link->type == LINK_NVL) {
            if(revLink == NULL)
                SCCLCHECK(findRevLink(node, link->remNode, &revLink));
            revBw += fwBw;
        }
        // Not enough bandwidth on this link (or its reverse): stop and report how far we got.
        // revLink is non-NULL whenever revBw is non-zero, since both are set together above.
        if(link->bw < fwBw || (revBw && revLink->bw < revBw)) {
            *steps = step;
            return scclSuccess;
        }
        SUB_ROUND(link->bw, fwBw);
        if(revBw)
            SUB_ROUND(revLink->bw, revBw);
        node = link->remNode;
    }
    *steps = maxSteps;
    return scclSuccess;
}

// Try to go from node type1/index1 to node type2/index2.
// `mult` scales the graph bandwidth applied to every link of the path: a positive
// value counts the bandwidth (1), a negative value undoes it (-1). The link-type
// restrictions below are only enforced when mult == 1.
// On return *node is the destination node when the move is possible, or NULL when
// the path type is not allowed or a link lacks bandwidth (in which case any partial
// accounting has already been rewound). type1 == -1 means "no source": the
// destination is returned without touching any bandwidth.
static scclResult_t scclTopoFollowPath(
    struct scclTopoSystem* system, struct scclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct scclTopoNode** node) {
    struct scclTopoNode* dst = system->nodes[type2].nodes + index2;

    // First handle easy cases
    *node = dst;
    if(type1 == -1)
        return scclSuccess;

    struct scclTopoNode* src         = system->nodes[type1].nodes + index1;
    struct scclTopoLinkList* forward = src->paths[type2] + index2;
    struct scclTopoLinkList* reverse = dst->paths[type1] + index1;

    if(forward == NULL) {
        WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
        return scclInternalError;
    }
    // Empty path: nothing to account for, *node already points at the destination.
    if(forward->count == 0)
        return scclSuccess;

    // Now check link type. From here on *node stays NULL unless the move succeeds.
    *node             = NULL;
    const int isIntra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
    const int maxType = isIntra ? graph->typeIntra : graph->typeInter;
    float pathBw      = isIntra ? graph->bwIntra : graph->bwInter;

    if(mult == 1) {
        if(forward->type > maxType)
            return scclSuccess;
        // Tree-like patterns also need the way back to be of an acceptable type.
        const int treeLike = graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == SCCL_TOPO_PATTERN_TREE ||
                             graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE;
        if(treeLike && (reverse->type > maxType))
            return scclSuccess;
    }

    pathBw *= mult;

    // Check there is enough bandwidth on paths.
    int reached = 0;
    SCCLCHECK(followPath(forward, src, forward->count, pathBw, &reached));
    if(reached < forward->count) {
        // Not enough bandwidth : rewind and exit.
        SCCLCHECK(followPath(forward, src, reached, -pathBw, &reached));
        return scclSuccess;
    }

    // Enough bandwidth : return destination node.
    graph->nHops += mult * forward->count;
    *node = dst;
    return scclSuccess;
}

// PCI bandwidth of `gpu`: for the first LINK_PCI link whose remote node links
// back to `gpu`, the smaller of the two directions' bandwidths, converted to int.
// Returns -1 when no such pair of links exists.
static int gpuPciBw(struct scclTopoNode* gpu) {
    for(int up = 0; up < gpu->nlinks; up++) {
        struct scclTopoLink* toPci = &gpu->links[up];
        if(toPci->type == LINK_PCI) {
            struct scclTopoNode* peer = toPci->remNode;
            // Look for the link coming back from the PCI peer to this GPU.
            for(int down = 0; down < peer->nlinks; down++) {
                struct scclTopoLink* back = &peer->links[down];
                if(back->remNode == gpu)
                    return std::min(toPci->bw, back->bw);
            }
        }
    }
    return -1;
}

/* Ranking record for a candidate "next GPU". The order in which candidates are
   tried is critical for the search to quickly converge to the best solution even
   if it eventually times out. */
struct scclGpuScore {
    int g;          // Index of the GPU this score belongs to
    int startIndex; // Distance from the current GPU in index order (least important)
    int intraNhops; // Hop count of the GPU->GPU path
    int intraBw;    // Bandwidth of the GPU->GPU path
    int interNhops; // Hop count of the NET->GPU path
    int interPciBw; // PCI bandwidth of the GPU
    int interBw;    // Bandwidth of the NET->GPU path (most important)
};

// qsort() comparator over struct scclGpuScore. Keys, in decreasing priority:
// interBw (higher first), interPciBw (higher first), interNhops (lower first),
// startIndex (higher first), intraBw (higher first), intraNhops (lower first),
// and finally startIndex (lower first).
static int cmpScore(const void* g1, const void* g2) {
    const struct scclGpuScore* lhs = (const struct scclGpuScore*)g1;
    const struct scclGpuScore* rhs = (const struct scclGpuScore*)g2;

    int delta = rhs->interBw - lhs->interBw;
    if(delta != 0)
        return delta;

    delta = rhs->interPciBw - lhs->interPciBw;
    if(delta != 0)
        return delta;

    delta = lhs->interNhops - rhs->interNhops;
    if(delta != 0)
        return delta;

    delta = rhs->startIndex - lhs->startIndex;
    if(delta != 0)
        return delta;

    delta = rhs->intraBw - lhs->intraBw;
    if(delta != 0)
        return delta;

    delta = lhs->intraNhops - rhs->intraNhops;
    if(delta != 0)
        return delta;

    return lhs->startIndex - rhs->startIndex;
}

// Returns 0 when the first `count` scores all share the intraBw and intraNhops
// of scores[0], 1 as soon as one of them differs.
static int cmpIntraScores(struct scclGpuScore* scores, int count) {
    const int refBw    = scores[0].intraBw;
    const int refNhops = scores[0].intraNhops;
    int idx            = 1;
    while(idx < count) {
        const int same = (scores[idx].intraBw == refBw) && (scores[idx].intraNhops == refNhops);
        if(!same)
            return 1;
        idx++;
    }
    return 0;
}

// Translate a rank into the index of the GPU node carrying it.
// On success *index is the first matching GPU index; otherwise a warning is
// emitted and scclInternalError is returned (*index is left untouched).
static scclResult_t getGpuIndex(struct scclTopoSystem* system, int rank, int* index) {
    const int ngpus = system->nodes[GPU].count;
    int pos         = 0;
    while(pos < ngpus && system->nodes[GPU].nodes[pos].gpu.rank != rank)
        pos++;
    if(pos < ngpus) {
        *index = pos;
        return scclSuccess;
    }
    WARN("Could not find gpu rank %d", rank);
    return scclInternalError;
}

// Translate a NET node id into its index in system->nodes[NET].
// On success *index is the first matching index; otherwise a warning is
// emitted and scclInternalError is returned (*index is left untouched).
static scclResult_t getNetIndex(struct scclTopoSystem* system, int64_t id, int* index) {
    const int nnets = system->nodes[NET].count;
    for(int pos = 0; pos < nnets; pos++) {
        if(system->nodes[NET].nodes[pos].id != id)
            continue;
        *index = pos;
        return scclSuccess;
    }
    WARN("Could not find net id %lx", id);
    return scclInternalError;
}

// Fetch the NET->GPU path table of the NIC recorded as the first "inter" entry
// of the channel currently being built (graph->inter[2 * nChannels]).
// Fails with the error of getNetIndex when that NIC id is unknown.
static scclResult_t getNetPaths(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoLinkList** netPaths) {
    int netIndex;
    int startNetId = graph->inter[graph->nChannels * 2];
    SCCLCHECK(getNetIndex(system, startNetId, &netIndex));
    struct scclTopoNode* startNet = &system->nodes[NET].nodes[netIndex];
    *netPaths                     = startNet->paths[GPU];
    return scclSuccess;
}

// Build the ordered list of GPUs to try after `gpu` for the current channel.
//  - next     : out; GPU indices, best candidate first.
//  - countPtr : out; number of entries written to `next`.
//  - sortNet  : 0 ranks on GPU->GPU path quality only; non-zero also ranks on
//               the paths from the channel's starting NIC; -1 additionally
//               reverses the order when every candidate has the same
//               intra-node score.
// GPUs without a path from `gpu`, or already used by this channel, are skipped.
scclResult_t
scclTopoSearchNextGpuSort(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
    const int ngpus              = system->nodes[GPU].count;
    struct scclTopoNode* allGpus = system->nodes[GPU].nodes;
    // Bit marking "used by the channel being built" in scclTopoNode::used.
    const uint64_t channelBit = 1ULL << (graph->nChannels);

    struct scclTopoLinkList* gpuPaths = gpu->paths[GPU];
    struct scclTopoLinkList* netPaths = NULL;
    if(sortNet)
        SCCLCHECK(getNetPaths(system, graph, &netPaths));

    struct scclGpuScore scores[SCCL_TOPO_MAX_NODES];
    memset(scores, 0, ngpus * sizeof(struct scclGpuScore));

    const int self = gpu - allGpus;
    int nCand      = 0;
    // Visit the other GPUs in index order, starting right after `gpu` and wrapping around.
    for(int offset = 1; offset < ngpus; offset++) {
        const int cand      = (self + offset) % ngpus;
        const int reachable = gpuPaths[cand].count != 0;
        const int taken     = (allGpus[cand].used & channelBit) != 0;
        if(!reachable || taken)
            continue;

        struct scclGpuScore* entry = &scores[nCand++];
        entry->g                   = cand;
        entry->startIndex          = offset;
        entry->intraNhops          = gpuPaths[cand].count;
        entry->intraBw             = gpuPaths[cand].bw;
        if(netPaths) {
            entry->interNhops = netPaths[cand].count;
            entry->interPciBw = gpuPciBw(allGpus + cand);
            entry->interBw    = netPaths[cand].bw;
        }
    }

    // Sort GPUs
    qsort(scores, nCand, sizeof(struct scclGpuScore), cmpScore);

    // Check if all have the same intra-node score in which case we go reverse for sortNet = -1
    const int reversed = (sortNet == -1) && (cmpIntraScores(scores, nCand) == 0);
    for(int out = 0; out < nCand; out++)
        next[out] = scores[reversed ? nCand - 1 - out : out].g;

    *countPtr = nCand;
    return scclSuccess;
}

// Forward declaration: scclTopoSearchRecGpu calls scclTopoSearchRec once a channel is
// complete, and scclTopoSearchRec (defined further down) leads back into scclTopoSearchRecGpu.
scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time);

// Try to keep all searches within one second
// NOTE(review): these budgets are not consumed in this part of the file; presumably they
// seed the `time` counter that scclTopoSearchRecGpu decrements once per call, i.e. they
// count search steps rather than wall-clock time — confirm at the use sites.
#define SCCL_SEARCH_GLOBAL_TIMEOUT (5ULL << 16)
#define SCCL_SEARCH_TIMEOUT (1 << 14)
#define SCCL_SEARCH_TIMEOUT_TREE (1 << 14)
#define SCCL_SEARCH_TIMEOUT_SAMECHANNELS (1 << 8)

// Values of the `forcedOrder` argument of the search functions (0 means normal search).
// FORCED_ORDER_PCI    : the next GPU is always GPU index step + 1.
// FORCED_ORDER_REPLAY : the next GPU is the one the previous channel used at that step.
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2

// Look up which GPU the previous channel used at position `step + 1`.
// *g receives that GPU's index. It is set to -1 and scclInternalError is
// returned when there is no previous channel (nChannels == 0) or when the
// recorded rank does not belong to any GPU. Passing step == -1 yields the
// first GPU of the previous channel.
scclResult_t scclTopoReplayGetGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, int step, int* g) {
    *g = -1;
    if(graph->nChannels == 0)
        return scclInternalError;

    const int ngpus      = system->nodes[GPU].count;
    const int prevStart  = (graph->nChannels - 1) * ngpus;
    const int wantedRank = graph->intra[prevStart + step + 1];

    int pos = 0;
    while(pos < ngpus && system->nodes[GPU].nodes[pos].gpu.rank != wantedRank)
        pos++;
    if(pos >= ngpus)
        return scclInternalError;

    *g = pos;
    return scclSuccess;
}

// Forward declaration: scclTopoSearchTryGpu and scclTopoSearchTryNvls below recurse into
// scclTopoSearchRecGpu, which is defined later in this file.
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system,
                                  struct scclTopoGraph* graph,
                                  struct scclTopoGraph* saveGraph,
                                  struct scclTopoNode* gpu,
                                  int step,
                                  int backToNet,
                                  int backToFirstRank,
                                  int forcedOrder,
                                  int* time);

// Try to extend the current channel with GPU index `g`, coming from node
// type/index (type == -1 means there is no previous node).
// When the path has enough bandwidth, the GPU is flagged as used by the
// current channel, the search continues from it at `step`, and afterwards the
// flag and the reserved bandwidth are both restored. When the path is not
// usable nothing happens. The remaining arguments are forwarded unchanged to
// scclTopoSearchRecGpu.
scclResult_t scclTopoSearchTryGpu(struct scclTopoSystem* system,
                                  struct scclTopoGraph* graph,
                                  struct scclTopoGraph* saveGraph,
                                  int step,
                                  int backToNet,
                                  int backToFirstRank,
                                  int forcedOrder,
                                  int* time,
                                  int type,
                                  int index,
                                  int g) {
    struct scclTopoNode* target;
    SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, 1, &target));
    if(target == NULL)
        return scclSuccess;

    // One "used" bit per channel.
    const uint64_t channelBit = 1ULL << (graph->nChannels);
    target->used ^= channelBit;
    SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, target, step, backToNet, backToFirstRank, forcedOrder, time));
    target->used ^= channelBit;

    // Give the bandwidth back.
    SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, -1, &target));
    return scclSuccess;
}

static int scclTopoCountXGMI(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
    int ngpus = system->nodes[GPU].count;
    int count = 0;
    for(int c = 0; c < graph->nChannels; c++) {
        for(int i = 0; i < ngpus; i++) {
            int g = graph->intra[ngpus * c + i];
            int n = graph->intra[ngpus * c + ((i + 1) % ngpus)];
            struct scclTopoNode* node;
            int j;
            for(j = 0; j < ngpus; j++)
                if(system->nodes[GPU].nodes[j].gpu.rank == g)
                    break;
            if(j < ngpus) {
                node = system->nodes[GPU].nodes + j;
                for(int k = 0; k < system->nodes[GPU].count; k++) {
                    if(node->paths[GPU][k].count == 1) {
                        struct scclTopoLink* link    = node->paths[GPU][k].list[0];
                        struct scclTopoNode* remNode = link->remNode;
                        if(remNode->gpu.rank == n) {
                            if(link->type == LINK_NVL)
                                count++;
                        }
                    }
                }
            }
        }
    }
    return count;
}

// NVLS pattern: try to reserve bandwidth between NVS node 0 and every GPU, in both
// directions, then move on to the next channel.
//  - g     : index of the GPU heading this channel; its paths are charged with
//            mult 2 (twice the graph bandwidth) instead of 1.
//  - ngpus : passed as `step` to scclTopoSearchRecGpu, which makes it treat the
//            channel as complete.
// Every reservation that succeeded is undone before returning, whether or not the
// search went further.
scclResult_t scclTopoSearchTryNvls(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int g, int ngpus, int* time) {
    struct scclTopoNode* nvs;
    struct scclTopoNode* gpu;
    int d0 = 0; // See if there is enough bandwidth for NVS->GPU traffic
    do {
        SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu));
        d0++;
    } while(gpu && d0 < system->nodes[GPU].count);
    if(gpu == NULL) {
        // The last attempt failed and scclTopoFollowPath already rewound it: exclude it from the undo loop.
        d0--;
    } else {
        int d1 = 0; // See if there is enough bandwidth for GPU->NVS traffic
        do {
            SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs));
            d1++;
        } while(nvs && d1 < system->nodes[GPU].count);
        if(nvs == NULL) {
            // Same as above: the failed GPU->NVS attempt must not be undone a second time.
            d1--;
        } else { // Both directions worked. Move on to the next path.
            SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
        }
        // Undo the GPU->NVS reservations, last one first, with the opposite multiplier.
        while(d1) {
            d1--;
            SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs));
        }
    }
    // Undo the NVS->GPU reservations, last one first, with the opposite multiplier.
    while(d0) {
        d0--;
        SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu));
    }
    return scclSuccess;
}

// Decide whether `graph` (the candidate) should replace `refGraph` (the best so far).
// *copy is set to 1 when the candidate is better and is left untouched otherwise;
// the caller is expected to initialize it. Always returns scclSuccess.
scclResult_t scclTopoCompareGraphs(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* refGraph, int* copy) {
    // 1. Try to get the same nChannels between Rings and Trees
    if(graph->nChannels < graph->minChannels)
        return scclSuccess;

    // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
    if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
        const int hasMore        = graph->nChannels > refGraph->nChannels;
        const int withinGpuCount = graph->nChannels <= system->nodes[GPU].count;
        if(hasMore && withinGpuCount)
            *copy = 1;
        return scclSuccess;
    }

    // 2. Try to get better bandwidth
    // Give a 15% perf bonus to paths not crossing nics
    float target     = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
    const int faster = graph->nChannels * graph->bwIntra > refGraph->nChannels * refGraph->bwIntra * target;
    const int slower = graph->nChannels * graph->bwIntra < refGraph->nChannels * refGraph->bwIntra * target;
    if(faster)
        *copy = 1;
    if(faster || slower)
        return scclSuccess;

    // Equal bandwidth from here on.
    // 3. Less hops
    const int samePattern  = graph->pattern == refGraph->pattern;
    const int sameCrossNic = graph->crossNic == refGraph->crossNic;
    if(samePattern && sameCrossNic && graph->nHops < refGraph->nHops)
        *copy = 1;

    // 4. Prefer graph with more XGMI connections
    if(graph->nChannels == refGraph->nChannels) {
        if(scclTopoCountXGMI(system, refGraph) < scclTopoCountXGMI(system, graph))
            *copy = 1;
    }
    return scclSuccess;
}

// Build a list of the best NETs to try.
//
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
//  index when trying to get back to the NIC.
//
// The list is built the following way:
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
//    based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
//    might have been chosen by GPU 0 (case with multiple independent communicators per node)
// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
//
// Parameters:
//  - typeInter   : largest path type (inclusive) a NET may have to be selected.
//  - nets        : out; NET indices, best first. The caller must provide room for
//                  system->nodes[NET].count entries.
//  - netCountRet : out; number of entries written to `nets`.
// Returns scclSuccess, or the error of scclCalloc.

scclResult_t scclTopoSelectNets(struct scclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
    int netCount = 0;
    int localNetCount;
    int* localNets;
    SCCLCHECK(scclCalloc(&localNets, system->nodes[NET].count));

    for(int t = 0; t <= typeInter; t++) {
        for(int g = 0; g < system->nodes[GPU].count; g++) {
            if(gpu != -1 && gpu != g)
                continue;
            localNetCount                  = 0;
            struct scclTopoNode* gpuNode   = system->nodes[GPU].nodes + g;
            struct scclTopoLinkList* paths = gpuNode->paths[NET];
            for(int n = 0; n < system->nodes[NET].count; n++) {
                if(paths[n].type == t)
                    localNets[localNetCount++] = n;
            }
            if(localNetCount == 0)
                continue;
            // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
            // with multiple NICs don't use the same one as first choice.
            for(int r = 0; r < gpuNode->gpu.dev % localNetCount; r++) {
                int net0 = localNets[0];
                for(int i = 0; i < localNetCount - 1; i++)
                    localNets[i] = localNets[i + 1];
                localNets[localNetCount - 1] = net0;
            }
            // Append NICs to list
            for(int i = 0; i < localNetCount; i++) {
                int n     = localNets[i];
                int found = 0;
                // Check the bound first: nets[netCount] is not a valid entry yet and,
                // once the list is full, lies past the end of the caller's array.
                while(found < netCount && nets[found] != n)
                    found++;
                if(found == netCount)
                    nets[netCount++] = n;
            }
        }
    }

    *netCountRet = netCount;
    free(localNets);

    return scclSuccess;
}

// Core recursive step of the search: `gpu` has just been placed at position `step`
// of the channel being built (index graph->nChannels).
//  - saveGraph       : best graph found so far; overwritten when a better one completes.
//  - step            : position of `gpu` in the channel; step == ngpus means the channel is complete.
//  - backToNet       : step at which the channel must go back to a NIC, or -1.
//  - backToFirstRank : step at which the channel must loop back to its first GPU, or -1.
//  - forcedOrder     : 0 for a normal search, FORCED_ORDER_PCI or FORCED_ORDER_REPLAY otherwise.
//  - time            : remaining search budget, decremented on every call. The search stops
//                      when it reaches 0; it is set to -1 once a graph with maxChannels
//                      channels has been saved.
// Every bandwidth reservation made here is undone before returning.
// NOTE(review): SCCLCHECK presumably returns from the function on error; if so, `nets`
// allocated in the back-to-NIC branch is leaked on those error paths — confirm.
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system,
                                  struct scclTopoGraph* graph,
                                  struct scclTopoGraph* saveGraph,
                                  struct scclTopoNode* gpu,
                                  int step,
                                  int backToNet,
                                  int backToFirstRank,
                                  int forcedOrder,
                                  int* time) {
    // Out of budget: give up silently.
    if((*time) <= 0)
        return scclSuccess;
    (*time)--;

    int ngpus = system->nodes[GPU].count;
    if(step == ngpus) {
        // Determine whether we found a better solution or not
        int copy = 0;
        graph->nChannels++;
        SCCLCHECK(scclTopoCompareGraphs(system, graph, saveGraph, &copy));
        if(copy) {
            memcpy(saveGraph, graph, sizeof(struct scclTopoGraph));
            // Best possible channel count reached: stop the whole search.
            if(graph->nChannels == graph->maxChannels)
                *time = -1;
        }
        // Try to add one more channel on top of this one.
        if(graph->nChannels < graph->maxChannels) {
            SCCLCHECK(scclTopoSearchRec(system, graph, saveGraph, time));
        }
        graph->nChannels--;
        return scclSuccess;
    }
    // Record this GPU's rank at position `step` of the current channel.
    // (`gpu` may be NULL only when step == ngpus, which returned above.)
    graph->intra[graph->nChannels * ngpus + step] = gpu->gpu.rank;
    int g                                         = gpu - system->nodes[GPU].nodes;
    if(step == backToNet) {
        // first get back to NIC
        if(system->nodes[NET].count) {
            int startNetIndex;
            SCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels * 2], &startNetIndex));
            struct scclTopoNode* startNet = system->nodes[NET].nodes + startNetIndex;
            int netcount;
            int* nets;
            SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
            SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
            for(int i = 0; i < netcount; i++) {
                int n                    = nets[i];
                struct scclTopoNode* net = system->nodes[NET].nodes + n;
                if(graph->pattern == SCCL_TOPO_PATTERN_TREE && net->id != startNet->id)
                    continue; // Trees are symmetric
                // Unless crossNic is 1, only exit through the same asic/port as the entry NIC.
                if(graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port))
                    continue;

                // Balanced Tree : count half of the bandwidth on first two GPUs
                int nextBackToNet = -1;
                float bwInterSave = graph->bwInter;
                if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE) {
                    // Count half of the bandwidth on each of the first two GPUs
                    if(step == 0)
                        nextBackToNet = 1;
                    else if(net->id != graph->inter[graph->nChannels * 2 + 1])
                        continue;
                    graph->bwInter /= 2;
                }

                // On failure scclTopoFollowPath sets `net` to NULL and this NIC is skipped.
                SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
                graph->bwInter = bwInterSave;
                if(net) {
                    graph->inter[graph->nChannels * 2 + 1] = net->id;
                    // Same GPU and same step, but with backToNet replaced so this branch is not taken again.
                    SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));

                    // Undo the reservation with the same (possibly halved) bandwidth used to make it.
                    if(graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE)
                        graph->bwInter /= 2;
                    SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
                    graph->bwInter = bwInterSave;
                }
            }
            free(nets);
        }
    } else if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
        SCCLCHECK(scclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
    } else if(step < system->nodes[GPU].count - 1) {
        // Go to next GPU
        int next[SCCL_TOPO_MAX_NODES];
        int count;
        if(forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
            next[0] = step + 1;
            count   = 1;
        } else if(forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
            SCCLCHECK(scclTopoReplayGetGpu(system, graph, step, next));
            count = 1;
        } else { // Normal search
            // sortNet: 0 when the channel never returns to a NIC, 1 when the next step is the
            // one returning to the NIC, -1 otherwise.
            SCCLCHECK(scclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step + 1 ? 1 : -1));
        }
        for(int i = 0; i < count; i++) {
            SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, step + 1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
        }
    } else if(step == backToFirstRank) {
        // Find first GPU and loop back to it
        int p;
        SCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels * ngpus], &p));
        struct scclTopoNode* firstGpu;
        SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
        if(firstGpu) {
            SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step + 1, backToNet, -1, forcedOrder, time));
            SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
        }
    } else {
        // Next path
        SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
    }
    return scclSuccess;
}

// Start a new channel from a NIC: for every candidate NET (as ordered by
// scclTopoSelectNets), reserve graph->bwInter on it, try the possible first GPUs,
// then give the NIC bandwidth back.
//  - backToNet / backToFirstRank : forwarded to the GPU search (see scclTopoSearchParams).
//  - time                        : remaining search budget, shared with the GPU search.
// NOTE(review): `bw` is an int; if graph->bwInter is a float (as its use elsewhere in
// this file suggests) the value is truncated here — confirm this is intended.
// NOTE(review): SCCLCHECK presumably returns on error; if so `nets` is leaked on those
// paths and the NIC bandwidth is not restored — confirm.
scclResult_t scclTopoSearchRecNet(
    struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
    const int bw = graph->bwInter;
    int* nets;
    SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
    int netcount;
    SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
    for(int i = 0; i < netcount; i++) {
        int n                    = nets[i];
        struct scclTopoNode* net = system->nodes[NET].nodes + n;
        struct scclTopoNode* gpu;
        // Skip NICs without collNet support when the graph requires it, and NICs without enough bandwidth left.
        if(graph->collNet && net->net.collSupport == 0)
            continue;
        if(net->net.bw < bw)
            continue;

        // Record this NIC as the entry point of the channel.
        graph->inter[graph->nChannels * 2] = net->id;
        graph->latencyInter                = net->net.latency;

        // Reserve the bandwidth on every NET node sharing this NIC's asic and port.
        for(int i = 0; i < system->nodes[NET].count; i++) {
            if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) {
                system->nodes[NET].nodes[i].net.bw -= bw;
            }
        }

        // NVLS needs to balance on all NICs
        if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
            // NOTE(review): nets[graph->nChannels] is used as a GPU index and is not checked
            // against netcount — verify against the callers' channel limits.
            SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
        } else {
            if(graph->nChannels > 0) {
                // Try to replay the last channel
                int g;
                SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
                SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
            }
            if(graph->nChannels == 0 || graph->sameChannels == 0) {
                if(graph->nChannels == 0) {
                    // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
                    struct scclTopoLinkList* paths = net->paths[GPU];
                    int f = 0, f_gdr = 0;
                    // find the first GPU that is closest to NIC
                    // NOTE(review): `f` and `f_gdr` are computed here but the call below starts
                    // from GPU 0, not `f` — confirm whether that is intended.
                    for(int i = 0; i < system->nodes[GPU].count; i++) {
                        if(paths[i].count <= paths[f].count) {
                            // prefer GPU direct RDMA
                            int gdr;
                            SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
                            if(paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
                                f     = i;
                                f_gdr = gdr;
                            }
                        }
                    }
                    // Private, small budget so the PCI-order attempt does not eat into *time.
                    int t = 1 << 10;
                    SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
                    // -1 signals that a graph with maxChannels channels was saved: propagate the stop.
                    if(t == -1)
                        *time = -1;
                }

                // Then try the most local GPUs
                // i.e. those with the highest NET->GPU path bandwidth and, among them, the fewest hops.
                float maxBw                    = 0;
                int minHops                    = 0xfffffff;
                struct scclTopoLinkList* paths = net->paths[GPU];
                for(int g = 0; g < system->nodes[GPU].count; g++) {
                    if(paths[g].bw > maxBw) {
                        maxBw   = paths[g].bw;
                        minHops = paths[g].count;
                    } else if(paths[g].bw == maxBw && paths[g].count < minHops) {
                        minHops = paths[g].count;
                    }
                }
                if(maxBw >= bw) {
                    // In the first loop, avoid using GPUs in both directions between channels (one channel
                    // sending from that GPU and one channel receiving to that GPU), since that usually leads
                    // to lower BW.
                    for(int tryGpuBidir = 0; tryGpuBidir < 2; tryGpuBidir++) {
                        for(int g = 0; g < system->nodes[GPU].count; g++) {
                            if(paths[g].bw == maxBw && paths[g].count == minHops) {
                                gpu         = system->nodes[GPU].nodes + g;
                                // A GPU counts as "used" when gpuPciBw() reports no PCI bandwidth left (<= 0).
                                int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
                                if(tryGpuBidir == gpuUsed) {
                                    SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
                                }
                            }
                        }
                    }
                }
            }
        }

        // Give the bandwidth back to every NET node sharing this NIC's asic and port.
        for(int i = 0; i < system->nodes[NET].count; i++) {
            if((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) {
                system->nodes[NET].nodes[i].net.bw += bw;
            }
        }
    }
    free(nets);
    return scclSuccess;
}

/* Search Patterns
 *
 *     Intra-node
 * Ring            : GPU a -> GPU b -> .. -> GPU x -> GPU a
 * (=Split Tree Loop)
 * Tree            : GPU a -> GPU b -> .. -> GPU x
 * (=Split Tree)
 *
 *     Inter-node
 * Ring            : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
 * Tree            : NET n -> GPU a -> GPU b -> .. -> GPU x
 *                              `--> NET n (or m if crossNic)
 * Split Tree      : NET n -> GPU a -> GPU b -> .. -> GPU x
 *                                       `--> NET n (or m if crossNic)
 * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
 *                                       `--> NET n (or m if crossNic)
 */
// Derive the two search milestones for `pattern`:
//  - *backToNet       : step at which the channel goes back to a NIC, -1 if never.
//  - *backToFirstRank : step at which the channel loops back to its first GPU, -1 if never.
// The NIC variant applies when the system has NET nodes and the local GPU count
// differs from system->nRanks; otherwise the search is intra-node only.
scclResult_t scclTopoSearchParams(struct scclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
    const int lastStep  = system->nodes[GPU].count - 1;
    const int isRing    = (pattern == SCCL_TOPO_PATTERN_RING);
    const int interNode = system->nodes[NET].count && system->nodes[GPU].count != system->nRanks;

    if(!interNode) {
        // Intra-node: only rings close the loop, on the last GPU.
        *backToNet       = -1;
        *backToFirstRank = isRing ? lastStep : -1;
        return scclSuccess;
    }

    // Inter-node: rings exit on the last GPU, split trees on the second, everything else on the first.
    if(isRing)
        *backToNet = lastStep;
    else
        *backToNet = (pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;
    *backToFirstRank = -1;
    return scclSuccess;
}

// Search for one more channel (index graph->nChannels) on top of those already in `graph`.
//  - saveGraph : best graph found so far, updated by the recursion.
//  - time      : remaining search budget, shared by the whole recursion.
// With NET nodes and a GPU count different from system->nRanks the channel starts from
// a NIC; otherwise the search is intra-node only and starts directly from a GPU.
// Errors raised anywhere in the recursion are propagated to the caller.
scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time) {
    int backToNet, backToFirstRank;
    SCCLCHECK(scclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
    if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
        // Start from NET
        // The result must be checked like every other call: dropping it would hide
        // internal errors (e.g. an unknown NET id) from the caller.
        SCCLCHECK(scclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time));
    } else {
        // Intra-node only.
        if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
            SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels));
            return scclSuccess;
        } else if(graph->nChannels == 0) {
            // Try PCI order first
            SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
        } else {
            // Also try to replay previous channel
            // FORCED_ORDER_REPLAY makes every following step reuse the previous channel's GPU
            // order (as the NET path does); without it only the first GPU would be replayed and
            // sameChannels could not be honoured.
            int g;
            SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
            SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g));
        }
        if(graph->sameChannels == 0 || graph->nChannels == 0) {
            // Finally, try all other possibilities unless we are forced to use the same channels
            for(int g = 0; g < system->nodes[GPU].count; g++) {
                SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
            }
        }
    }
    return scclSuccess;
}

/************************************/
/* User defined graph from XML file */
/************************************/

// Name <-> PATH_* lookup table for the "typeintra" / "typeinter" attributes of
// a graph XML node. It is passed to kvConvertToInt() when importing a graph and
// to kvConvertToStr() when exporting one.
// The trailing {NULL, 0} entry is presumably the end-of-table sentinel expected
// by those helpers.
struct kvDict kvDictLinkType[] = {{"LOC", PATH_LOC},
                                  {"NVL", PATH_NVL},
                                  {"NVB", PATH_NVB},
                                  {"PIX", PATH_PIX},
                                  {"PXB", PATH_PXB},
                                  {"PXN", PATH_PXN},
                                  {"PHB", PATH_PHB},
                                  {"SYS", PATH_SYS},
                                  {NULL, 0}};

/**
 * Fill channel `c` of `graph` from a <channel> XML node.
 *
 * Every child of the channel node must carry an integer "dev" attribute.
 * "net" children are stored, in document order, in the two inter-node slots of
 * the channel; "gpu" children are translated from device number to rank and
 * stored, in document order, in the channel's intra-node slots. Children with
 * any other name are ignored.
 *
 * The XML comes from a user-provided file, so the number of entries is checked
 * against the per-channel capacity (2 nets, one slot per local GPU) before each
 * write.
 *
 * @param xmlChannel The <channel> node to read.
 * @param c          Channel index inside graph->inter / graph->intra.
 * @param system     Topology used to map GPU device numbers to ranks.
 * @param graph      Graph receiving the channel.
 * @return scclSuccess, scclSystemError when the channel lists an unknown GPU
 *         device or too many entries, or the error from xmlGetAttrInt.
 */
scclResult_t scclTopoGetChannelFromXml(struct scclXmlNode* xmlChannel, int c, struct scclTopoSystem* system, struct scclTopoGraph* graph) {
    int ngpus  = system->nodes[GPU].count;
    int* inter = graph->inter + 2 * c;
    int* intra = graph->intra + ngpus * c;
    int netCount = 0, gpuCount = 0; // number of net / gpu entries stored so far
    for(int s = 0; s < xmlChannel->nSubs; s++) {
        struct scclXmlNode* sub = xmlChannel->subs[s];
        int dev;
        SCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
        if(strcmp(sub->name, "net") == 0) {
            if(netCount >= 2) {
                WARN("XML Import Channel : more than 2 net entries in channel %d.", c);
                return scclSystemError;
            }
            inter[netCount++] = dev;
        } else if(strcmp(sub->name, "gpu") == 0) {
            if(gpuCount >= ngpus) {
                WARN("XML Import Channel : more than %d gpu entries in channel %d.", ngpus, c);
                return scclSystemError;
            }
            // Translate the device number into the rank of the matching local GPU.
            int rank = -1;
            for(int i = 0; i < ngpus; i++) {
                if(system->nodes[GPU].nodes[i].gpu.dev == dev)
                    rank = system->nodes[GPU].nodes[i].gpu.rank;
            }
            if(rank == -1) {
                WARN("XML Import Channel : dev %d not found.", dev);
                return scclSystemError;
            }
            intra[gpuCount++] = rank;
        }
    }
    return scclSuccess;
}
/**
 * Load one <graph> XML node into `graph` if it describes the same graph id.
 *
 * The node is skipped (scclSuccess, nothing written) when its "id" differs from
 * graph->id, or when it requires cross-NIC ("crossnic" == 1) while the CROSS_NIC
 * parameter is 0. Otherwise the graph attributes are copied and every child is
 * imported as a channel.
 *
 * @param xmlGraph  The <graph> node.
 * @param system    Topology used to resolve GPU devices.
 * @param graph     Graph to fill; graph->id selects the matching XML node.
 * @param nChannels Written with the number of channel children on a successful
 *                  import; left untouched when the node is skipped.
 * @return scclSuccess, scclSystemError when the node holds more channels than
 *         MAXCHANNELS, or the first error from the XML helpers.
 */
scclResult_t scclTopoGetGraphFromXmlSub(struct scclXmlNode* xmlGraph, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
    int id;
    SCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
    if(graph->id != id)
        return scclSuccess;

    int crossNic;
    SCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
    if(scclParamCrossNic() == 0 && crossNic == 1)
        return scclSuccess;

    // The channel index is used to address graph->inter / graph->intra, so a
    // file listing more channels than the library supports must be rejected
    // before anything is written.
    // NOTE(review): assumes those arrays are sized for MAXCHANNELS channels, as
    // graph->intraNets is in scclTopoCompute — confirm against the struct.
    if(xmlGraph->nSubs > MAXCHANNELS) {
        WARN("XML Import Graph : %d channels exceed the maximum of %d.", xmlGraph->nSubs, MAXCHANNELS);
        return scclSystemError;
    }
    graph->crossNic = crossNic;

    SCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
    SCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
    SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
    SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
    // "latencyinter" is optional: fall back to 0 when it cannot be read.
    if(xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != scclSuccess)
        graph->latencyInter = 0.0;
    const char* str;
    SCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
    SCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
    SCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
    SCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType));
    SCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels));
    for(int s = 0; s < xmlGraph->nSubs; s++) {
        SCCLCHECK(scclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
    }
    *nChannels = xmlGraph->nSubs;
    return scclSuccess;
}
/**
 * Import the graph matching graph->id from a <graphs> XML node.
 *
 * Each child is offered to scclTopoGetGraphFromXmlSub, which only acts on the
 * children whose id matches.
 *
 * @param xmlGraphs Root node holding the <graph> children.
 * @param system    Topology used to resolve GPU devices.
 * @param graph     Graph to fill.
 * @param nChannels Receives the number of channels loaded; 0 when no child
 *                  matched, so the caller never reads an unwritten value.
 * @return scclSuccess or the first error from a child import.
 */
scclResult_t scclTopoGetGraphFromXml(struct scclXmlNode* xmlGraphs, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
    // A skipped child leaves *nChannels untouched, so give it a defined value first.
    *nChannels = 0;
    for(int s = 0; s < xmlGraphs->nSubs; s++) {
        SCCLCHECK(scclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels));
    }
    return scclSuccess;
}

/* And the reverse : graph->xml */
/**
 * Append a <channel> node describing channel `c` of `graph` under `parent`.
 *
 * Children are emitted in this order: an opening <net> (first inter-node
 * device) when the system has NET nodes, one <gpu> per local GPU holding the
 * device number of the rank stored for that position, and a closing <net>
 * (second inter-node device) when the system has NET nodes.
 *
 * @return scclSuccess, scclInternalError when a stored rank matches no local
 *         GPU, or the first error from the XML helpers.
 */
scclResult_t scclTopoGetXmlFromChannel(struct scclTopoGraph* graph, int c, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
    const int gpuCount = system->nodes[GPU].count;
    const bool hasNet  = system->nodes[NET].count != 0;
    int* netPair       = graph->inter + 2 * c;
    int* rankOrder     = graph->intra + gpuCount * c;

    struct scclXmlNode* channelNode;
    SCCLCHECK(xmlAddNode(xml, parent, "channel", &channelNode));

    struct scclXmlNode* child;
    if(hasNet) {
        SCCLCHECK(xmlAddNode(xml, channelNode, "net", &child));
        SCCLCHECK(xmlSetAttrInt(child, "dev", netPair[0]));
    }

    for(int pos = 0; pos < gpuCount; pos++) {
        SCCLCHECK(xmlAddNode(xml, channelNode, "gpu", &child));

        // Map the rank stored at this position back to its device number.
        int dev = -1;
        for(int idx = 0; idx < gpuCount; idx++) {
            struct scclTopoNode* candidate = system->nodes[GPU].nodes + idx;
            if(candidate->gpu.rank == rankOrder[pos])
                dev = candidate->gpu.dev;
        }
        if(dev == -1) {
            WARN("XML Export Channel : rank %d not found.", rankOrder[pos]);
            return scclInternalError;
        }
        SCCLCHECK(xmlSetAttrInt(child, "dev", dev));
    }

    if(hasNet) {
        SCCLCHECK(xmlAddNode(xml, channelNode, "net", &child));
        SCCLCHECK(xmlSetAttrInt(child, "dev", netPair[1]));
    }
    return scclSuccess;
}
/**
 * Append a <graph> node for `graph` under `parent`.
 *
 * Writes the graph attributes (id, pattern, crossnic, nchannels, speedintra,
 * speedinter, latencyinter, typeintra, typeinter, samechannels) and then one
 * <channel> child per channel of the graph.
 *
 * @return scclSuccess or the first error from a helper.
 */
scclResult_t scclTopoGetXmlFromGraph(struct scclTopoGraph* graph, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
    struct scclXmlNode* graphNode;
    SCCLCHECK(xmlAddNode(xml, parent, "graph", &graphNode));

    // Integer and float attributes.
    SCCLCHECK(xmlSetAttrInt(graphNode, "id", graph->id));
    SCCLCHECK(xmlSetAttrInt(graphNode, "pattern", graph->pattern));
    SCCLCHECK(xmlSetAttrInt(graphNode, "crossnic", graph->crossNic));
    SCCLCHECK(xmlSetAttrInt(graphNode, "nchannels", graph->nChannels));
    SCCLCHECK(xmlSetAttrFloat(graphNode, "speedintra", graph->bwIntra));
    SCCLCHECK(xmlSetAttrFloat(graphNode, "speedinter", graph->bwInter));
    SCCLCHECK(xmlSetAttrFloat(graphNode, "latencyinter", graph->latencyInter));

    // Path types are stored by name, looked up in kvDictLinkType.
    const char* typeName;
    SCCLCHECK(kvConvertToStr(graph->typeIntra, &typeName, kvDictLinkType));
    SCCLCHECK(xmlSetAttr(graphNode, "typeintra", typeName));
    SCCLCHECK(kvConvertToStr(graph->typeInter, &typeName, kvDictLinkType));
    SCCLCHECK(xmlSetAttr(graphNode, "typeinter", typeName));

    SCCLCHECK(xmlSetAttrInt(graphNode, "samechannels", graph->sameChannels));

    int channel = 0;
    while(channel < graph->nChannels) {
        SCCLCHECK(scclTopoGetXmlFromChannel(graph, channel, system, xml, graphNode));
        channel++;
    }
    return scclSuccess;
}
/**
 * Build a complete <graphs> XML document from `ngraphs` graphs.
 *
 * Resets xml->maxIndex, creates the root <graphs> node tagged with
 * SCCL_GRAPH_XML_VERSION, and appends one <graph> child per entry of `graphs`.
 *
 * @return scclSuccess or the first error from a helper.
 */
scclResult_t scclTopoGetXmlFromGraphs(int ngraphs, struct scclTopoGraph** graphs, struct scclTopoSystem* system, struct scclXml* xml) {
    // Start from an empty document.
    xml->maxIndex = 0;

    struct scclXmlNode* root;
    SCCLCHECK(xmlAddNode(xml, NULL, "graphs", &root));
    SCCLCHECK(xmlSetAttrInt(root, "version", SCCL_GRAPH_XML_VERSION));

    int idx = 0;
    while(idx < ngraphs) {
        SCCLCHECK(scclTopoGetXmlFromGraph(graphs[idx], system, xml, root));
        idx++;
    }
    return scclSuccess;
}

// Candidate bandwidth values walked by scclTopoCompute, listed from highest to
// lowest. speedArrayIntra is used when the system has no NET nodes,
// speedArrayInter otherwise. NSPEEDSINTRA / NSPEEDSINTER are the element counts.
// NOTE(review): units are presumably GB/s per channel — confirm.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// HIP build: the same table is used for intra-node and inter-node searches.
float speedArrayIntra[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
float speedArrayInter[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))
#else
float speedArrayIntra[] = {40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0};
float speedArrayInter[] = {48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))

// SM90 variants, defined only in the non-HIP build. scclTopoCompute does not
// reference them.
float sm90SpeedArrayIntra[] = {60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0};
float sm90SpeedArrayInter[] = {48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra) / sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter) / sizeof(float))
#endif

// Runtime parameters read by scclTopoCompute. The trailing 0 is presumably the
// default value used when the parameter is not set — confirm against RCCL_PARAM.
// ModelMatchingDisable: when non-zero, the built-in model matching (chordal
// ring, Rome 4P2H, 1H16P, 4H4P) is skipped.
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
// NChannels: requested channel count. Existing channels are replicated up to
// this value when it is larger than the count found and at most MAXCHANNELS / 2.
RCCL_PARAM(NChannels, "NCHANNELS", 0);
/**
 * Compute the channels of `graph` for the given topology.
 *
 * Sources are tried in this order, and the function returns as soon as one of
 * them yields at least one channel:
 *   1. an XML graph file named by the SCCL_GRAPH_FILE environment variable;
 *   2. a user-supplied description in RCCL_TREES or SCCL_RINGS;
 *   3. built-in model matching (chordal ring, Rome 4P2H, 1H16P, 4H4P), unless
 *      disabled by the ModelMatchingDisable parameter or graph->collNet is set;
 *   4. a search (scclTopoSearchRec) that is restarted with progressively
 *      relaxed constraints: different channels, wider intra/inter path types,
 *      cross-NIC, then lower bandwidth (pass 1), followed by an attempt to
 *      raise bwIntra for non-ring patterns (pass 2).
 * If the search finds nothing, a single channel in plain GPU order is used
 * (except for collNet and NVLS graphs). Finally, fast graphs have their
 * channels duplicated, and the NChannels parameter can replicate channels
 * further.
 *
 * @param system Topology; system->maxBw and system->totalBw bound the search,
 *               and system->type / system->treeDefined may be updated.
 * @param graph  In: id, pattern, collNet, minChannels, maxChannels.
 *               Out: channels, bandwidths, path types and related fields.
 * @return scclSuccess or the first error reported by a helper.
 */
scclResult_t scclTopoCompute(scclTopoSystem* system, struct scclTopoGraph* graph) {
    int ngpus       = system->nodes[GPU].count;
    graph->crossNic = scclParamCrossNic();
    // Cross-NIC is only considered with more than one NET node and for the
    // ring, balanced-tree and split-tree patterns.
    int crossNic    = (system->nodes[NET].count > 1) && graph->crossNic &&
                           (graph->pattern == SCCL_TOPO_PATTERN_RING || graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE ||
                            graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE)
                          ? 1
                          : 0;
    graph->bwIntra = graph->bwInter = 0;
    graph->latencyInter             = 0;
    // A CROSS_NIC value of 2 starts the search without cross-NIC; the local
    // crossNic flag above still allows it to be tried later.
    if(graph->crossNic == 2)
        graph->crossNic = 0;
    graph->typeIntra      = ngpus == 1 ? PATH_LOC : PATH_NVL;
    graph->typeInter      = PATH_PIX;
    graph->nChannels      = 0;
    graph->nIntraChannels = 0;
    memset(graph->intraNets, 0, MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2 * sizeof(int));
    // NVLS never starts with identical channels; every other pattern does.
    int trySameChannels = graph->pattern == SCCL_TOPO_PATTERN_NVLS ? 0 : 1;
    graph->sameChannels = trySameChannels;

    // Source 1: graph loaded from an XML file.
    char* str = getenv("SCCL_GRAPH_FILE");
    if(str) {
        INFO(SCCL_ENV, "SCCL_GRAPH_FILE set by environment to %s", str);
        struct scclXml* xml;
        SCCLCHECK(scclCalloc(&xml, 1));
        // NOTE(review): if one of the two SCCLCHECK calls below returns early,
        // xml is presumably never freed — confirm whether that matters.
        SCCLCHECK(scclTopoGetXmlGraphFromFile(str, xml));
        int nChannels;
        SCCLCHECK(scclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
        INFO(SCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
        free(xml);
        if(graph->nChannels > 0)
            return scclSuccess;
    }

    // Source 2: rings / trees given through the environment.
    str            = getenv("SCCL_RINGS");
    char* strTrees = getenv("RCCL_TREES");

    if(str || strTrees) {
        // user supplied topo
        if(strTrees) {
            // RCCL_TREES takes precedence over SCCL_RINGS when both are set.
            SCCLCHECK(parseGraphLight(strTrees, system, graph, NULL));
            system->treeDefined = true;
        } else {
            SCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
            int arch, vendor, model;
            SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
            // User rings on an AMD Rome host flag the system as Rome 4P2H.
            if(graph->nChannels && arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
                system->type |= RCCL_TOPO_4P2H_ROME;
            }
        }
    } else if(!rcclParamModelMatchingDisable() && !graph->collNet) {
        // Source 3: built-in models, tried in order until one produces channels.
        // try to match 8P6L
        SCCLCHECK(parseChordalRing(system, graph));
        if(graph->nChannels)
            return scclSuccess;
        // try to match Rome 4P2H
        SCCLCHECK(parseRome4P2H(system, graph));
        if(graph->nChannels)
            return scclSuccess;
        // try to match 1H16P
        SCCLCHECK(parse1H16P(system, graph));
        if(graph->nChannels)
            return scclSuccess;
        // try to match 4H4P
        SCCLCHECK(parse4H4P(system, graph));
    }
    if(graph->nChannels)
        return scclSuccess;

    // Source 4: search. The remaining code sets up and drives the search.
    if((graph->pattern == SCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) {
        // limit single node max channels when searching ring graph on Rome
        graph->maxChannels = 2;
    }
    // With a single GPU every non-ring pattern is searched as a tree.
    if(ngpus == 1)
        if(graph->pattern != SCCL_TOPO_PATTERN_RING)
            graph->pattern = SCCL_TOPO_PATTERN_TREE;

    int ccMin;
    SCCLCHECK(scclTopoGetCompCap(system, &ccMin, NULL));
    // NVLS needs at least one NVS node and a minimum compute capability of 90;
    // otherwise the graph is left with zero channels.
    if(graph->pattern == SCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90))
        return scclSuccess;

    // NOTE(review): this repeats the single-GPU pattern adjustment made a few
    // lines above.
    if(ngpus == 1)
        if(graph->pattern != SCCL_TOPO_PATTERN_RING)
            graph->pattern = SCCL_TOPO_PATTERN_TREE;

    if(system->nodes[NET].count == 0 && graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
        // Force intra-node NVLS algorithm to pull evenly from all GPUs.
        graph->minChannels = graph->maxChannels = system->nodes[GPU].count;
    }

    // tmpGraph carries the constraints of the current attempt; `graph` is
    // passed to the search as the graph to save results into.
    struct scclTopoGraph tmpGraph;
    memcpy(&tmpGraph, graph, sizeof(struct scclTopoGraph));

    // First try crossnic, then decrease bw and finally increase bwIntra.
    int nspeeds       = 0;
    float* speedArray = NULL;
    if(system->nodes[NET].count == 0) {
        nspeeds    = NSPEEDSINTRA;
        speedArray = speedArrayIntra;
    } else {
        nspeeds    = NSPEEDSINTER;
        speedArray = speedArrayInter;
    }
    int pass       = 1;
    int speedIndex = 0;
    float maxBw    = system->maxBw;
    float totalBw  = system->totalBw;
    // NOTE(review): with ngpus == 1 the divisor (ngpus - 1) is 0, so totalBw
    // becomes inf (or NaN when it was 0); presumably this is meant to disable
    // the totalBw limit below — confirm.
    if(ngpus == 1 || graph->pattern != SCCL_TOPO_PATTERN_RING)
        totalBw *= ngpus * 1.0 / (ngpus - 1);
    // Start at the highest speed that fits both the per-channel maximum and
    // the total bandwidth for the minimum number of channels.
    while((speedArray[speedIndex] > maxBw || speedArray[speedIndex] * graph->minChannels > totalBw) && speedIndex < nspeeds - 1)
        speedIndex++;
    tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
    int64_t globalTimeout               = SCCL_SEARCH_GLOBAL_TIMEOUT;

search:
    // One search attempt with the constraints currently held in tmpGraph. The
    // per-attempt budget depends on sameChannels and on the pattern.
    int time           = tmpGraph.sameChannels                        ? SCCL_SEARCH_TIMEOUT_SAMECHANNELS
                         : tmpGraph.pattern == SCCL_TOPO_PATTERN_TREE ? SCCL_SEARCH_TIMEOUT_TREE
                                                                      : SCCL_SEARCH_TIMEOUT;
    tmpGraph.nChannels = 0;
    globalTimeout -= time;

    SCCLCHECK(scclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
  printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
      printf("%d ", graph->intra[c*ngpus+g]);
    }
    printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
    printf("\n");
  }
#endif
    // Optimal solution, stop here
    if(time == -1)
        goto done;
    // Also stop once the saved solution already uses the whole total bandwidth.
    if(graph->nChannels * graph->bwInter >= system->totalBw)
        goto done;

    if(pass == 1) {
        // First pass, we don't have a solution yet ; try other options

        // Try having different channels
        if(tmpGraph.sameChannels == 1) {
            tmpGraph.sameChannels = 0;
            goto search;
        }
        tmpGraph.sameChannels = trySameChannels;

        // Give back the unused part of the attempt budget; stop relaxing once
        // the global budget is exhausted and some solution exists.
        if(time != -1)
            globalTimeout += time;
        else
            globalTimeout = SCCL_SEARCH_GLOBAL_TIMEOUT;
        if(globalTimeout < 0 && graph->nChannels)
            goto done;

        tmpGraph.pattern = graph->pattern;

        // Relax the intra-node path type one step at a time, up to typeInter
        // when NET nodes exist and PATH_SYS otherwise.
        int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
        if(tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
            tmpGraph.typeIntra += 1;
            goto search;
        }
        tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;

        // Then relax the inter-node path type, restarting from the tightest
        // intra-node type.
        if(system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS &&
           (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
            tmpGraph.typeInter += 1;
            goto search;
        }
        tmpGraph.typeInter = PATH_PIX;

        if(crossNic && tmpGraph.crossNic == 0) {
            // Try again with crossNic if permitted
            tmpGraph.crossNic = crossNic;
            goto search;
        }
        tmpGraph.crossNic = 0;

        // Decrease bw until we find a solution
        // Once a solution exists, only speeds above 49% of its bwInter are tried.
        if((speedIndex < nspeeds - 1) && (graph->nChannels == 0 || (speedArray[speedIndex + 1] / graph->bwInter > .49))) {
            tmpGraph.bwInter = tmpGraph.bwIntra = speedArray[++speedIndex];
            goto search;
        }
        // Nothing left to relax: reset the speed to the highest one within maxBw.
        speedIndex = 0;
        while(speedArray[speedIndex] > maxBw && speedIndex < nspeeds - 1)
            speedIndex++;
        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
    }

done:
    // We have a solution. Start from that solution and move to pass 2.
    if(pass == 1) {
        time = -1;
        memcpy(&tmpGraph, graph, sizeof(tmpGraph));
        // Position speedIndex on the first table entry not above the solution's bwInter.
        speedIndex = 0;
        while(speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds - 1)
            speedIndex++;
        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[speedIndex];
        tmpGraph.minChannels                = graph->nChannels;
        pass                                = 2;
    }

    // 3. See if we can increase bwIntra for trees (2 nodes or collnet)
    if(pass == 2) {
        // Keep stepping bwIntra up while the last attempt did not time out,
        // the saved solution reached the bwIntra just tried, and bwIntra is
        // still below twice bwInter.
        if(time != 0 && graph->pattern != SCCL_TOPO_PATTERN_RING && tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter * 2 &&
           speedIndex > 0) {
            tmpGraph.bwIntra = speedArray[--speedIndex];
            goto search;
        }
        time = -1;
        memcpy(&tmpGraph, graph, sizeof(tmpGraph));
    }

    // Fallback: one channel listing the GPUs in topology order, with minimal
    // bandwidth and the loosest path types.
    if(graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
        WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
        for(int i = 0; i < ngpus; i++)
            graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
        graph->inter[0] = graph->inter[1] = 0;
        graph->bwIntra = graph->bwInter = 0.1;
        graph->typeIntra = graph->typeInter = PATH_SYS;
        graph->nChannels                    = 1;
    }

    // Channel duplication is skipped when there are no channels, for NVLS, for
    // bwIntra below 25, and when ccMin > 80 with bwIntra below 50 and more
    // than 4 channels.
    if(graph->nChannels == 0)
        return scclSuccess;
    if(graph->pattern == SCCL_TOPO_PATTERN_NVLS)
        return scclSuccess;
    if(graph->bwIntra < 25.0)
        return scclSuccess;
    if(ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4)
        return scclSuccess;

    // Duplicate the channels (capped at maxChannels) and divide the bandwidth
    // accordingly.
    int dupChannels = std::min(graph->nChannels * 2, graph->maxChannels);
    memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, (dupChannels - graph->nChannels) * ngpus * sizeof(int));
    memcpy(graph->inter + graph->nChannels * 2, graph->inter, (dupChannels - graph->nChannels) * 2 * sizeof(int));
    graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
    graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
    graph->nChannels = dupChannels;

    // Optional extension up to the channel count requested by the NChannels
    // parameter.
    int nc = rcclParamNChannels();
    if(graph->nChannels > 0 && nc > 0 && nc <= MAXCHANNELS / 2 && nc > graph->nChannels) {
        int nChannels = nc - graph->nChannels;
        int nnets     = system->nodes[NET].count;
        if(nnets <= 2) {
            // Few NICs: every extra channel is a copy of channel 0.
            for(int i = 0; i < nChannels; ++i) {
                memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
                memcpy(graph->inter + graph->nChannels * 2, graph->inter, 2 * sizeof(int));
                memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
                graph->nChannels++;
            }
        } else {
            // More than two NICs: extra channels reuse channel 0's GPU order
            // but take the least-used NICs, tracked with a usage counter that
            // is re-sorted after each pick.
            typedef struct {
                int id;
                int used;
            } Net;
            Net nets[nnets];
            auto sortFunc = [](const void* a, const void* b) -> int { return ((Net*)a)->used - ((Net*)b)->used; };
            memset(nets, 0, nnets * sizeof(Net));
            for(int i = 0; i < nnets; ++i) {
                nets[i].id = system->nodes[NET].nodes[i].id;
            }
            // Count how often each NIC already appears in the existing channels.
            for(int i = 0; i < graph->nChannels; ++i) {
                for(int j = 0; j < nnets; ++j) {
                    if(nets[j].id == *(graph->inter + i * 2) || nets[j].id == *(graph->inter + i * 2 + 1)) {
                        nets[j].used++;
                    }
                }
            }
            for(int i = 0; i < nChannels; ++i) {
                memcpy(graph->intra + graph->nChannels * ngpus, graph->intra, ngpus * sizeof(int));
                qsort(nets, nnets, sizeof(Net), sortFunc);
                *(graph->inter + graph->nChannels * 2) = nets[0].id;
                nets[0].used++;
                qsort(nets, nnets, sizeof(Net), sortFunc);
                if(graph->crossNic == 0 || graph->crossNic == 2) {
                    *(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
                    nets[0].used++;
                    qsort(nets, nnets, sizeof(Net), sortFunc);
                } else {
                    nets[0].used++;
                    qsort(nets, nnets, sizeof(Net), sortFunc);
                    *(graph->inter + graph->nChannels * 2 + 1) = nets[0].id;
                }
                nets[0].used++;
                memcpy(graph->intraNets + graph->nChannels * ngpus * 2, graph->intraNets, 2 * ngpus * sizeof(int));
                graph->nChannels++;
            }
        }
        // NOTE(review): graph->nChannels was incremented in the loops above, so
        // this divisor no longer uses the pre-extension channel count — confirm
        // that this is intended.
        graph->bwIntra /= DIVUP(nc, graph->nChannels);
        graph->bwInter /= DIVUP(nc, graph->nChannels);
    }
    return scclSuccess;
}

/**
 * Log a graph: one summary line followed by one line per channel.
 *
 * A channel line lists, in order, the first inter-node NET device (only when
 * the system has NET nodes, the local GPUs do not cover every rank and the
 * graph has no intra channels), then every GPU rank of the channel, each
 * optionally surrounded by the NET indices recorded in graph->intraNets, and
 * finally the second inter-node NET device under the same condition as the
 * first.
 *
 * Each line is assembled in a fixed-size buffer. Every write is bounded, so a
 * channel that does not fit is truncated instead of overflowing the buffer.
 *
 * @param system Topology the graph belongs to.
 * @param graph  Graph to print.
 * @return Always scclSuccess.
 */
scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
    INFO(SCCL_GRAPH,
         "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d",
         graph->pattern,
         graph->crossNic,
         graph->nChannels,
         graph->bwIntra,
         graph->bwInter,
         topoPathTypeStr[graph->typeIntra],
         topoPathTypeStr[graph->typeInter],
         graph->sameChannels);
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    // The inter-node NET endpoints are only shown for multi-node graphs that
    // do not use intra channels.
    bool showEdgeNets = nnets > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels;

    char line[1024];
    const size_t capacity = sizeof(line);
    for(int c = 0; c < graph->nChannels; c++) {
        size_t used = 0;
        // Account for one snprintf result. snprintf returns the length it
        // wanted to write, so clamp to the last byte once the buffer is full;
        // later writes then only rewrite the terminating NUL.
        auto advance = [&used, capacity](int written) {
            if(written < 0)
                return;
            used += (size_t)written;
            if(used >= capacity)
                used = capacity - 1;
        };

        advance(snprintf(line + used, capacity - used, "%2d :", c));
        if(showEdgeNets) {
            advance(snprintf(line + used, capacity - used, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c]));
        }
        for(int i = 0; i < ngpus; i++) {
            // intraNets entries are stored offset by 'N'; values outside
            // [0, nnets) mean no NET is attached on that side of the GPU.
            int n = graph->intraNets[(ngpus * c + i) * 2] - 'N';
            if(n >= 0 && n < nnets) {
                advance(snprintf(line + used, capacity - used, " NET/%d", n));
            }
            advance(snprintf(line + used, capacity - used, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus * c + i]));
            n = graph->intraNets[(ngpus * c + i) * 2 + 1] - 'N';
            if(n >= 0 && n < nnets) {
                advance(snprintf(line + used, capacity - used, " NET/%d", n));
            }
        }
        if(showEdgeNets) {
            advance(snprintf(line + used, capacity - used, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c + 1]));
        }
        INFO(SCCL_GRAPH, "%s", line);
    }
    return scclSuccess;
}

/**
 * Dump the given graphs to the XML file named by the SCCL_GRAPH_DUMP_FILE
 * environment variable. Does nothing when the variable is not set.
 *
 * The temporary XML document is released on every path, including when the
 * conversion or the file write fails.
 *
 * @param system  Topology the graphs belong to.
 * @param ngraphs Number of entries in `graphs`.
 * @param graphs  Graphs to export.
 * @return scclSuccess, or the error from allocating, building or writing the
 *         XML document.
 */
scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs) {
    char* str = getenv("SCCL_GRAPH_DUMP_FILE");
    if(str == NULL)
        return scclSuccess;

    INFO(SCCL_ENV, "SCCL_GRAPH_DUMP_FILE set by environment to %s", str);
    struct scclXml* xml;
    SCCLCHECK(scclCalloc(&xml, 1));

    // Errors are collected rather than returned immediately so that xml is
    // always freed.
    scclResult_t status = scclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml);
    if(status == scclSuccess)
        status = scclTopoDumpXmlToFile(str, xml);
    free(xml);
    return status;
}

#include "comm.h"
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
/**
 * Find the NET device of the NVLS channel headed by this rank.
 *
 * Scans the channels of `graph` for the one whose first intra-node entry is
 * comm->rank and returns that channel's first inter-node device.
 *
 * @param comm  Communicator providing the rank and the topology.
 * @param graph NVLS graph to scan.
 * @param dev   Receives the NET device on success.
 * @return scclSuccess, or scclInternalError when no channel is headed by
 *         comm->rank.
 */
scclResult_t getNvlsNetDev(struct scclComm* comm, struct scclTopoGraph* graph, int* dev) {
    const int ranksPerChannel = comm->topo->nodes[GPU].count;
    int chan                  = 0;
    while(chan < graph->nChannels) {
        const int head = graph->intra[chan * ranksPerChannel];
        if(head == comm->rank) {
            *dev = graph->inter[chan * 2];
            return scclSuccess;
        }
        chan++;
    }
    WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
    return scclInternalError;
}

// PXN policy for P2P, read by scclTopoGetNetDev (which forces it to 0 when
// scclPxnDisable(comm) == 1):
//   0: don't use PXN for P2P
//   1: use PXN if needed
//   2: use PXN as much as possible to maximize aggregation
SCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);

/**
 * Select the NET device, and the rank proxying for it, that `rank` uses on a
 * channel.
 *
 * With a graph: the device comes from the graph (the first or second
 * inter-node slot of the channel depending on whether `rank` heads it, or the
 * NVLS lookup for NVLS graphs) and the proxy rank from
 * scclTopoGetIntermediateRank.
 *
 * Without a graph (P2P): starts from the local NIC of `rank` with `rank` as its
 * own proxy, then may switch to the NIC preferred by the peer depending on the
 * CROSS_NIC and P2P_PXN_LEVEL parameters.
 *
 * @param comm      Communicator (topology, peer info).
 * @param rank      Rank the device is selected for.
 * @param graph     Graph to honor, or NULL for the P2P selection.
 * @param channelId Channel id; reduced modulo graph->nChannels when a graph is
 *                  given (assumes graph->nChannels > 0 — TODO confirm callers).
 * @param peerRank  Peer rank for the P2P selection; -1 is rejected there.
 * @param dev       Receives the NET device.
 * @param proxyRank Receives the proxy rank.
 * @return scclSuccess, scclInternalError (no graph and peerRank == -1),
 *         scclInvalidUsage (peer's NIC unavailable while cross-NIC is off), or
 *         an error from a helper.
 */
scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
    if(graph) {
        // Honor the net device in the graph
        const int chan      = channelId % graph->nChannels;
        const int localGpus = comm->topo->nodes[GPU].count;
        if(graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
            SCCLCHECK(getNvlsNetDev(comm, graph, dev));
        } else {
            // The channel head uses the first NET slot, everyone else the second.
            const int slot = (graph->intra[chan * localGpus] == rank) ? 0 : 1;
            *dev           = graph->inter[chan * 2 + slot];
        }
        SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
        return scclSuccess;
    }

    if(peerRank == -1)
        return scclInternalError;

    // Default: our own local NIC, proxied by ourselves.
    SCCLCHECK(scclTopoGetLocalNet(comm->topo, rank, channelId, dev));
    *proxyRank = rank;

    const int pxnLevel = scclPxnDisable(comm) == 1 ? 0 : scclParamP2pPxnLevel();

    // The peer's preferred device only matters when cross-NIC is off or PXN is on.
    if(scclParamCrossNic() != 0 && pxnLevel == 0)
        return scclSuccess;

    // Local NIC closest to the GPU with the peer's device number. When no local
    // GPU has that device number, keep the default.
    int peerLocalRank;
    if(scclTopoDevToRank(comm->topo, comm->peerInfo[peerRank].cudaDev, &peerLocalRank) != scclSuccess)
        return scclSuccess;
    int peerNet;
    SCCLCHECK(scclTopoGetLocalNet(comm->topo, peerLocalRank, channelId, &peerNet));

    if(scclParamCrossNic() == 0) {
        // Check that device exists on our node
        int netIndex;
        if(scclTopoIdToIndex(comm->topo, NET, peerNet, &netIndex) != scclSuccess) {
            WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, peerNet, rank);
            return scclInvalidUsage;
        }
        *dev = peerNet;
    }

    switch(pxnLevel) {
        case 1: {
            // Use the peer's NIC only if our own GPU reaches it within PATH_PXN.
            int gpuIndex, netIndex;
            SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &gpuIndex));
            SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, peerNet, &netIndex));
            struct scclTopoNode* self = comm->topo->nodes[GPU].nodes + gpuIndex;
            if(self->paths[NET][netIndex].type <= PATH_PXN) {
                *dev = peerNet;
                SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
            }
            break;
        }
        case 2: {
            // Check which local GPU corresponds to that NIC and see if we can use PXN.
            int netIndex, selfIndex, ownerIndex;
            SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, peerNet, &netIndex));
            SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &selfIndex));
            SCCLCHECK(scclTopoGetLocalGpu(comm->topo, peerNet, &ownerIndex));
            if(ownerIndex != -1) {
                struct scclTopoNode* owner = comm->topo->nodes[GPU].nodes + ownerIndex;
                const bool gpuPathOk       = owner->paths[GPU][selfIndex].type <= PATH_NVL;
                const bool nicPathOk       = owner->paths[NET][netIndex].type <= PATH_PXB;
                if(gpuPathOk && nicPathOk) {
                    *proxyRank = owner->gpu.rank;
                    *dev       = peerNet;
                }
            }
            break;
        }
        default:
            break;
    }
    return scclSuccess;
}

/**
 * Look up the intra-channel NET index attached to `rank` on a channel.
 *
 * *dev is set to -1 unless the graph has intra channels, `rank` appears in the
 * selected channel, and the NET index stored for it (graph->intraNets entry
 * minus 'N') lies in [0, number of NET nodes).
 *
 * @param system    Topology (GPU and NET counts).
 * @param rank      Rank to look up.
 * @param graph     Graph holding intra / intraNets; may be NULL.
 * @param channelId Channel id, reduced modulo graph->nIntraChannels.
 * @param type      Selects which of the two intraNets entries of the GPU is
 *                  read (used as offset 0 or 1 — presumably before/after the
 *                  GPU; confirm with callers).
 * @param dev       Receives the NET index or -1.
 * @return Always scclSuccess.
 */
scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev) {
    *dev = -1;
    if(!graph || !graph->nIntraChannels)
        return scclSuccess;

    const int gpuCount = system->nodes[GPU].count;
    const int netCount = system->nodes[NET].count;
    const int base     = gpuCount * (channelId % graph->nIntraChannels);

    int netIndex = -1;
    for(int pos = 0; pos < gpuCount; pos++) {
        if(graph->intra[base + pos] != rank)
            continue;
        netIndex = graph->intraNets[(base + pos) * 2 + type] - 'N';
        break;
    }

    if(netIndex >= 0 && netIndex < netCount)
        *dev = netIndex;
    return scclSuccess;
}

/**
 * Tell whether two GPUs are connected through LINK_NVL links, either directly
 * or through a chain of intermediate GPUs.
 *
 * A direct connection is a single-hop path from cudaDev1 to cudaDev2 whose link
 * type is LINK_NVL. When maxInter is non-zero, chains
 * cudaDev1 -> g1 -> ... -> gk -> cudaDev2 are also explored recursively, adding
 * one intermediate GPU per recursion level, up to
 * min(maxInter, MAX_XGMI_INTER_GPUS) intermediates.
 *
 * @param system   Topology.
 * @param cudaDev1 Device number of the first GPU.
 * @param cudaDev2 Device number of the second GPU.
 * @param isXGMI   Receives true when a connection was found, false otherwise.
 * @param maxInter Maximum number of intermediate GPUs; 0 checks direct links only.
 * @param nInter   Recursion state: number of intermediates already in `inter`.
 * @param inter    Recursion state: chain built so far, inter[0] being cudaDev1
 *                 and inter[1..nInter] the intermediates.
 * @return Always scclSuccess.
 */
scclResult_t scclTopoGetLinkType(struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int* inter) {
    int chain[MAX_XGMI_INTER_GPUS + 1];
    const int gpuCount = system->nodes[GPU].count;
    *isXGMI            = false;

    // check for direct XGMI connection
    for(int src = 0; src < gpuCount; src++) {
        struct scclTopoNode* srcNode = system->nodes[GPU].nodes + src;
        if(srcNode->gpu.dev != cudaDev1)
            continue;
        for(int dst = 0; dst < system->nodes[GPU].count; dst++) {
            struct scclTopoLinkList* path = srcNode->paths[GPU] + dst;
            if(path->count != 1)
                continue;
            struct scclTopoLink* hop = path->list[0];
            if(hop->remNode->gpu.dev != cudaDev2)
                continue;
            *isXGMI = (hop->type == LINK_NVL);
            if(*isXGMI)
                return scclSuccess;
        }
    }

    // try intermediate GPUs
    if(!maxInter)
        return scclSuccess;

    // Every hop of the chain built so far must itself be a direct connection.
    bool hopLinked, tailLinked, deeperLinked;
    for(int h = 0; h < nInter; h++) {
        scclTopoGetLinkType(system, inter[h], inter[h + 1], &hopLinked, 0);
        if(!hopLinked)
            return scclSuccess;
    }

    if(nInter > 0 && inter != nullptr) {
        // The chain is complete if its last GPU connects directly to the target.
        scclTopoGetLinkType(system, inter[nInter], cudaDev2, &tailLinked, 0);
        if(tailLinked) {
            *isXGMI = true;
            return scclSuccess;
        }
        memcpy(chain + 1, inter + 1, sizeof(int) * nInter);
    }
    chain[0] = cudaDev1;

    // add one more intermediate GPU recursively until reaching max depth
    const int depth = nInter + 1;
    if(depth + 2 > gpuCount || depth > MAX_XGMI_INTER_GPUS || depth > maxInter)
        return scclSuccess;

    for(int g = 0; g < gpuCount; g++) {
        const int candidate = system->nodes[GPU].nodes[g].gpu.dev;
        // skip the target and any GPU already in the chain
        if(candidate == cudaDev2)
            continue;
        bool alreadyUsed = false;
        for(int k = 0; k < depth && !alreadyUsed; k++)
            alreadyUsed = (candidate == chain[k]);
        if(alreadyUsed)
            continue;
        // check connectivity with intermediate GPUs
        chain[depth] = candidate;
        scclTopoGetLinkType(system, cudaDev1, cudaDev2, &deeperLinked, maxInter, depth, chain);
        if(deeperLinked) {
            *isXGMI = true;
            return scclSuccess;
        }
    }
    return scclSuccess;
}

} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl