topo.cc

#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <string.h>
#include <unistd.h>
#include <algorithm>

#include "topo.h"
#include "utils.h"
#include "cpuset.h"
#include "nvmlwrap.h"
// #include "net.h"
// #include "graph.h"
// #include "comm.h"
// #include "net.h"
// #include "coll_net.h"
// #include "cpuset.h"

namespace sccl {
namespace hardware {
namespace topology {
namespace topo {

// Display names for node types; indexed by a node's `type` field when the topology is printed.
const char* topoNodeTypeStr[] = {"GPU", "PCI", "NVS", "CPU", "NIC", "NET"};
// Display names for link types; indexed by a link's `type` field when the topology is printed.
// Empty strings fill the index slots that have no name in this table.
const char* topoLinkTypeStr[] = {"LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET"};
// Display names for path types.
// NOTE(review): presumably indexed by the PATH_* constants — confirm against topo.h.
const char* topoPathTypeStr[] = {"LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS"};

namespace topo_basic {

// Lookup table from a PCI "class" attribute string to a topology node type.
// Passed to kvConvertToInt() for the "class" attribute of <pci> elements and the
// "tclass" attribute of <xgmi> elements. The NULL-keyed entry carries the fallback value.
struct kvDict kvDictPciClass[] = {{"0x060400", PCI},
                                  {"0x068000", NVS},
                                  {"0x068001", CPU},
                                  {"0x03", GPU},
                                  {"0x02", NIC},
                                  {"0x120000", GPU},
                                  {"0x0b4000", GPU},
                                  {NULL, PCI /* Default fallback value */}};
// Lookup table from a PCI "link_speed" attribute string to a per-lane speed.
// Values are in units of 100 Mbps per lane; scclTopoAddPci multiplies by the link
// width and divides by 80.0 to obtain the link bandwidth.
struct kvDict kvDictPciGen[]   = {{"2.5 GT/s", 15},
                                  {"5 GT/s", 30},
                                  {"8 GT/s", 60},
                                  {"16 GT/s", 120},
                                  {"32 GT/s", 240}, /* Kernel 5.6 and earlier */
                                  {"2.5 GT/s PCIe", 15},
                                  {"5.0 GT/s PCIe", 30},
                                  {"8.0 GT/s PCIe", 60},
                                  {"16.0 GT/s PCIe", 120},
                                  {"32.0 GT/s PCIe", 240},
                                  {"64.0 GT/s PCIe", 480},
                                  {NULL, 60 /* Default fallback */}}; // x100 Mbps per lane

// Declares the TopoDumpFileRank parameter (key "TOPO_DUMP_FILE_RANK", default 0).
// NOTE(review): presumably selects the rank that writes the topology dump file — confirm at the use site.
SCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);

// Declares the IgnoreCpuAffinity parameter (key "IGNORE_CPU_AFFINITY", default 0).
// When non-zero, scclTopoGetCpuAffinity() returns the CPU node's affinity set without
// intersecting it with the calling process's current affinity mask.
SCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);

//////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Create a NET node from a <net> XML element and link it to its NIC.
 *
 * The node id is the element's "dev" attribute. The "guid" attribute, when present,
 * is parsed as hexadecimal into net.asic; otherwise net.asic falls back to the dev id.
 * "speed" (Mbps) is converted to bandwidth with speed / 8000.0, and non-positive speeds
 * are replaced by 10000. "latency", "port", "gdr" and "coll" default to 0 when absent.
 * A LINK_NET link is added in both directions between the NIC and the new NET node.
 *
 * @param xmlNet  XML element describing the network device.
 * @param system  Topology system receiving the new node.
 * @param nic     NIC node the network device hangs off.
 * @param busId   Bus id stored in net.busId.
 * @return scclSuccess, or the first error reported by a checked call.
 */
scclResult_t scclTopoAddNet(struct scclXmlNode* xmlNet, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
    int devId;
    SCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &devId));

    struct scclTopoNode* netNode;
    SCCLCHECK(scclTopoCreateNode(system, &netNode, NET, devId));

    const char* guid;
    SCCLCHECK(xmlGetAttr(xmlNet, "guid", &guid));
    if(guid == NULL) {
        netNode->net.asic = devId;
    } else {
        sscanf(guid, "0x%lx", &netNode->net.asic);
    }

    int speedMbps;
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &speedMbps, 0));
    // Some NICs define speed = -1
    if(speedMbps <= 0) {
        speedMbps = 10000;
    }
    netNode->net.bw = speedMbps / 8000.0;

    // A missing or unreadable latency attribute is not an error; it simply means 0.
    if(xmlGetAttrFloat(xmlNet, "latency", &netNode->net.latency) != scclSuccess) {
        netNode->net.latency = 0;
    }

    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &netNode->net.port, 0));
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &netNode->net.gdrSupport, 0));
    // SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &netNode->net.collSupport, 0));
    netNode->net.busId = busId;

    // NIC -> NET first, then NET -> NIC, both at the NET bandwidth.
    SCCLCHECK(scclTopoConnectNodes(nic, netNode, LINK_NET, netNode->net.bw));
    SCCLCHECK(scclTopoConnectNodes(netNode, nic, LINK_NET, netNode->net.bw));
    return scclSuccess;
}

/**
 * @brief Add every usable <net> child of a <nic> XML element to the topology.
 *
 * Children whose name is not "net", or that carry no "dev" attribute, are skipped.
 *
 * @param xmlNic  XML element whose children are scanned.
 * @param system  Topology system receiving the NET nodes.
 * @param nic     NIC node the NET nodes are linked to.
 * @param busId   Bus id forwarded to scclTopoAddNet.
 * @return scclSuccess, or the first error reported by a checked call.
 */
scclResult_t scclTopoAddNic(struct scclXmlNode* xmlNic, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
    for(int i = 0; i < xmlNic->nSubs; ++i) {
        struct scclXmlNode* child = xmlNic->subs[i];
        if(strcmp(child->name, "net") == 0) {
            int devAttrIndex;
            SCCLCHECK(xmlGetAttrIndex(child, "dev", &devAttrIndex));
            if(devAttrIndex != -1) {
                SCCLCHECK(scclTopoAddNet(child, system, nic, busId));
            }
        }
    }
    return scclSuccess;
}

/**
 * @brief Fill a GPU topology node from a <gpu> XML element.
 *
 * Attributes read, in order:
 * - "sm"   -> gpu.cudaCompCap
 * - "gcn"  -> converted with convertGcnArchToGcnArchName() and stored as a strdup'ed copy in gpu.gcn
 * - "arch" -> integer value copied into gpu.arch through scclHipDeviceArch_t
 * - "rank" -> gpu.rank
 * - "dev"  -> gpu.dev
 * - "gdr"  -> gpu.gdrSupport
 *
 * @param xmlGpu  XML element holding the GPU attributes.
 * @param system  Topology system (not used by this function).
 * @param gpu     GPU node to fill.
 * @return scclSuccess, or the first error reported by a checked call.
 *
 * @note Only basic attributes are handled here; GPU links are added in a later pass.
 */
scclResult_t scclTopoAddGpu(struct scclXmlNode* xmlGpu, struct scclTopoSystem* system, struct scclTopoNode* gpu) {
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));

    const char* archId;
    SCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &archId));
    const char* archName;
    convertGcnArchToGcnArchName(archId, &archName);
    gpu->gpu.gcn = strdup(archName);

    scclHipDeviceArch_t hipArch;
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &hipArch.value));
    memcpy(&gpu->gpu.arch, &hipArch.arch, sizeof(hipDeviceArch_t));

    SCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));

    // Stop here: links between GPUs are created in a second pass.
    return scclSuccess;
}

/**
 * @brief Add a PCI device described by a <pci> XML element to the topology.
 *
 * The node type comes from the element's "class" attribute (via kvDictPciClass) and is
 * overridden by the presence of a "gpu" or "nic" child:
 * - "gpu" child: a GPU node with id busId is created and filled by scclTopoAddGpu. If the
 *   child has no "rank" attribute the function returns scclSuccess without adding anything.
 * - "nic" child: the low 4 bits of busId are cleared so multi-port NICs share one NIC node;
 *   the node is looked up and created only if missing, then its <net> children are added.
 * - otherwise, when the type is PCI: a PCI node is created, its device id is assembled from
 *   the "vendor", "device", "subsystem_vendor" and "subsystem_device" attributes, and every
 *   child element is processed recursively with the new node as parent.
 *
 * Any node created here is linked to `parent` in both directions with LINK_PCI.
 *
 * @param xmlPci  XML element describing the PCI device.
 * @param system  Topology system receiving the nodes.
 * @param parent  Parent topology node.
 * @return scclSuccess, or the first error reported by a checked call.
 */
scclResult_t scclTopoAddPci(struct scclXmlNode* xmlPci, struct scclTopoSystem* system, struct scclTopoNode* parent) {
    const char* str;

    // Node type derived from the PCI class string.
    int type;
    SCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
    SCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));

    int64_t busId;
    SCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
    SCCLCHECK(busIdToInt64(str, &busId));

    // `node` stays NULL unless this call creates a node that must be linked to `parent`.
    struct scclTopoNode* node  = NULL;
    struct scclXmlNode* xmlGpu = NULL;
    SCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
    if(xmlGpu != NULL) {
        type = GPU;
        int index;
        SCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
        // A GPU element without a "rank" attribute is ignored entirely.
        if(index == -1)
            return scclSuccess;
        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
        SCCLCHECK(scclTopoAddGpu(xmlGpu, system, node));
    }
    struct scclXmlNode* xmlNic = NULL;
    SCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
    if(xmlNic != NULL) {
        type = NIC;
        // Ignore sub device ID and merge multi-port NICs into one PCI device.
        busId &= 0xfffffffffffffff0;
        struct scclTopoNode* nicNode = NULL;
        SCCLCHECK(scclTopoGetNode(system, &nicNode, type, busId));
        if(nicNode == NULL) {
            SCCLCHECK(scclTopoCreateNode(system, &nicNode, type, busId));
            node = nicNode; // Connect it to parent later on
        }
        SCCLCHECK(scclTopoAddNic(xmlNic, system, nicNode, busId));
    } else if(type == PCI) {
        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
        // Pack the four 16-bit identifiers into one 64-bit device id:
        // vendor << 48 | device << 32 | subsystem_vendor << 16 | subsystem_device.
        SCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0) << 48;
        SCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0) << 32;
        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0) << 16;
        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0);

        // Recurse into every child element, using the new PCI node as its parent.
        for(int s = 0; s < xmlPci->nSubs; s++) {
            struct scclXmlNode* xmlSubPci = xmlPci->subs[s];
            SCCLCHECK(scclTopoAddPci(xmlSubPci, system, node));
        }
    }

    if(node) {
        int width, speed;
        SCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
        SCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));

        // Manage cases where speed was not indicated in /sys
        if(width == 0)
            width = 16;
        SCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)

        // Link the new node and its parent in both directions with the same bandwidth.
        SCCLCHECK(scclTopoConnectNodes(node, parent, LINK_PCI, width * speed / 80.0));
        SCCLCHECK(scclTopoConnectNodes(parent, node, LINK_PCI, width * speed / 80.0));
    }
    return scclSuccess;
}

// Lookup table from the <cpu> "arch" attribute string to an SCCL_TOPO_CPU_ARCH_* value.
// The NULL-keyed entry carries the fallback value (0).
struct kvDict kvDictCpuArch[]   = {{"x86_64", SCCL_TOPO_CPU_ARCH_X86}, {"arm64", SCCL_TOPO_CPU_ARCH_ARM}, {"ppc64", SCCL_TOPO_CPU_ARCH_POWER}, {NULL, 0}};
// Lookup table from the <cpu> "vendor" attribute string to an SCCL_TOPO_CPU_VENDOR_* value.
// Two different vendor strings map to ZHAOXIN. The NULL-keyed entry carries the fallback value (0).
struct kvDict kvDictCpuVendor[] = {{"GenuineIntel", SCCL_TOPO_CPU_VENDOR_INTEL},
                                   {"AuthenticAMD", SCCL_TOPO_CPU_VENDOR_AMD},
                                   {"CentaurHauls", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
                                   {"  Shanghai  ", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
                                   {NULL, 0}};

/**
 * @brief Add a CPU node described by a <cpu> XML element, together with its children.
 *
 * The node id is the "numaid" attribute. The optional "affinity" attribute is parsed into
 * cpu.affinity, "arch" is mapped through kvDictCpuArch, and for x86 the "vendor",
 * "familyid" and "modelid" attributes select cpu.vendor and cpu.model. Each "pci" child
 * is added below the CPU; each "nic" child is attached to a shared NIC node with id 0,
 * which is created and linked to this CPU the first time it is needed.
 *
 * @param xmlCpu  XML element describing the CPU.
 * @param system  Topology system receiving the nodes.
 * @return scclSuccess, or the first error reported by a checked call.
 */
scclResult_t scclTopoAddCpu(struct scclXmlNode* xmlCpu, struct scclTopoSystem* system) {
    // The NUMA id doubles as the CPU node id.
    int numaNode;
    SCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaNode));

    struct scclTopoNode* cpuNode;
    SCCLCHECK(scclTopoCreateNode(system, &cpuNode, CPU, numaNode));

    // Affinity is optional.
    const char* attr;
    SCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &attr));
    if(attr) {
        SCCLCHECK(scclStrToCpuset(attr, &cpuNode->cpu.affinity));
    }

    // Architecture is mandatory.
    SCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &attr));
    SCCLCHECK(kvConvertToInt(attr, &cpuNode->cpu.arch, kvDictCpuArch));

    // Vendor and model are only resolved for x86.
    if(cpuNode->cpu.arch == SCCL_TOPO_CPU_ARCH_X86) {
        SCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &attr));
        SCCLCHECK(kvConvertToInt(attr, &cpuNode->cpu.vendor, kvDictCpuVendor));

        if(cpuNode->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
            int family, model;
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
            const bool useSkl   = (family == 6 && model >= 0x55);
            cpuNode->cpu.model = useSkl ? SCCL_TOPO_CPU_TYPE_SKL : SCCL_TOPO_CPU_INTEL_BDW;
        } else if(cpuNode->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
            int family, model;
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
            // Any other family/model leaves cpu.model at its initial value.
            if(family == 7 && model == 0x5B) {
                cpuNode->cpu.model = SCCL_TOPO_CPU_TYPE_YONGFENG;
            }
        }
        if(cpuNode->cpu.vendor == SCCL_TOPO_CPU_VENDOR_AMD) {
            int family, model;
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
            // "Milan" is treated as "Rome" as well.
            const bool useRome  = ((family == 143 && model >= 49) || family == 175);
            cpuNode->cpu.model = useRome ? SCCL_TOPO_CPU_TYPE_ROME : SCCL_TOPO_CPU_TYPE_ZEN;
        }
    }

    // Walk the children: PCI sub-trees and NICs attached directly to the CPU.
    for(int i = 0; i < xmlCpu->nSubs; ++i) {
        struct scclXmlNode* child = xmlCpu->subs[i];

        if(strcmp(child->name, "pci") == 0) {
            SCCLCHECK(scclTopoAddPci(child, system, cpuNode));
        }

        if(strcmp(child->name, "nic") != 0) {
            continue;
        }

        // All CPU-attached NICs share the NIC node with id 0.
        struct scclTopoNode* nicNode = NULL;
        SCCLCHECK(scclTopoGetNode(system, &nicNode, NIC, 0));
        if(nicNode == NULL) {
            SCCLCHECK(scclTopoCreateNode(system, &nicNode, NIC, 0));
            SCCLCHECK(scclTopoConnectNodes(cpuNode, nicNode, LINK_PCI, LOC_BW));
            SCCLCHECK(scclTopoConnectNodes(nicNode, cpuNode, LINK_PCI, LOC_BW));
        }
        SCCLCHECK(scclTopoAddNic(child, system, nicNode, 0));
    }
    return scclSuccess;
}

// scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
//     char* str = path + offset;
//     // Remove trailing "/"
//     if(*str == '/')
//         str--;
//     // Find next /
//     while(*str != '/')
//         str--;
//     str++;
//     int64_t numid;
//     SCCLCHECK(busIdToInt64(str, &numid));
//     // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
//     numid -= numid & 0xf;
//     *id = numid;
//     return scclSuccess;
// }

/**
 * @brief Find a CPU node reachable from `node` by following LINK_PCI links.
 *
 * The search is depth-first in link order and stops at the first CPU found.
 *
 * @param node  Starting node.
 * @param cpu   [out] The CPU node found, or NULL when none is reachable.
 * @return scclSuccess, or an error propagated from a recursive call.
 */
static scclResult_t findLocalCpu(struct scclTopoNode* node, struct scclTopoNode** cpu) {
    if(node->type == CPU) {
        *cpu = node;
        return scclSuccess;
    }

    *cpu = NULL;
    for(int i = 0; i < node->nlinks; ++i) {
        struct scclTopoLink* link = node->links + i;
        // Only PCI links are followed.
        if(link->type != LINK_PCI) {
            continue;
        }
        SCCLCHECK(findLocalCpu(link->remNode, cpu));
        if(*cpu) {
            break;
        }
    }
    return scclSuccess;
}

/**
 * @brief Pick the bandwidth used for links between CPU nodes.
 *
 * POWER uses P9_BW and ARM uses ARM_BW. On x86, Intel uses SKL_QPI_BW for the SKL model
 * and QPI_BW otherwise; Zhaoxin uses YONGFENG_ZPI_BW for the YONGFENG model and ZPI_BW
 * otherwise. Everything else gets LOC_BW.
 *
 * @param cpu  CPU node whose arch/vendor/model are inspected.
 * @param bw   [out] Selected bandwidth.
 * @return Always scclSuccess.
 */
static scclResult_t scclTopoGetInterCpuBw(struct scclTopoNode* cpu, float* bw) {
    const int arch = cpu->cpu.arch;

    if(arch == SCCL_TOPO_CPU_ARCH_POWER) {
        *bw = P9_BW;
    } else if(arch == SCCL_TOPO_CPU_ARCH_ARM) {
        *bw = ARM_BW;
    } else {
        *bw = LOC_BW;
        if(arch == SCCL_TOPO_CPU_ARCH_X86) {
            if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
                *bw = cpu->cpu.model == SCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
            }
            if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
                *bw = cpu->cpu.model == SCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
            }
        }
    }
    return scclSuccess;
}

// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch as this extra level can break the search and make
// SCCL take wrong topology decisions.
//
// For every PCI node whose device id matches the pattern below, the function:
//   1. removes the links to child PCI nodes that carry the same device id,
//   2. re-attaches each such child's remaining links directly to the parent switch,
//      redirecting the far end's back-link to the parent,
//   3. removes the child node from the system,
//   4. rewrites the parent's device id so it no longer matches the pattern.
//
// NOTE(review): links are appended to pciSwitch->links without a capacity check here —
// confirm the array is large enough for the merged link count.
// NOTE(review): if SCCLCHECK returns early on failure inside this block, subSwIds is
// not freed on that path — confirm against the macro definition.
scclResult_t scclTopoFlattenBcmSwitches(struct scclTopoSystem* system) {
    for(int s = 0; s < system->nodes[PCI].count; s++) {
        struct scclTopoNode* pciSwitch = system->nodes[PCI].nodes + s;
        uint64_t device                = pciSwitch->pci.device;
        // Only flatten PEX Gen 4 switches in base mode
        if((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
            // Find sub switches with the same device ID.
            int64_t* subSwIds;
            SCCLCHECK(scclCalloc(&subSwIds, pciSwitch->nlinks));
            int subs = 0;
            for(int l = 0; l < pciSwitch->nlinks; l++) {
                struct scclTopoNode* sub = pciSwitch->links[l].remNode;
                // Only fuse sub switches with the same device ID.
                if(sub->type != PCI || sub->pci.device != device)
                    continue;
                // Save sub switch for later
                subSwIds[subs++] = sub->id;
                // Remove link to that sub switch
                memmove(pciSwitch->links + l, pciSwitch->links + l + 1, (pciSwitch->nlinks - l - 1) * (sizeof(struct scclTopoLink)));
                pciSwitch->nlinks--;
                // Don't increase l for the next iteration as we just shifted all links by one.
                l--;
            }

            // Note: this inner `s` shadows the outer loop index for the duration of the loop.
            for(int s = 0; s < subs; s++) {
                // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
                int index;
                SCCLCHECK(scclTopoIdToIndex(system, PCI, subSwIds[s], &index));
                struct scclTopoNode* sub = system->nodes[PCI].nodes + index;
                // Connect all sub PCI devices to the parent switch
                for(int l = 0; l < sub->nlinks; l++) {
                    struct scclTopoNode* remNode = sub->links[l].remNode;
                    // Skip the sub switch's own link back to the parent.
                    if(remNode == pciSwitch)
                        continue;
                    // Add link from parent PCI switch -> PCI device
                    memcpy(pciSwitch->links + pciSwitch->nlinks, sub->links + l, sizeof(struct scclTopoLink));
                    pciSwitch->nlinks++;
                    // Update link from PCI device -> parent PCI switch
                    for(int rl = 0; rl < remNode->nlinks; rl++) {
                        if(remNode->links[rl].remNode == sub) {
                            remNode->links[rl].remNode = pciSwitch;
                            break;
                        }
                    }
                }
                SCCLCHECK(scclTopoRemoveNode(system, PCI, index));
            }
            // Set subdevice to 0x0000 to make sure we don't merge this switch again.
            pciSwitch->pci.device = 0x1000c01010000000;
            free(subSwIds);
            // Restart, as system->nodes[PCI].nodes has changed.
            // NOTE(review): the loop's s++ runs after this assignment, so the scan resumes
            // at index 1 and index 0 is not revisited — confirm that is intended.
            s = 0;
        }
    }
    return scclSuccess;
}

/**
 * @brief Link every ordered pair of distinct CPU nodes with a LINK_SYS link.
 *
 * The bandwidth of each link is chosen from the source CPU by scclTopoGetInterCpuBw.
 *
 * @param system  Topology system whose CPU nodes are connected.
 * @return scclSuccess, or the first error reported by a checked call.
 */
scclResult_t scclTopoConnectCpus(struct scclTopoSystem* system) {
    struct scclTopoNode* cpus = system->nodes[CPU].nodes;
    for(int src = 0; src < system->nodes[CPU].count; ++src) {
        for(int dst = 0; dst < system->nodes[CPU].count; ++dst) {
            if(src != dst) {
                float linkBw;
                SCCLCHECK(scclTopoGetInterCpuBw(cpus + src, &linkBw));
                SCCLCHECK(scclTopoConnectNodes(cpus + src, cpus + dst, LINK_SYS, linkBw));
            }
        }
    }
    return scclSuccess;
}

/**
 * Reorder the links of a PCI sub-tree so that, on every node, the link pointing
 * back to the node it was reached from comes last.
 *
 * @param node    Node whose links are reordered.
 * @param upNode  Node this one was reached from, or NULL for the root of the walk.
 * @return scclSuccess, or an error propagated from a recursive call.
 */
static scclResult_t scclTopoSort(struct scclTopoNode* node, struct scclTopoNode* upNode) {
    // When there is an upstream node, move the link that points to it to the end of the list.
    if(upNode) {
        int l = 0;
        // Locate the link to upNode. The loop has no bound: it relies on such a link existing.
        while(node->links[l].remNode != upNode)
            l++;
        struct scclTopoLink upLink;
        // Save the upstream link before it is overwritten.
        memcpy(&upLink, node->links + l, sizeof(struct scclTopoLink));
        // Shift the following links left by one. The loop stops at the first entry whose
        // remNode is NULL, so it relies on the slot after the last link being zeroed.
        while(node->links[l + 1].remNode) {
            memcpy(node->links + l, node->links + l + 1, sizeof(struct scclTopoLink));
            l++;
        }
        // Put the upstream link in the last used slot.
        memcpy(node->links + l, &upLink, sizeof(struct scclTopoLink));
    }

    // Recursively sort the PCI tree.
    for(int l = 0; l < node->nlinks; l++) {
        struct scclTopoLink* link = node->links + l;
        // Descend only through PCI links, and never back to the node we came from.
        if(link->type == LINK_PCI && link->remNode != upNode)
            SCCLCHECK(scclTopoSort(link->remNode, node));
    }
    return scclSuccess;
}

// Link ordering that eases and speeds up graph traversal:
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
// 4. SYS (already the case)
// Each CPU node is used as the root of one scclTopoSort() walk.
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system) {
    struct scclTopoNode* cpuNodes = system->nodes[CPU].nodes;
    for(int c = 0; c < system->nodes[CPU].count; ++c) {
        SCCLCHECK(scclTopoSort(cpuNodes + c, NULL));
    }
    return scclSuccess;
}

/**
 * @brief Select the per-link XGMI constant for a GPU architecture name.
 *
 * @param gcn  Architecture name, matched with IsArchMatch().
 * @return MI200_XGMI_WIDTH for "gfx90a", GFX94X_XGMI_WIDTH for "gfx94",
 *         VEGA_XGMI_WIDTH otherwise.
 */
float scclTopoXGMISpeed(const char* gcn) {
    if(IsArchMatch(gcn, "gfx90a")) {
        return MI200_XGMI_WIDTH;
    }
    if(IsArchMatch(gcn, "gfx94")) {
        return GFX94X_XGMI_WIDTH;
    }
    return VEGA_XGMI_WIDTH;
}

/**
 * @brief Walk an XML sub-tree and create the links described by its <xgmi> elements.
 *
 * For an element named "xgmi", the GPU identified by parentBusId is linked with LINK_NVL
 * to a peer chosen from the "tclass" attribute:
 * - GPU: the GPU whose bus id is given by the "target" attribute,
 * - CPU: the CPU found by findLocalCpu() starting from the GPU,
 * - anything else: the first NVS node, created with id 0 if none exists yet.
 * The bandwidth is "count" multiplied by scclTopoXGMISpeed() of the GPU. The reverse link
 * is added only when the peer is not a GPU. If no peer is found nothing is linked.
 *
 * For any other element, the children are processed recursively; the element's own
 * "busid" attribute, when present, replaces parentBusId for them.
 *
 * @param node         XML element to process.
 * @param system       Topology system holding the nodes.
 * @param parentBusId  Bus id string of the closest enclosing element that has one.
 * @return scclSuccess; scclInternalError when the GPU for parentBusId does not exist;
 *         or the first error reported by a checked call.
 */
scclResult_t scclTopoAddXGMI(struct scclXmlNode* node, struct scclTopoSystem* system, const char* parentBusId) {
    // Not an xgmi element: descend, carrying the nearest bus id along.
    if(strcmp(node->name, "xgmi") != 0) {
        const char* ownBusId;
        SCCLCHECK(xmlGetAttr(node, "busid", &ownBusId));
        const char* effectiveBusId = ownBusId ? ownBusId : parentBusId;
        for(int i = 0; i < node->nSubs; ++i) {
            SCCLCHECK(scclTopoAddXGMI(node->subs[i], system, effectiveBusId));
        }
        return scclSuccess;
    }

    // Resolve the GPU that owns this link.
    int64_t gpuBusId;
    SCCLCHECK(busIdToInt64(parentBusId, &gpuBusId));
    struct scclTopoNode* gpu = NULL;
    SCCLCHECK(scclTopoGetNode(system, &gpu, GPU, gpuBusId));
    if(!gpu) {
        WARN("Add XGMI error : could not find GPU %lx\n", gpuBusId);
        return scclInternalError;
    }

    int linkCount;
    SCCLCHECK(xmlGetAttrInt(node, "count", &linkCount));
    const char* peerClass;
    SCCLCHECK(xmlGetAttrStr(node, "tclass", &peerClass));
    int peerType;
    SCCLCHECK(kvConvertToInt(peerClass, &peerType, kvDictPciClass));

    // Resolve the other end of the link.
    struct scclTopoNode* peer = NULL;
    if(peerType == GPU) {
        // P2P connection to another GPU
        const char* peerBusIdStr;
        SCCLCHECK(xmlGetAttrStr(node, "target", &peerBusIdStr));
        int64_t peerBusId;
        SCCLCHECK(busIdToInt64(peerBusIdStr, &peerBusId));
        SCCLCHECK(scclTopoGetNode(system, &peer, GPU, peerBusId));
    } else if(peerType == CPU) {
        // Connection to the local CPU
        SCCLCHECK(findLocalCpu(gpu, &peer));
    } else if(system->nodes[NVS].count == 0) {
        SCCLCHECK(scclTopoCreateNode(system, &peer, NVS, 0));
    } else {
        peer = system->nodes[NVS].nodes;
    }

    if(peer == NULL) {
        return scclSuccess;
    }

    const float perLinkSpeed = scclTopoXGMISpeed(gpu->gpu.gcn);
    SCCLCHECK(scclTopoConnectNodes(gpu, peer, LINK_NVL, linkCount * perLinkSpeed));
    // GPU peers are not given a reverse link here.
    if(peer->type != GPU) {
        SCCLCHECK(scclTopoConnectNodes(peer, gpu, LINK_NVL, linkCount * perLinkSpeed));
    }
    return scclSuccess;
}

/**
 * @brief Compute the bitmask of NET nodes that are local to one GPU.
 *
 * Every NET node's path to GPU index g is examined. The NET nodes kept are those that tie
 * for the highest path bandwidth and, among equal bandwidth, the lowest path type. The id
 * of each kept NET node becomes a bit position in the result.
 *
 * @param system        Topology system.
 * @param g             Index of the GPU within the GPU node list.
 * @param localNetMask  [out] 64-bit mask; bit i is set when the NET node with id i was kept.
 * @param type          [out] Optional, may be NULL. Receives the best path type. It is only
 *                      written when some path improves on the initial state (bandwidth 0,
 *                      PATH_DIS), so callers should pre-initialize it.
 * @return scclSuccess; scclInternalError when a kept NET id does not fit in [0, 63];
 *         or the error reported by the checked allocation.
 */
static scclResult_t getLocalNetMask(struct scclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
    int minType = PATH_DIS;
    float maxBw = 0;
    int count   = 0;
    int* nets;
    SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
    for(int n = 0; n < system->nodes[NET].count; n++) {
        struct scclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU] + g;
        // A strictly better path resets the candidate list.
        if(path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
            maxBw   = path->bw;
            minType = path->type;
            if(type)
                *type = minType;
            count = 0;
        }
        if(path->bw == maxBw && path->type == minType)
            nets[count++] = system->nodes[NET].nodes[n].id;
    }

    *localNetMask = 0ULL;
    for(int n = 0; n < count; n++) {
        // The id is used as a shift count, so it must be a valid bit position.
        // Release the scratch buffer before bailing out so the error path does not leak.
        if(nets[n] < 0 || nets[n] >= 64) {
            free(nets);
            return scclInternalError;
        }
        *localNetMask |= 1ULL << nets[n];
    }
    free(nets);
    return scclSuccess;
}

/**
 * Log one node and, recursively, everything reachable from it through PCI links.
 *
 * The node description is written into `line` at `offset` and logged; the first `offset`
 * characters are then blanked so that following lines are indented under it. Each link
 * other than LINK_LOC and the PCI link back to `prevNode` is printed as
 * "+ <link type>[<bw>] - " followed by the remote node: PCI links recurse, all other
 * links print the remote node on the same line without descending.
 *
 * @param node      Node to print.
 * @param prevNode  Node this one was reached from (NULL at the root); its PCI link is skipped.
 * @param line      Shared scratch buffer, written with sprintf and no bounds check;
 *                  the caller must make it large enough.
 * @param offset    Column at which this node's text starts.
 * @return scclSuccess, or an error propagated from a recursive call.
 */
static scclResult_t scclTopoPrintRec(struct scclTopoNode* node, struct scclTopoNode* prevNode, char* line, int offset) {
    // Node description; the extra fields depend on the node type.
    if(node->type == GPU) {
        sprintf(line + offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
    } else if(node->type == CPU) {
        sprintf(line + offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
    } else if(node->type == PCI) {
        sprintf(line + offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
    } else {
        sprintf(line + offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
    }
    INFO(SCCL_LOG_TOPO, "%s", line);
    // Blank the prefix so the lines printed below are indented under this node.
    for(int i = 0; i < offset; i++)
        line[i] = ' ';

    for(int l = 0; l < node->nlinks; l++) {
        struct scclTopoLink* link = node->links + l;
        if(link->type == LINK_LOC)
            continue;
        // Skip only the PCI link that leads back to where we came from.
        if(link->type != LINK_PCI || link->remNode != prevNode) {
            sprintf(line + offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
            // The child's text starts right after the link prefix just written.
            int nextOffset = strlen(line);
            if(link->type == LINK_PCI) {
                SCCLCHECK(scclTopoPrintRec(link->remNode, node, line, nextOffset));
            } else {
                if(link->remNode->type == NET) {
                    sprintf(line + nextOffset,
                            "%s/%lX (%lx/%d/%f)",
                            topoNodeTypeStr[link->remNode->type],
                            link->remNode->id,
                            link->remNode->net.asic,
                            link->remNode->net.port,
                            link->remNode->net.bw);
                } else {
                    sprintf(line + nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
                }
                INFO(SCCL_LOG_TOPO, "%s", line);
            }
        }
    }
    return scclSuccess;
}

} // namespace topo_basic

////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Report whether /sys/bus/pci/drivers contains an entry whose name starts with "hsw".
 *
 * @return true when such an entry exists; false when none does or the directory
 *         cannot be opened.
 */
bool isHswDriverExist() {
    DIR* driversDir = opendir("/sys/bus/pci/drivers");
    if(driversDir == nullptr) {
        return false;
    }

    bool hasHsw = false;
    for(struct dirent* item = readdir(driversDir); item != nullptr; item = readdir(driversDir)) {
        // "." and ".." can never match the prefix, so no separate check is needed.
        if(strncmp(item->d_name, "hsw", 3) == 0) {
            hasHsw = true;
            break;
        }
    }

    closedir(driversDir);
    return hasHsw;
}

/**
 * @brief Count the entries of /sys/class/infiniband whose name starts with "mlx5".
 *
 * @return The number of matching entries, or 0 when the directory cannot be opened.
 */
int getIBNum() {
    DIR* ibDir = opendir("/sys/class/infiniband");
    if(ibDir == nullptr) {
        return 0;
    }

    int matches = 0;
    for(struct dirent* item = readdir(ibDir); item != nullptr; item = readdir(ibDir)) {
        // "." and ".." can never match the prefix, so no separate check is needed.
        if(strncmp(item->d_name, "mlx5", 4) == 0) {
            matches++;
        }
    }

    closedir(ibDir);
    return matches;
}

/**
 * @brief Pick the NET id a given rank should use for a given channel.
 *
 * The rank's GPU has a mask of local NET ids (see getLocalNetMask). GPUs with the same mask
 * share those NETs, and the NETs are handed out round-robin: walking the set bits of the
 * mask cyclically, each step is assigned to the next sharing GPU, and the channel number
 * advances each time all sharing GPUs have been served once.
 *
 * No scratch buffer is allocated: each GPU's mask is compared as soon as it is computed,
 * so there is nothing to release when a checked call fails.
 *
 * @param system     Topology system.
 * @param rank       Rank whose GPU is looked up with scclTopoRankToIndex.
 * @param channelId  Channel number; must be non-negative.
 * @param id         [out] Selected NET id (a bit position of the local NET mask).
 * @return scclSuccess; scclInternalError when channelId is negative, the GPU index is out
 *         of range, or the GPU has no local NET; or the first error reported by a checked call.
 */
scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id) {
    // The round-robin below counts channels up from 0, so a negative channel would never match.
    if(channelId < 0) {
        WARN("Invalid channel id %d for rank %d", channelId, rank);
        return scclInternalError;
    }

    const int ngpus = system->nodes[GPU].count;
    int gpu;
    SCCLCHECK(scclTopoRankToIndex(system, rank, &gpu));
    if(gpu < 0 || gpu >= ngpus)
        return scclInternalError;

    uint64_t localNetMask = 0ULL;
    SCCLCHECK(topo_basic::getLocalNetMask(system, gpu, &localNetMask, NULL));
    if(localNetMask == 0)
        return scclInternalError;

    // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
    // netLocalGpu is the position of our GPU among them.
    int netLocalGpus = 0, netLocalGpu = 0;
    for(int g = 0; g < ngpus; g++) {
        uint64_t mask = localNetMask;
        if(g != gpu) {
            SCCLCHECK(topo_basic::getLocalNetMask(system, g, &mask, NULL));
        }
        if(mask != localNetMask)
            continue;
        if(g == gpu)
            netLocalGpu = netLocalGpus;
        netLocalGpus++;
    }

    // Round robin on GPUs and channels. The mask is non-zero and netLocalGpus >= 1,
    // so the (netLocalGpu, channelId) slot is always reached.
    int gIndex = 0, cId = 0;
    for(int n = 0;; n = (n + 1) % 64) {
        if(!((1ULL << n) & localNetMask))
            continue;
        if(gIndex == netLocalGpu && cId == channelId) {
            *id = n;
            return scclSuccess;
        }
        gIndex++;
        if(gIndex == netLocalGpus) {
            gIndex = 0;
            cId++;
        }
    }
}

/**
 * @brief Pick the GPU index that should be associated with a given NET id.
 *
 * Among the GPUs whose local NET mask contains `net`, those with the lowest path type are
 * kept; they must all share the same mask. The NETs of that mask are then walked in bit
 * order, assigning GPUs round-robin, and the GPU assigned to `net` is returned.
 *
 * @param system    Topology system.
 * @param net       NET id, used as a bit position in a 64-bit mask.
 * @param gpuIndex  [out] Selected GPU index, or -1 when no GPU has `net` in a usable mask.
 * @return scclSuccess; scclInternalError when GPUs with the same path type disagree on the
 *         mask; or the error reported by getLocalNetMask or the allocation.
 */
scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex) {
    int ngpus = system->nodes[GPU].count;
    int* gpus;
    SCCLCHECK(scclCalloc(&gpus, ngpus));

    // Find localNetMask which includes net with the most local GPUs.
    int netLocalGpus = 0, minType = PATH_DIS;
    uint64_t localNetMask = 0ULL;
    for(int g = 0; g < ngpus; g++) {
        int type = PATH_DIS;
        uint64_t mask;
        // Checked by hand so that gpus is released before the error is propagated.
        scclResult_t res = topo_basic::getLocalNetMask(system, g, &mask, &type);
        if(res != scclSuccess) {
            free(gpus);
            return res;
        }
        if((1ULL << net) & mask) {
            // A strictly better path type restarts the candidate list with this mask.
            if(type < minType) {
                localNetMask = mask;
                netLocalGpus = 0;
                minType      = type;
            }
            if(type == minType) {
                if(localNetMask && mask != localNetMask) {
                    WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n",
                         g,
                         gpus[netLocalGpus - 1],
                         minType,
                         net,
                         mask,
                         localNetMask);
                    free(gpus);
                    return scclInternalError;
                }
                gpus[netLocalGpus] = g;
                netLocalGpus++;
            }
        }
    }
    if(localNetMask == 0ULL) {
        *gpuIndex = -1;
        free(gpus);
        return scclSuccess;
    }

    // Round robin on GPUs: each set bit of the mask consumes the next candidate GPU,
    // wrapping around, until the bit for `net` is reached.
    int gIndex = 0, n = 0;
    while(1) {
        if(1ULL << n & localNetMask) {
            if(n == net) {
                *gpuIndex = gpus[gIndex];
                free(gpus);
                return scclSuccess;
            }
            gIndex++;
            if(gIndex == netLocalGpus)
                gIndex = 0;
        }
        n = (n + 1) % 64;
    }
}

/**
 * @brief Report the arch, vendor and model recorded on the first CPU node.
 *
 * @param system  Topology system.
 * @param arch    [out] cpu.arch of CPU node 0.
 * @param vendor  [out] cpu.vendor of CPU node 0.
 * @param model   [out] cpu.model of CPU node 0.
 * @return Always scclSuccess.
 */
scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model) {
    const struct scclTopoNode* firstCpu = &system->nodes[CPU].nodes[0];
    *arch   = firstCpu->cpu.arch;
    *vendor = firstCpu->cpu.vendor;
    *model  = firstCpu->cpu.model;
    return scclSuccess;
}

/**
 * @brief Compute the CPU affinity set to use for the GPU of a given rank.
 *
 * The GPU with the given rank is located, then the CPU node with the fewest hops on its
 * GPU->CPU path is selected (the first one wins on ties). The result is that CPU node's
 * affinity set, intersected with the calling process's current affinity mask unless the
 * IgnoreCpuAffinity parameter is non-zero.
 *
 * @param system    Topology system.
 * @param rank      Rank of the GPU.
 * @param affinity  [out] Resulting CPU set; may be empty.
 * @return scclSuccess; scclInternalError when no GPU has this rank or the system has no
 *         CPU node; or the error reported by a checked call.
 */
scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity) {
    struct scclTopoNode *cpu = NULL, *gpu = NULL;
    for(int g = 0; g < system->nodes[GPU].count; g++) {
        if(system->nodes[GPU].nodes[g].gpu.rank != rank)
            continue;
        gpu = system->nodes[GPU].nodes + g;
        // Find closer CPU
        int cpuIndex = -1, minHops = 0;
        for(int c = 0; c < system->nodes[CPU].count; c++) {
            int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
            if(cpuIndex == -1 || nHops < minHops) {
                cpuIndex = c;
                minHops  = nHops;
            }
        }
        // With no CPU node at all cpuIndex stays -1; leave cpu NULL so the error below
        // is reported instead of forming a pointer before the start of the array.
        if(cpuIndex != -1)
            cpu = system->nodes[CPU].nodes + cpuIndex;
    }
    if(cpu == NULL) {
        WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
        return scclInternalError;
    }

    // Query the CPU affinity set we were provided
    cpu_set_t mask;
    SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");

    // Get the affinity of the CPU close to our GPU.
    cpu_set_t cpuMask = cpu->cpu.affinity;
    cpu_set_t finalMask;
    if(topo_basic::scclParamIgnoreCpuAffinity())
        // Ignore the process's current affinity and use the CPU node's set as-is
        finalMask = cpuMask;
    else
        // Keep only the CPUs present in both the process's set and the CPU node's set
        CPU_AND(&finalMask, &mask, &cpuMask);

    memcpy(affinity, &finalMask, sizeof(cpu_set_t));

    // Only a non-empty set is reported in the log; the (possibly empty) set is returned either way.
    if(CPU_COUNT(&finalMask)) {
        char affinityStr[sizeof(cpu_set_t) * 2];
        SCCLCHECK(scclCpusetToStr(&finalMask, affinityStr));
        INFO(SCCL_LOG_TOPO, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
    }
    return scclSuccess;
}

/**
 * @brief Report how many GPU nodes the topology holds.
 *
 * @param system  Topology system.
 * @param count   [out] Number of GPU nodes.
 * @return Always scclSuccess.
 */
scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count) {
    const int gpuCount = system->nodes[GPU].count;
    *count             = gpuCount;
    return scclSuccess;
}

/**
 * @brief Report how many NET nodes the topology holds.
 *
 * @param system  Topology system.
 * @param count   [out] Number of NET nodes.
 * @return Always scclSuccess.
 */
scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count) {
    const int netCount = system->nodes[NET].count;
    *count             = netCount;
    return scclSuccess;
}

/**
 * @brief Report how many NVS nodes the topology holds.
 *
 * @param system  Topology system.
 * @param count   [out] Number of NVS nodes.
 * @return Always scclSuccess.
 */
scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count) {
    const int nvsCount = system->nodes[NVS].count;
    *count             = nvsCount;
    return scclSuccess;
}

/**
 * @brief Translate a rank into the index of its GPU node.
 *
 * @param system     Topology system.
 * @param rank       Rank to look up.
 * @param localRank  [out] Index of the first GPU node whose gpu.rank equals `rank`.
 * @return scclSuccess, or scclInternalError when no GPU node has that rank.
 */
scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank) {
    const struct scclTopoNode* gpuNodes = system->nodes[GPU].nodes;
    for(int idx = 0; idx < system->nodes[GPU].count; ++idx) {
        if(gpuNodes[idx].gpu.rank != rank) {
            continue;
        }
        *localRank = idx;
        return scclSuccess;
    }
    WARN("Could not find local GPU with rank %d", rank);
    return scclInternalError;
}

/**
 * @brief Log the whole topology, one tree per CPU node, between a header and a footer line.
 *
 * @param s  Topology system to print.
 * @return scclSuccess, or the first error reported by scclTopoPrintRec.
 */
scclResult_t scclTopoPrint(struct scclTopoSystem* s) {
    // Scratch buffer shared by the recursive printer.
    char lineBuf[1024];

    INFO(SCCL_LOG_TOPO, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw);
    struct scclTopoNode* cpuNodes = s->nodes[CPU].nodes;
    for(int c = 0; c < s->nodes[CPU].count; ++c) {
        SCCLCHECK(topo_basic::scclTopoPrintRec(cpuNodes + c, NULL, lineBuf, 0));
    }
    INFO(SCCL_LOG_TOPO, "==========================================");
    return scclSuccess;
}

/**
 * Look up a node by type and id.
 *
 * @param system Topology system to search
 * @param node   Output: set to the first node of `type` whose id equals `id`.
 *               When no node matches, *node is NOT modified, so callers that need to detect
 *               "not found" must initialize it (e.g. to NULL) before the call.
 * @param type   Node type, used as index into system->nodes
 * @param id     Node id to match
 * @return scclResult_t Always scclSuccess, whether or not a node was found
 */
scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
    struct scclTopoNode* candidates = system->nodes[type].nodes;
    for(int idx = 0; idx < system->nodes[type].count; idx++) {
        if(candidates[idx].id != id)
            continue;
        *node = candidates + idx;
        break;
    }
    return scclSuccess;
}

/**
 * Append a new node of the given type to the topology system and give it type-specific defaults.
 *
 * @param system Topology system receiving the node
 * @param node   Output: pointer to the newly used slot in system->nodes[type].nodes
 * @param type   Node type, used as index into system->nodes
 * @param id     Id stored in the new node
 * @return scclResult_t scclSuccess, or scclInternalError when the per-type node array already
 *         holds SCCL_TOPO_MAX_NODES entries
 *
 * Only the fields listed below are written; everything else in the slot keeps whatever value it
 * already had.
 */
scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
    auto& typeSet = system->nodes[type];
    if(typeSet.count == SCCL_TOPO_MAX_NODES) {
        WARN("Error : tried to create too many nodes of type %d", type);
        return scclInternalError;
    }

    // Take the next free slot and account for it
    struct scclTopoNode* created = &typeSet.nodes[typeSet.count];
    typeSet.count++;
    created->type = type;
    created->id   = id;

    if(type == GPU) {
        created->gpu.dev         = SCCL_TOPO_UNDEF;
        created->gpu.rank        = SCCL_TOPO_UNDEF;
        created->gpu.cudaCompCap = SCCL_TOPO_UNDEF;
        // Create link to itself (used in some corner cases)
        struct scclTopoLink* selfLink = &created->links[0];
        selfLink->type                = LINK_LOC;
        selfLink->remNode             = created;
        selfLink->bw                  = LOC_BW;
        created->nlinks               = 1;
    } else if(type == CPU) {
        created->cpu.arch   = SCCL_TOPO_UNDEF;
        created->cpu.vendor = SCCL_TOPO_UNDEF;
        created->cpu.model  = SCCL_TOPO_UNDEF;
    } else if(type == NET) {
        created->net.asic    = 0ULL;
        created->net.port    = SCCL_TOPO_UNDEF;
        created->net.bw      = 0.0;
        created->net.latency = 0.0;
    }

    *node = created;
    return scclSuccess;
}

/**
 * Remove one node of the given type from the topology system.
 *
 * @param system Topology system to modify
 * @param type   Type of the node to remove (index into system->nodes)
 * @param index  Index of the node to remove within system->nodes[type].nodes
 * @return scclResult_t Always scclSuccess
 *
 * The function:
 * 1. Frees every paths[t] array held by the removed node
 * 2. Drops the links that other nodes hold to the removed node, and re-targets links that point
 *    to same-type nodes stored after it
 * 3. Shifts the remaining nodes of that type down by one slot
 * 4. Decrements the node count for that type
 *
 * NOTE(review): `type` and `index` are not range-checked here; callers are presumably expected
 * to pass a valid slot -- confirm.
 */
scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int index) {
    struct scclTopoNode* delNode = system->nodes[type].nodes + index;
    for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
        // Release the removed node's paths[t] array
        free(delNode->paths[t]);
        // Visit every node of type t (except the one being removed) and fix its links
        for(int n = 0; n < system->nodes[t].count; n++) {
            struct scclTopoNode* node = system->nodes[t].nodes + n;
            if(node == delNode)
                continue;
            for(int l = 0; l < node->nlinks; l++) {
                // Drop every link targeting delNode: close the gap by shifting the following links
                // down. The `while` re-tests slot l because the link moved into it may target
                // delNode as well.
                while(l < node->nlinks && node->links[l].remNode == delNode) {
                    memmove(node->links + l, node->links + l + 1, (node->nlinks - l - 1) * sizeof(struct scclTopoLink));
                    node->nlinks--;
                }
                // Links to delNode itself were removed above, so this only matches same-type nodes
                // stored after delNode. Those nodes move down one slot in the memmove at the end of
                // this function, so the pointer is adjusted ahead of time.
                if(l < node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
                    node->links[l].remNode--;
                }
            }
        }
    }
    // Close the gap in the node array and account for the removed node
    memmove(delNode, delNode + 1, (system->nodes[type].count - index - 1) * sizeof(struct scclTopoNode));
    system->nodes[type].count--;
    return scclSuccess;
}

/**
 * Add a directed link from `node` to `remNode`, or add bandwidth to an existing one.
 *
 * The link list is scanned up to the first entry whose remNode is NULL. If an entry with the same
 * remote node and link type is found first, `bw` is added to it; otherwise the NULL-terminated
 * slot is claimed and nlinks is incremented. The updated entry is then moved towards the front so
 * that links stay ordered by descending bandwidth.
 *
 * NOTE(review): the scan has no upper bound other than the NULL remNode sentinel, so it relies on
 * the links array always containing a free (zeroed) slot -- confirm against the array capacity.
 *
 * @param node    Node that owns the link list being updated
 * @param remNode Remote node the link points to
 * @param type    Link type
 * @param bw      Bandwidth to add to the link
 * @return scclResult_t Always scclSuccess
 */
scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw) {
    // Aggregate links into higher bw for NVLink
    struct scclTopoLink* slot = node->links;
    while(slot->remNode != NULL && !(slot->remNode == remNode && slot->type == type)) {
        slot++;
    }
    const bool isNewLink = (slot->remNode == NULL);
    if(isNewLink)
        node->nlinks++;
    slot->type    = type;
    slot->remNode = remNode;
    slot->bw += bw;

    // Sort links in BW descending order: shift lower-bandwidth predecessors one slot back,
    // then drop the updated link into the hole that opens up.
    struct scclTopoLink updated;
    memcpy(&updated, slot, sizeof(struct scclTopoLink));
    for(; slot != node->links; slot--) {
        struct scclTopoLink* prev = slot - 1;
        if(prev->bw >= updated.bw)
            break;
        memcpy(slot, prev, sizeof(struct scclTopoLink));
    }
    memcpy(slot, &updated, sizeof(struct scclTopoLink));
    return scclSuccess;
}

/**
 * Build a topology system from an XML description.
 *
 * Allocates *topoSystem, locates the "system" tag, adds every "cpu" sub-node through
 * scclTopoAddCpu, then runs scclTopoAddXGMI, scclTopoFlattenBcmSwitches, scclTopoConnectCpus and
 * scclTopoSortSystem on the result.
 *
 * @param xml        XML description to convert
 * @param topoSystem Output: newly allocated topology system. It is allocated before any parsing
 *                   happens and is not released here when a later step fails.
 * @return scclResult_t scclSuccess on success; scclInternalError when the XML has no "system"
 *         tag; otherwise the first error reported by a helper
 */
scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem) {
    SCCLCHECK(scclCalloc(topoSystem, 1));

    // Start from NULL so a lookup that finds nothing is detected instead of dereferencing garbage
    struct scclXmlNode* topNode = NULL;
    SCCLCHECK(xmlFindTag(xml, "system", &topNode));
    if(topNode == NULL) {
        WARN("Topology XML does not contain a \"system\" tag");
        return scclInternalError;
    }

    // Diagnostic goes through the topology log channel rather than raw stdout
    INFO(SCCL_LOG_TOPO, "topNode->nSubs=%d", topNode->nSubs);
    for(int s = 0; s < topNode->nSubs; s++) {
        struct scclXmlNode* node = topNode->subs[s];
        if(strcmp(node->name, "cpu") == 0)
            SCCLCHECK(topo_basic::scclTopoAddCpu(node, *topoSystem));
    }
    SCCLCHECK(topo_basic::scclTopoAddXGMI(topNode, *topoSystem, NULL));
    SCCLCHECK(topo_basic::scclTopoFlattenBcmSwitches(*topoSystem));
    SCCLCHECK(topo_basic::scclTopoConnectCpus(*topoSystem));
    SCCLCHECK(topo_basic::scclTopoSortSystem(*topoSystem));

    return scclSuccess;
}

/**
 * Compute the range of compute capabilities across all GPU nodes of the system.
 *
 * @param system Topology system to inspect
 * @param ccMin  Output (optional, may be NULL): smallest gpu.cudaCompCap found
 * @param ccMax  Output (optional, may be NULL): largest gpu.cudaCompCap found
 * @return scclResult_t scclSuccess, or scclInternalError when the system has no GPU node
 */
scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax) {
    const auto& gpuSet = system->nodes[GPU];
    if(gpuSet.count == 0)
        return scclInternalError;

    // Seed both bounds with the first GPU, then widen them with the remaining ones
    int lowest  = gpuSet.nodes[0].gpu.cudaCompCap;
    int highest = lowest;
    for(int g = 1; g < gpuSet.count; g++) {
        const int cc = gpuSet.nodes[g].gpu.cudaCompCap;
        if(cc < lowest)
            lowest = cc;
        if(highest < cc)
            highest = cc;
    }

    if(ccMin != NULL)
        *ccMin = lowest;
    if(ccMax != NULL)
        *ccMax = highest;
    return scclSuccess;
}

/**
 * Find the array index of the node with the given type and id.
 *
 * @param system Topology system to search
 * @param type   Node type, used as index into system->nodes
 * @param id     Node id to match
 * @param index  Output: index of the first matching node, or -1 when none matches
 * @return scclResult_t scclSuccess when found, scclInternalError otherwise
 */
scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index) {
    int found = -1;
    for(int i = 0; found < 0 && i < system->nodes[type].count; i++) {
        if(system->nodes[type].nodes[i].id == id)
            found = i;
    }
    *index = found;
    return (found < 0) ? scclInternalError : scclSuccess;
}

/**
 * Find the index of the GPU node that carries the given rank.
 *
 * @param system Topology system to search
 * @param rank   Rank to match against gpu.rank
 * @param index  Output: index of the first matching GPU node, or -1 when none matches
 * @return scclResult_t scclSuccess when found, scclInternalError otherwise
 */
scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index) {
    *index = -1;
    int gpuIdx = 0;
    while(gpuIdx < system->nodes[GPU].count) {
        if(system->nodes[GPU].nodes[gpuIdx].gpu.rank == rank) {
            *index = gpuIdx;
            break;
        }
        gpuIdx++;
    }
    if(*index == -1)
        return scclInternalError;
    return scclSuccess;
}

/**
 * Translate a device number into the rank stored on the matching GPU node.
 *
 * @param system Topology system to search
 * @param dev    Device number to match against gpu.dev
 * @param rank   Output: gpu.rank of the first matching GPU node, or -1 when none matches
 * @return scclResult_t scclSuccess when found, scclInternalError otherwise
 */
scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank) {
    *rank = -1;
    struct scclTopoNode* gpuNodes = system->nodes[GPU].nodes;
    for(int g = 0; g < system->nodes[GPU].count; g++) {
        if(gpuNodes[g].gpu.dev != dev)
            continue;
        *rank = gpuNodes[g].gpu.rank;
        return scclSuccess;
    }
    return scclInternalError;
}

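/**
 * @brief Get the system topology
 *
 * Retrieves the topology of the system, including GPU and NIC devices.
 * It first tries to load the topology from the XML file named by the SCCL_TOPO_FILE
 * environment variable; if that is not set, it tries to load a default topology file
 * (a different file is chosen depending on the number of IB devices).
 * Local GPU and NIC information is auto-detected and filled into the topology.
 *
 * @param comm Pointer to the communication context
 * @param system Output parameter, receives the pointer to the created topology system
 * @return scclResult_t Result of the operation, scclSuccess on success
 */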
// scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system) {
//     struct scclXml* xml;
//     SCCLCHECK(scclCalloc(&xml, 1));
//     char* xmlTopoFile = getenv("SCCL_TOPO_FILE");
//     if(xmlTopoFile) {
//         INFO(SCCL_LOG_TOPO, "SCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
//         SCCLCHECK(scclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
//     } else {
//         bool useDefaultTopo = true;
//         bool HswExist       = topo_basic::isHswDriverExist();
//         if(HswExist == true) {
//             char* rocmPath = getenv("ROCM_PATH");
//             if(rocmPath != NULL) {
//                 ::std::string xmlPath;
//                 int IBNum = topo_basic::getIBNum();
//                 if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
//                     xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-BW-topo-input.xml";
//                     if(access(xmlPath.c_str(), F_OK) == 0) {
//                         SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
//                         useDefaultTopo = false;
//                     }
//                 } else if(IBNum == 4 || IBNum == 5 || IBNum == 6) {
//                     xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-508-topo-input.xml";
//                     if(access(xmlPath.c_str(), F_OK) == 0) {
//                         SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
//                         useDefaultTopo = false;
//                     }
//                 }
//             }
//         }
//         if(useDefaultTopo) {
//             INFO(SCCL_LOG_TOPO, "No default topo for now, please provide your own topo xml file");
//         }
//     }

//     if(xml->maxIndex == 0) {
//         // Create top tag
//         struct scclXmlNode* top;
//         SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
//         SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
//     }

//     // Auto-detect GPUs if needed
//     for(int r = 0; r < comm->nRanks; r++) {
//         if(comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
//             char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
//             SCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
//             struct scclXmlNode* node;
//             SCCLCHECK(scclTopoFillGpu(xml, busId, &node));
//             if(node == NULL)
//                 continue;
//             SCCLCHECK(xmlSetAttrInt(node, "keep", 1));
//             SCCLCHECK(xmlSetAttrInt(node, "rank", r));
//             SCCLCHECK(topo_basic::xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
//         }
//     }

//     // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
//     // so we start with collnet so that it has precedence.
//     int netDevCount = 0;
//     if(netDevCount == 0) {
//         SCCLCHECK(comm->scclNet->devices(&netDevCount));
//     }

//     for(int n = 0; n < netDevCount; n++) {
//         sccl::hardware::net::scclNetProperties_t props;
//         SCCLCHECK(comm->scclNet->getProperties(n, &props));
//         struct scclXmlNode* netNode;
//         SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
//         SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
//         SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "speed", props.speed));
//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "port", props.port));
//         SCCLCHECK(topo_basic::xmlInitAttrFloat(netNode, "latency", props.latency));
//         SCCLCHECK(topo_basic::xmlInitAttrUint64(netNode, "guid", props.guid));
//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "maxconn", props.maxComms));
//         bool gdrSupport =
//             (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF));
//         INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "gdr", gdrSupport));
//     }

//     // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
//     SCCLCHECK(scclTopoTrimXml(xml));

//     xmlTopoFile = getenv("SCCL_TOPO_DUMP_FILE");
//     if(xmlTopoFile && comm->rank == topo_basic::scclParamTopoDumpFileRank()) {
//         INFO(SCCL_LOG_TOPO, "SCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
//         SCCLCHECK(scclTopoDumpXmlToFile(xmlTopoFile, xml));
//     }

//     SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
//     free(xml);

//     return scclSuccess;
// }

/**
 * Fill `xml` with the local topology description and convert it into a topology system.
 *
 * Helper for scclTopoGetSystem. The caller owns `xml` and releases it whatever the outcome, which
 * is why the SCCLCHECK calls live here rather than in the entry point.
 *
 * @param xml    XML buffer to populate
 * @param system Output: topology system built by scclTopoGetSystemFromXml
 * @return scclResult_t scclSuccess, or the first error reported by a step
 */
static scclResult_t scclTopoBuildSystemFromDetection(struct scclXml* xml, struct scclTopoSystem** system) {
    using namespace sccl;

    // When the HSW driver is present and the IB device count is 8, 9 or 10, start from the
    // built-in topology file.
    if(isHswDriverExist()) {
        const int IBNum = getIBNum();
        if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
            const char* builtinTopoXml = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
            SCCLCHECK(scclTopoGetXmlFromFile(builtinTopoXml, xml, 1));
        }
    }
    if(xml->maxIndex == 0) {
        // Nothing was loaded: create top tag
        struct scclXmlNode* top;
        SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
        SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
    }

    // GPU auto-detection is currently disabled: it needs the per-rank peer info of a communicator
    // (see the commented-out comm-based variant of scclTopoGetSystem above), which this entry
    // point does not receive.

    // Auto-detect NICs through the IB network backend.
    // NOTE(review): the result of initNet is used without checking that it is valid -- confirm
    // initNet cannot fail here.
    auto scclNet    = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB);
    int netDevCount = 0;
    SCCLCHECK(scclNet->devices(&netDevCount));

    for(int n = 0; n < netDevCount; n++) {
        sccl::hardware::net::scclNetProperties_t props;
        SCCLCHECK(scclNet->getProperties(n, &props));
        struct scclXmlNode* netNode;
        SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
        SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
        SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
        SCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
        SCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
        SCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
        SCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
        SCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
        // NOTE(review): GDR is reported only when BOTH the CUDA and DMABUF pointer types are
        // supported; the commented-out comm-based variant used
        // `CUDA || (dmaBufSupport && DMABUF)`. Confirm the stricter condition is intended.
        bool gdrSupport = (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF);
        INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
        SCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
    }

    // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
    SCCLCHECK(scclTopoTrimXml(xml));
    SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
    return scclSuccess;
}

/**
 * Detect the local topology and build the corresponding topology system.
 *
 * A temporary XML buffer is allocated, optionally pre-loaded from the built-in topology file,
 * completed with the NICs reported by the IB network backend, trimmed, and converted with
 * scclTopoGetSystemFromXml. The XML buffer is released before returning, on success and on
 * failure alike.
 *
 * @param system Output: pointer to the created topology system
 * @return scclResult_t scclSuccess, or the first error reported by a step
 */
scclResult_t scclTopoGetSystem(struct scclTopoSystem** system) {
    struct scclXml* xml;
    SCCLCHECK(scclCalloc(&xml, 1));
    const scclResult_t res = scclTopoBuildSystemFromDetection(xml, system);
    free(xml);
    return res;
}

} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl