Initial Code for SCCL_v1

d9d23f34 · lishen · 57df3737 · d9d23f34 · d9d23f34 · d9d23f34
Commit d9d23f34 authored Jun 20, 2025 by lishen
20 changed files
--- a/src/hardware/net/net_utils.h
+++ b/src/hardware/net/net_utils.h
+#pragma once
+
+#include <stdint.h>
+#include "base.h"
+
+namespace sccl {
+namespace hardware {
+namespace net {
+
+struct netIf {       // network interface descriptor
+    char prefix[64]; // network (interface name) prefix
+    int port;        // port number
+};
+
+// Parse a string list and store the result in the network-interface list.
+int parseStringList(const char* string, struct netIf* ifList, int maxList);
+
+// Match the given string and port against the entries of the network-interface list.
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
+
+// One-time initialization of the ROCm runtime wrapper (defined in rocm_wrap.cpp).
+scclResult_t rocmLibraryInit(void);
+
+////////////////////////////////// Network device definitions //////////////////////////////////
+typedef struct {
+    char* name;     // Used mainly for logging.
+    char* pciPath;  // Path of the PCI device in /sys.
+    uint64_t guid;  // Unique identifier of the NIC chip. Important for cards with multiple PCI functions (physical or virtual).
+    int ptrSupport; // [SCCL_PTR_HOST|SCCL_PTR_CUDA|SCCL_PTR_DMABUF]
+    int speed;      // Port speed in Mbps.
+    int port;       // Port number.
+    float latency;  // Network latency.
+    int maxComms;   // Maximum number of comms we can create.
+    int maxRecvs;   // Maximum number of grouped receives.
+} scclNetProperties_t;
+
+// Table of function pointers that a network backend fills in.
+typedef struct {
+    // Name of the network (mainly for logs).
+    const char* name;
+    // Initialize the network.
+    scclResult_t (*init)();
+    // Return the number of adapters.
+    scclResult_t (*devices)(int* ndev);
+    // Get various device properties.
+    scclResult_t (*getProperties)(int dev, scclNetProperties_t* props);
+    // Create a receiving object and provide a handle to connect to it. The handle can be up to
+    // SCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged between ranks to create a connection.
+    scclResult_t (*listen)(int dev, void* handle, void** listenComm);
+    // Connect to a handle and return a sending comm object for that peer.
+    // This call must not block while the connection is established; it should instead return successfully
+    // with sendComm == NULL and expect to be called again until sendComm != NULL.
+    scclResult_t (*connect)(int dev, void* handle, void** sendComm);
+    // Finalize connection establishment after the remote peer has called connect.
+    // This call must not block while the connection is established; it should instead return successfully
+    // with recvComm == NULL and expect to be called again until recvComm != NULL.
+    scclResult_t (*accept)(void* listenComm, void** recvComm);
+    // Register/deregister memory. Comm can be either a sendComm or a recvComm.
+    // Type is either SCCL_PTR_HOST or SCCL_PTR_CUDA.
+    scclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+    /* DMA-BUF support */
+    scclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+    scclResult_t (*deregMr)(void* comm, void* mhandle);
+    // Asynchronous send to a peer.
+    // May return request == NULL if the call cannot be performed (or would block).
+    scclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+    // Asynchronous receive from a peer. May return request == NULL if the call cannot be performed (or would block).
+    scclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+    // Perform a flush/fence to make sure all data received with SCCL_PTR_CUDA is visible to the GPU.
+    scclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+    // Test whether a request is complete. If size is not NULL, it returns the number of bytes sent/received.
+    scclResult_t (*test)(void* request, int* done, int* sizes);
+    // Close and free send/recv comm objects.
+    scclResult_t (*closeSend)(void* sendComm);
+    scclResult_t (*closeRecv)(void* recvComm);
+    scclResult_t (*closeListen)(void* listenComm);
+} scclNet_t;
+
+////////////////////////////////// Other definitions //////////////////////////////////
+
+// Pointer-type flags; the values are distinct bits, and scclNetProperties_t::ptrSupport
+// is documented as an OR-combination of them.
+typedef enum sccl_ptr {
+    SCCL_PTR_HOST   = 0x1,
+    SCCL_PTR_CUDA   = 0x2,
+    SCCL_PTR_DMABUF = 0x4
+} sccl_ptr_t;
+
+// Upper bound, in bytes, of the connection handle produced by scclNet_t::listen.
+#define SCCL_NET_HANDLE_MAXSIZE 128
+
+} // namespace net
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/net/rocm_wrap.cpp
+++ b/src/hardware/net/rocm_wrap.cpp
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <sys/utsname.h>
+#include <fstream>
+
+#include "base.h"
+#include "rocm_wrap.h"
+
+namespace sccl {
+namespace hardware {
+namespace net {
+namespace rocm_wrap {
+// Defines the function-pointer variable pfn_<symbol> of type PFN_<symbol>, initially null.
+#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
+
+DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
+/* ROCr Driver functions loaded with dlsym() */
+DECLARE_ROCM_PFN(hsa_init);
+DECLARE_ROCM_PFN(hsa_system_get_info);
+DECLARE_ROCM_PFN(hsa_status_string);
+
+// NOTE(review): SCCL_PARAM is defined elsewhere; presumably it generates the
+// scclParamDmaBufEnable() accessor used by initOnceFunc, with a default of 0 — confirm
+// the exact environment variable name it reads.
+SCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);
+
+static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; // guards the single run of initOnceFunc
+static scclResult_t initResult;                            // outcome stored by initOnceFunc
+
+static void* hsaLib;                          // dlopen() handle of the ROCr runtime library
+static uint16_t version_major, version_minor; // ROCr version read through hsa_system_get_info
+bool scclCudaLaunchBlocking = false;          // set from the CUDA_LAUNCH_BLOCKING environment variable
+
+//////////////////////////////////////////////////////////////////////////////
+// Returns true only when /boot/config-<kernel release> could be read and contains both
+// CONFIG_DMABUF_MOVE_NOTIFY=y and CONFIG_PCI_P2PDMA=y. Any failure to determine the
+// kernel release or to open the config file is reported as "not supported".
+static bool kernelConfigHasDmaBuf() {
+    struct utsname utsname;
+    char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
+    char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
+    char kernel_conf_file[128];
+    char buf[256];
+    int found_opt1 = 0;
+    int found_opt2 = 0;
+
+    // Without a kernel release string the config file name cannot be built.
+    if(uname(&utsname) == -1) {
+        INFO(SCCL_LOG_NET, "Could not get kernel name");
+        return false;
+    }
+
+    // format and store the kernel conf file location
+    snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
+    FILE* fp = fopen(kernel_conf_file, "r");
+    if(fp == NULL) {
+        INFO(SCCL_LOG_NET, "Could not open kernel conf file");
+        return false;
+    }
+
+    // look for kernel_opt1 and kernel_opt2 in the conf file
+    while(fgets(buf, sizeof(buf), fp) != NULL) {
+        if(strstr(buf, kernel_opt1) != NULL) {
+            found_opt1 = 1;
+            INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
+        }
+        if(strstr(buf, kernel_opt2) != NULL) {
+            found_opt2 = 1;
+            INFO(SCCL_LOG_NET, "CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
+        }
+    }
+    fclose(fp);
+
+    if(!found_opt1 || !found_opt2) {
+        INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
+        INFO(SCCL_LOG_NET, "DMA_BUF_SUPPORT Failed due to OS kernel support");
+        return false;
+    }
+    return true;
+}
+
+// One-time initializer executed through pthread_once by rocmLibraryInit().
+//
+// Loads libhsa-runtime64.so (optionally prefixed by RCCL_ROCR_PATH), resolves the ROCr
+// entry points with dlsym, logs the ROCr version and then probes DMA-BUF support.
+// initResult is set to scclSuccess only when every step, including the DMA-BUF checks,
+// succeeds; every other outcome (including DMA-BUF being disabled) stores scclSystemError.
+static void initOnceFunc() {
+    do {
+        char* val              = getenv("CUDA_LAUNCH_BLOCKING");
+        scclCudaLaunchBlocking = val != nullptr && val[0] != 0 && !(val[0] == '0' && val[1] == 0);
+    } while(0);
+
+    bool dmaBufSupport = false;
+    hsa_status_t res;
+
+    /*
+     * Load ROCr driver library
+     */
+    char path[1024];
+    char* scclCudaPath = getenv("RCCL_ROCR_PATH");
+    if(scclCudaPath == NULL)
+        snprintf(path, sizeof(path), "%s", "libhsa-runtime64.so");
+    else
+        snprintf(path, sizeof(path), "%s%s", scclCudaPath, "libhsa-runtime64.so");
+
+    hsaLib = dlopen(path, RTLD_LAZY);
+    if(hsaLib == NULL) {
+        // Report the path that was actually tried; never hand a null pointer to %s.
+        WARN("Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s)", path, scclCudaPath != NULL ? scclCudaPath : "(not set)");
+        goto error;
+    }
+
+    /*
+     * Load initial ROCr functions
+     */
+
+    pfn_hsa_init = (PFN_hsa_init)dlsym(hsaLib, "hsa_init");
+    if(pfn_hsa_init == NULL) {
+        WARN("Failed to load ROCr missing symbol hsa_init");
+        goto error;
+    }
+    /*
+     * Required to initialize the ROCr Driver.
+     * Multiple calls of hsa_init() will return immediately
+     * without making any relevant change
+     */
+    res = pfn_hsa_init();
+    if(res != HSA_STATUS_SUCCESS) {
+        WARN("hsa_init failed with %d", res);
+        goto error;
+    }
+
+    pfn_hsa_system_get_info = (PFN_hsa_system_get_info)dlsym(hsaLib, "hsa_system_get_info");
+    if(pfn_hsa_system_get_info == NULL) {
+        WARN("Failed to load ROCr missing symbol hsa_system_get_info");
+        goto error;
+    }
+
+    pfn_hsa_status_string = (PFN_hsa_status_string)dlsym(hsaLib, "hsa_status_string");
+    if(pfn_hsa_status_string == NULL) {
+        WARN("Failed to load ROCr missing symbol hsa_status_string");
+        goto error;
+    }
+
+    res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
+    if(res != 0) {
+        WARN("pfn_hsa_system_get_info failed with %d", res);
+        goto error;
+    }
+    res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
+    if(res != 0) {
+        WARN("pfn_hsa_system_get_info failed with %d", res);
+        goto error;
+    }
+
+    INFO(SCCL_LOG_NET, "ROCr version %d.%d", version_major, version_minor);
+
+    /* DMA-BUF support */
+    // NOTE(review): the message names SCCL_ENABLE_DMABUF_SUPPORT while the parameter is
+    // declared as "DMABUF_ENABLE" — confirm which variable users must actually set.
+    if(scclParamDmaBufEnable() == 0) {
+        INFO(SCCL_LOG_NET, "Dmabuf feature disabled without SCCL_ENABLE_DMABUF_SUPPORT=1");
+        goto error;
+    }
+    res = pfn_hsa_system_get_info((hsa_system_info_t)0x204, &dmaBufSupport);
+    if(res != HSA_STATUS_SUCCESS || !dmaBufSupport) {
+        INFO(SCCL_LOG_NET, "Current version of ROCm does not support dmabuf feature.");
+        goto error;
+    }
+
+    pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf)dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
+    if(pfn_hsa_amd_portable_export_dmabuf == NULL) {
+        WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
+        goto error;
+    }
+
+    // check OS kernel support
+    if(!kernelConfigHasDmaBuf())
+        goto error;
+    INFO(SCCL_LOG_NET, "DMA_BUF Support Enabled");
+
+    // Return here so that the success value is not overwritten by the error path below.
+    initResult = scclSuccess;
+    return;
+
+error:
+    initResult = scclSystemError;
+    return;
+}
+
+} // namespace rocm_wrap
+
+// Runs rocm_wrap::initOnceFunc through pthread_once and reports the result it stored.
+scclResult_t rocmLibraryInit() {
+    pthread_once(&rocm_wrap::initOnceControl, rocm_wrap::initOnceFunc);
+    const scclResult_t outcome = rocm_wrap::initResult;
+    return outcome;
+}
+
+} // namespace net
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/net/rocm_wrap.h
+++ b/src/hardware/net/rocm_wrap.h
+#pragma once
+
+#include <hsa/hsa.h>
+
+namespace sccl {
+namespace hardware {
+namespace net {
+namespace rocm_wrap {
+
+// Function-pointer types for the ROCr entry points that rocm_wrap.cpp resolves with dlsym().
+typedef hsa_status_t (*PFN_hsa_init)();
+typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
+typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char** status_string);
+typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
+
+// Declares (without defining) the function-pointer variable pfn_<symbol> of type PFN_<symbol>.
+#define DECLARE_ROCM_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
+
+DECLARE_ROCM_PFN_EXTERN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
+
+/* ROCr Driver functions loaded with dlsym() */
+DECLARE_ROCM_PFN_EXTERN(hsa_init);
+DECLARE_ROCM_PFN_EXTERN(hsa_system_get_info);
+DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
+
+} // namespace rocm_wrap
+
+// Loads the ROCr runtime once per process and returns the stored result (defined in rocm_wrap.cpp).
+scclResult_t rocmLibraryInit(void);
+
+} // namespace net
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/readme.MD
+++ b/src/hardware/readme.MD
+# hardware功能
+
+包括基于硬件的网络连接，以及通信相关的底层指令
--- a/src/hardware/topo_bak/cpuset.h
+++ b/src/hardware/topo_bak/cpuset.h
+/*************************************************************************
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef SCCL_CPUSET_H_
+#define SCCL_CPUSET_H_
+
+#include "base.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace topo {
+// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
+
+/**
+ * Convert one hexadecimal digit character to its integer value.
+ *
+ * @param c hexadecimal digit (0-9, a-f or A-F)
+ * @return the digit value (0-15), or -1 if c is not a hexadecimal digit
+ */
+static int hexToInt(char c) {
+    if(c >= '0' && c <= '9')
+        return c - '0';
+    if(c >= 'a' && c <= 'f')
+        return 10 + (c - 'a');
+    // Upper-case digits are accepted as well so that either spelling parses.
+    if(c >= 'A' && c <= 'F')
+        return 10 + (c - 'A');
+    return -1;
+}
+
+// Number of 32-bit words that make up a cpu_set_t.
+#define CPU_SET_N_U32 (sizeof(cpu_set_t) / sizeof(uint32_t))
+
+/**
+ * Convert a hexadecimal string into a CPU set mask.
+ *
+ * @param str  hexadecimal string whose 32-bit groups are separated by commas
+ * @param mask output CPU set; fully overwritten on success, untouched on error
+ * @return scclSuccess on success; scclInvalidArgument when the string holds more
+ *         comma-separated groups than a cpu_set_t has 32-bit words
+ *
+ * @note Groups run from the most significant 32-bit word (left) to the least
+ *       significant one (right); each character contributes 4 bits.
+ *       Parsing stops at the first character that is not a hexadecimal digit.
+ *       Words above the parsed groups are cleared.
+ */
+static scclResult_t scclStrToCpuset(const char* str, cpu_set_t* mask) {
+    uint32_t cpumasks[CPU_SET_N_U32];
+    int m       = CPU_SET_N_U32 - 1;
+    cpumasks[m] = 0;
+    for(const char* p = str; *p != '\0'; p++) {
+        char c = *p;
+        if(c == ',') {
+            // Another group would land before the start of cpumasks.
+            if(m == 0)
+                return scclInvalidArgument;
+            m--;
+            cpumasks[m] = 0;
+        } else {
+            int v = hexToInt(c);
+            if(v == -1)
+                break;
+            cpumasks[m] <<= 4;
+            cpumasks[m] += v;
+        }
+    }
+    // Clear the whole mask first so that words not covered by the string are zero.
+    memset(mask, 0, sizeof(cpu_set_t));
+    // Copy cpumasks to mask: the last parsed group becomes the lowest 32-bit word.
+    for(int a = 0; m < (int)CPU_SET_N_U32; a++, m++) {
+        memcpy(((uint32_t*)mask) + a, cpumasks + m, sizeof(uint32_t));
+    }
+    return scclSuccess;
+}
+
+/**
+ * Render a CPU set mask as a hexadecimal string.
+ *
+ * @param mask CPU set to render
+ * @param str  output buffer; its capacity is not checked here, the caller must size it
+ * @return scclSuccess
+ *
+ * Bytes are emitted from the highest to the lowest as two hex digits each, with a
+ * comma after every byte whose index is a non-zero multiple of 4. Zero bytes
+ * ahead of the first non-zero byte are skipped, so an empty set yields "".
+ */
+static scclResult_t scclCpusetToStr(cpu_set_t* mask, char* str) {
+    const uint8_t* bytes = (const uint8_t*)mask;
+    char* out            = str;
+    for(int idx = (int)sizeof(cpu_set_t) - 1; idx >= 0; idx--) {
+        // Nothing has been written yet: keep skipping leading zero bytes.
+        if(out == str && bytes[idx] == 0)
+            continue;
+        out += sprintf(out, "%02x", bytes[idx]);
+        if(idx != 0 && idx % 4 == 0)
+            *out++ = ',';
+    }
+    *out = '\0';
+    return scclSuccess;
+}
+
+/**
+ * Render a CPU set mask as a range string such as "0-3,5,7-9".
+ *
+ * @param mask CPU set to render
+ * @param str  output buffer
+ * @param len  capacity of str in bytes; when 0 the buffer is left untouched
+ * @return str
+ *
+ * The output is truncated when the buffer is too small. An empty set yields "".
+ */
+static char* scclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
+    // A zero-length buffer cannot even hold the terminator.
+    if(len == 0)
+        return str;
+    size_t c  = 0;
+    int start = -1;
+    str[0]    = '\0';
+    // Iterate through all possible CPU bits plus one extra position that closes a trailing range
+    for(int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
+        int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask);
+        // Start of a new range
+        if(isSet && start == -1) {
+            start = cpu;
+        }
+        // End of a range, add comma between ranges
+        if(!isSet && start != -1) {
+            int written;
+            if(cpu - 1 == start) {
+                written = snprintf(str + c, len - c, "%s%d", c ? "," : "", start);
+            } else {
+                written = snprintf(str + c, len - c, "%s%d-%d", c ? "," : "", start, cpu - 1);
+            }
+            if(written < 0)
+                break;
+            c += (size_t)written;
+            // Buffer is full (or the last write was truncated): stop before len - c can underflow.
+            if(c >= len - 1)
+                break;
+            start = -1;
+        }
+    }
+    return str;
+}
+
+} // namespace topo
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
+#endif
--- a/src/hardware/topo_bak/detect_topo.cc
+++ b/src/hardware/topo_bak/detect_topo.cc
--- a/src/hardware/topo_bak/detect_topo.h
+++ b/src/hardware/topo_bak/detect_topo.h
--- a/src/hardware/topo_bak/nvmlwrap.cc
+++ b/src/hardware/topo_bak/nvmlwrap.cc
+/*************************************************************************
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nvmlwrap.h"
+#include "base.h"
+
+#include <initializer_list>
+#include <memory>
+#include <mutex>
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+
+// Global tables filled in by scclNvmlEnsureInitialized() and read by the wrappers below.
+int scclNvmlDeviceCount = 0;                                                        // number of devices reported by NVML
+scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];                             // per-device handle and compute capability
+scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices]; // per-pair P2P read/write status
+
+// SCCL_NVML_FN(name, rettype, arglist) defines the function pointer pfn_<name>:
+// bound to the NVML function itself in direct mode, otherwise null until resolved with dlsym().
+#if SCCL_NVML_DIRECT
+#define SCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name) arglist = name;
+#else
+#include <dlfcn.h>
+#define SCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name) arglist = nullptr;
+#endif
+
+namespace {
+// One function pointer per NVML entry point used in this file.
+SCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
+SCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
+SCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
+SCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
+SCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
+SCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
+SCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t* device))
+SCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
+SCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
+SCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive))
+SCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci))
+SCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult))
+SCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
+SCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
+SCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values))
+
+std::mutex lock; // NVML has had some thread safety bugs
+bool initialized                    = false; // set once the first initialization attempt has started
+thread_local bool threadInitialized = false; // per-thread flag that lets later calls skip the lock
+scclResult_t initResult;                     // result of the single initialization attempt
+} // namespace
+
+// Initializes NVML at most once per process and fills the global device tables.
+//
+// The first call from each thread takes the mutex; the first call overall loads
+// libnvidia-ml.so.1 (unless SCCL_NVML_DIRECT), calls nvmlInit, queries the device count
+// and caches each device handle, its compute capability and the read/write P2P status of
+// every device pair. Later calls return the stored result.
+//
+// Returns scclSuccess, scclSystemError (library, symbol or NVML call failure) or
+// scclInternalError (more devices than scclNvmlMaxDevices).
+scclResult_t scclNvmlEnsureInitialized() {
+    // Optimization to avoid repeatedly grabbing the lock when we only want to
+    // read from the global tables.
+    if(threadInitialized)
+        return initResult;
+    threadInitialized = true;
+
+    std::lock_guard<std::mutex> locked(lock);
+
+    if(initialized)
+        return initResult;
+    initialized = true;
+
+#if !SCCL_NVML_DIRECT
+    if(pfn_nvmlInit == nullptr) {
+        void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
+        if(libhandle == nullptr) {
+            WARN("Failed to open libnvidia-ml.so.1");
+            initResult = scclSystemError;
+            return initResult;
+        }
+
+        struct Symbol {
+            void** ppfn;
+            char const* name;
+        };
+        std::initializer_list<Symbol> symbols = {{(void**)&pfn_nvmlInit, "nvmlInit"},
+                                                 {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
+                                                 {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
+                                                 {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
+                                                 {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
+                                                 {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
+                                                 {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
+                                                 {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
+                                                 {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
+                                                 {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
+                                                 {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
+                                                 {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
+                                                 {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
+                                                 {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
+                                                 {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}};
+        for(Symbol sym : symbols) {
+            *sym.ppfn = dlsym(libhandle, sym.name);
+        }
+
+        // The entry points below are called unconditionally further down; refuse to go on
+        // if any of them is missing rather than calling through a null pointer.
+        const bool haveInitPair = (pfn_nvmlInit_v2 != nullptr) ? (pfn_nvmlDeviceGetCount_v2 != nullptr)
+                                                               : (pfn_nvmlInit != nullptr && pfn_nvmlDeviceGetCount != nullptr);
+        if(!haveInitPair || pfn_nvmlErrorString == nullptr || pfn_nvmlDeviceGetHandleByIndex == nullptr ||
+           pfn_nvmlDeviceGetCudaComputeCapability == nullptr || pfn_nvmlDeviceGetP2PStatus == nullptr) {
+            WARN("Failed to resolve required NVML symbols in libnvidia-ml.so.1");
+            initResult = scclSystemError;
+            return initResult;
+        }
+    }
+#endif
+
+#if SCCL_NVML_DIRECT
+    bool have_v2 = true;
+#else
+    bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the SCCL_NVML_DIRECT=1 case then GCC warns about it never being null
+#endif
+    nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
+    if(res1 != NVML_SUCCESS) {
+        WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
+        initResult = scclSystemError;
+        return initResult;
+    }
+
+    unsigned int ndev;
+    res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
+    if(res1 != NVML_SUCCESS) {
+        WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
+        initResult = scclSystemError;
+        return initResult;
+    }
+
+    scclNvmlDeviceCount = int(ndev);
+    if(scclNvmlMaxDevices < scclNvmlDeviceCount) {
+        WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (scclNvmlMaxDevices=%d)", scclNvmlDeviceCount, scclNvmlMaxDevices);
+        initResult = scclInternalError;
+        return initResult;
+    }
+
+    // Cache the handle and compute capability of every device.
+    for(int a = 0; a < scclNvmlDeviceCount; a++) {
+        res1 = pfn_nvmlDeviceGetHandleByIndex(a, &scclNvmlDevices[a].handle);
+        if(res1 != NVML_SUCCESS) {
+            WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+            initResult = scclSystemError;
+            return initResult;
+        }
+
+        res1 = pfn_nvmlDeviceGetCudaComputeCapability(
+            scclNvmlDevices[a].handle, &scclNvmlDevices[a].computeCapabilityMajor, &scclNvmlDevices[a].computeCapabilityMinor);
+        if(res1 != NVML_SUCCESS) {
+            WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+            initResult = scclSystemError;
+            return initResult;
+        }
+    }
+
+    // Cache the read and write P2P status of every ordered device pair.
+    for(int a = 0; a < scclNvmlDeviceCount; a++) {
+        for(int b = 0; b < scclNvmlDeviceCount; b++) {
+            nvmlDevice_t da = scclNvmlDevices[a].handle;
+            nvmlDevice_t db = scclNvmlDevices[b].handle;
+
+            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &scclNvmlDevicePairs[a][b].p2pStatusRead);
+            if(res1 != NVML_SUCCESS) {
+                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
+                initResult = scclSystemError;
+                return initResult;
+            }
+
+            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &scclNvmlDevicePairs[a][b].p2pStatusWrite);
+            if(res1 != NVML_SUCCESS) {
+                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1));
+                initResult = scclSystemError;
+                return initResult;
+            }
+        }
+    }
+
+    initResult = scclSuccess;
+    return initResult;
+}
+
+// NVMLCHECK(name, args...): call pfn_<name>(args...); on any NVML failure emit a WARN
+// and return scclSystemError from the enclosing function.
+#define NVMLCHECK(name, ...)                                             \
+    do {                                                                 \
+        nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__);                \
+        if(e44241808 != NVML_SUCCESS) {                                  \
+            WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+            return scclSystemError;                                      \
+        }                                                                \
+    } while(0)
+
+// NVMLTRY(name, args...): like NVMLCHECK but quieter. Returns scclInternalError without
+// logging when the symbol was not resolved (dynamic mode only), and logs failures at INFO
+// level, skipping the log entirely for NVML_ERROR_NOT_SUPPORTED, before returning
+// scclSystemError from the enclosing function.
+#define NVMLTRY(name, ...)                                                                  \
+    do {                                                                                    \
+        if(!SCCL_NVML_DIRECT && pfn_##name == nullptr)                                      \
+            return scclInternalError; /* missing symbol is not a warned error */            \
+        nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__);                                   \
+        if(e44241808 != NVML_SUCCESS) {                                                     \
+            if(e44241808 != NVML_ERROR_NOT_SUPPORTED)                                       \
+                INFO(SCCL_LOG_TOPO, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+            return scclSystemError;                                                         \
+        }                                                                                   \
+    } while(0)
+
+// Looks up an NVML device handle by PCI bus id. Ensures NVML is initialized, then calls
+// nvmlDeviceGetHandleByPciBusId while holding the module mutex; NVMLCHECK returns
+// scclSystemError if NVML reports a failure.
+scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+    std::lock_guard<std::mutex> locked(lock);
+    NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
+    return scclSuccess;
+}
+
+// Returns the cached NVML handle of device `index` from the table filled at initialization.
+// Returns scclInvalidArgument when `index` is not below scclNvmlDeviceCount, instead of
+// reading outside the populated part of scclNvmlDevices.
+scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+    if(index >= (unsigned int)scclNvmlDeviceCount)
+        return scclInvalidArgument;
+    *device = scclNvmlDevices[index].handle;
+    return scclSuccess;
+}
+
+// Finds the position of `device` in the cached device table and stores it in *index.
+// Returns scclInvalidArgument when the handle is not in the table.
+scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+    int slot = 0;
+    // Advance to the first entry whose handle matches.
+    while(slot < scclNvmlDeviceCount && !(scclNvmlDevices[slot].handle == device))
+        slot++;
+    if(slot >= scclNvmlDeviceCount)
+        return scclInvalidArgument;
+    *index = slot;
+    return scclSuccess;
+}
+
+// Queries the state of NVLink `link` on `device` through NVML, under the module mutex.
+// NVMLTRY returns scclInternalError if the symbol is unavailable and scclSystemError if
+// the NVML call fails.
+scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+    std::lock_guard<std::mutex> locked(lock);
+    NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
+    return scclSuccess;
+}
+
+// Queries the PCI info of the remote end of NVLink `link` on `device` through NVML, under
+// the module mutex. NVMLTRY returns scclInternalError if the symbol is unavailable and
+// scclSystemError if the NVML call fails.
+scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+    std::lock_guard<std::mutex> locked(lock);
+    NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
+    return scclSuccess;
+}
+
+// Queries one NVLink capability of `link` on `device` through NVML, under the module mutex.
+// NVMLTRY returns scclInternalError if the symbol is unavailable and scclSystemError if
+// the NVML call fails.
+scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+    std::lock_guard<std::mutex> locked(lock);
+    NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
+    return scclSuccess;
+}
+
+// Returns the compute capability cached for `device` at initialization time.
+// Returns scclInvalidArgument when the handle is not in the cached device table.
+scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+
+    for(int slot = 0; slot < scclNvmlDeviceCount; slot++) {
+        const scclNvmlDeviceInfo& entry = scclNvmlDevices[slot];
+        if(!(device == entry.handle))
+            continue;
+        *major = entry.computeCapabilityMajor;
+        *minor = entry.computeCapabilityMinor;
+        return scclSuccess;
+    }
+    return scclInvalidArgument;
+}
+
+// Reports the P2P status between two devices.
+// READ and WRITE capabilities are answered from the table cached at initialization
+// (scclInvalidArgument if either handle is unknown); every other capability index is
+// forwarded to NVML under the module mutex.
+scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+
+    const bool wantsRead  = (p2pIndex == NVML_P2P_CAPS_INDEX_READ);
+    const bool wantsWrite = (p2pIndex == NVML_P2P_CAPS_INDEX_WRITE);
+
+    // Uncached capability: ask NVML directly.
+    if(!wantsRead && !wantsWrite) {
+        std::lock_guard<std::mutex> locked(lock);
+        NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
+        return scclSuccess;
+    }
+
+    // Locate both handles in the cached table; the scan is not cut short, so the last match wins.
+    int row = -1, col = -1;
+    for(int slot = 0; slot < scclNvmlDeviceCount; slot++) {
+        if(device1 == scclNvmlDevices[slot].handle)
+            row = slot;
+        if(device2 == scclNvmlDevices[slot].handle)
+            col = slot;
+    }
+    if(row == -1 || col == -1)
+        return scclInvalidArgument;
+
+    if(wantsRead)
+        *p2pStatus = scclNvmlDevicePairs[row][col].p2pStatusRead;
+    else
+        *p2pStatus = scclNvmlDevicePairs[row][col].p2pStatusWrite;
+    return scclSuccess;
+}
+
+// Forwards a field-value query for `device` to NVML under the module mutex.
+// NVMLTRY returns scclInternalError if the symbol is unavailable and scclSystemError if
+// the NVML call fails.
+scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) {
+    SCCLCHECK(scclNvmlEnsureInitialized());
+    std::lock_guard<std::mutex> locked(lock);
+    NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
+    return scclSuccess;
+}
+
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/topo_bak/nvmlwrap.h
+++ b/src/hardware/topo_bak/nvmlwrap.h
+/*************************************************************************
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef SCCL_NVMLWRAP_H_
+#define SCCL_NVMLWRAP_H_
+
+#include "check.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+
+// SCCL_NVML_DIRECT selects how NVML is reached: 1 includes nvml.h, 0 (the default when the
+// build does not define it) uses the declarations extracted below and run-time symbol lookup.
+// #define SCCL_NVML_DIRECT 1
+#ifndef SCCL_NVML_DIRECT
+#define SCCL_NVML_DIRECT 0
+#endif
+
+#if SCCL_NVML_DIRECT
+#include "nvml.h"
+#else
+// Dynamically handle dependencies on NVML
+
+/* Extracted from nvml.h */
+// Device handle type; the pointed-to struct is never defined here.
+typedef struct nvmlDevice_st* nvmlDevice_t;
+// Size of the busId character buffer in nvmlPciInfo_t.
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
+
+// Enabled/disabled state; used here as the output of the NVLink state query.
+typedef enum nvmlEnableState_enum {
+    NVML_FEATURE_DISABLED = 0, //!< Feature disabled
+    NVML_FEATURE_ENABLED  = 1  //!< Feature enabled
+} nvmlEnableState_t;
+
+// NVLink capabilities that can be queried per link.
+typedef enum nvmlNvLinkCapability_enum {
+    NVML_NVLINK_CAP_P2P_SUPPORTED  = 0, // P2P over NVLink is supported
+    NVML_NVLINK_CAP_SYSMEM_ACCESS  = 1, // Access to system memory is supported
+    NVML_NVLINK_CAP_P2P_ATOMICS    = 2, // P2P atomics are supported
+    NVML_NVLINK_CAP_SYSMEM_ATOMICS = 3, // System memory atomics are supported
+    NVML_NVLINK_CAP_SLI_BRIDGE     = 4, // SLI is supported over this link
+    NVML_NVLINK_CAP_VALID          = 5, // Link is supported on this device
+    // should be last
+    NVML_NVLINK_CAP_COUNT
+} nvmlNvLinkCapability_t;
+
+// Status codes returned by NVML calls; NVML_SUCCESS (0) is the only success value.
+typedef enum nvmlReturn_enum {
+    NVML_SUCCESS                       = 0,  //!< The operation was successful
+    NVML_ERROR_UNINITIALIZED           = 1,  //!< NVML was not first initialized with nvmlInit()
+    NVML_ERROR_INVALID_ARGUMENT        = 2,  //!< A supplied argument is invalid
+    NVML_ERROR_NOT_SUPPORTED           = 3,  //!< The requested operation is not available on target device
+    NVML_ERROR_NO_PERMISSION           = 4,  //!< The current user does not have permission for operation
+    NVML_ERROR_ALREADY_INITIALIZED     = 5,  //!< Deprecated: Multiple initializations are now allowed through ref counting
+    NVML_ERROR_NOT_FOUND               = 6,  //!< A query to find an object was unsuccessful
+    NVML_ERROR_INSUFFICIENT_SIZE       = 7,  //!< An input argument is not large enough
+    NVML_ERROR_INSUFFICIENT_POWER      = 8,  //!< A device's external power cables are not properly attached
+    NVML_ERROR_DRIVER_NOT_LOADED       = 9,  //!< NVIDIA driver is not loaded
+    NVML_ERROR_TIMEOUT                 = 10, //!< User provided timeout passed
+    NVML_ERROR_IRQ_ISSUE               = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
+    NVML_ERROR_LIBRARY_NOT_FOUND       = 12, //!< NVML Shared Library couldn't be found or loaded
+    NVML_ERROR_FUNCTION_NOT_FOUND      = 13, //!< Local version of NVML doesn't implement this function
+    NVML_ERROR_CORRUPTED_INFOROM       = 14, //!< infoROM is corrupted
+    NVML_ERROR_GPU_IS_LOST             = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
+    NVML_ERROR_RESET_REQUIRED          = 16, //!< The GPU requires a reset before it can be used again
+    NVML_ERROR_OPERATING_SYSTEM        = 17, //!< The GPU control device has been blocked by the operating system/cgroups
+    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
+    NVML_ERROR_IN_USE                  = 19, //!< An operation cannot be performed because the GPU is currently in use
+    NVML_ERROR_UNKNOWN                 = 999 //!< An internal driver error occurred
+} nvmlReturn_t;
+
+// PCI identification of a device; used here as the output of the NVLink remote-PCI query.
+typedef struct nvmlPciInfo_st {
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
+    unsigned int domain;                            //!< The PCI domain on which the device's bus resides, 0 to 0xffff
+    unsigned int bus;                               //!< The bus on which the device resides, 0 to 0xff
+    unsigned int device;                            //!< The device's id on the bus, 0 to 31
+    unsigned int pciDeviceId;                       //!< The combined 16-bit device id and 16-bit vendor id
+
+    // Added in NVML 2.285 API
+    unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
+
+    // NVIDIA reserved for internal use only
+    unsigned int reserved0;
+    unsigned int reserved1;
+    unsigned int reserved2;
+    unsigned int reserved3;
+} nvmlPciInfo_t;
+
+/* Status reported for one P2P capability index; values are consecutive starting at NVML_P2P_STATUS_OK = 0. */
+typedef enum nvmlGpuP2PStatus_enum {
+    NVML_P2P_STATUS_OK = 0,
+    NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, // NOTE(review): spelling ("SUPPORED") presumably mirrors upstream nvml.h — confirm before renaming
+    NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
+    NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
+    NVML_P2P_STATUS_DISABLED_BY_REGKEY,
+    NVML_P2P_STATUS_NOT_SUPPORTED,
+    NVML_P2P_STATUS_UNKNOWN
+} nvmlGpuP2PStatus_t;
+
+/* P2P capability index: selects which peer-to-peer capability a status query refers to (consecutive values from 0). */
+typedef enum nvmlGpuP2PCapsIndex_enum {
+    NVML_P2P_CAPS_INDEX_READ = 0,
+    NVML_P2P_CAPS_INDEX_WRITE,
+    NVML_P2P_CAPS_INDEX_NVLINK,
+    NVML_P2P_CAPS_INDEX_ATOMICS,
+    NVML_P2P_CAPS_INDEX_PROP,
+    NVML_P2P_CAPS_INDEX_UNKNOWN
+} nvmlGpuP2PCapsIndex_t;
+
+/**
+ * Type tag for a returned sample value; each tag names the nvmlValue_t member that holds the value.
+ */
+typedef enum nvmlValueType_enum {
+    NVML_VALUE_TYPE_DOUBLE             = 0,
+    NVML_VALUE_TYPE_UNSIGNED_INT       = 1,
+    NVML_VALUE_TYPE_UNSIGNED_LONG      = 2,
+    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
+    NVML_VALUE_TYPE_SIGNED_LONG_LONG   = 4,
+
+    // Keep this last: it evaluates to the number of value types listed above
+    NVML_VALUE_TYPE_COUNT
+} nvmlValueType_t;
+
+/**
+ * Union holding one sample value; only one member is meaningful at a time
+ * (nvmlFieldValue_t pairs it with a valueType field describing which one).
+ */
+typedef union nvmlValue_st {
+    double dVal;               //!< If the value is double
+    unsigned int uiVal;        //!< If the value is unsigned int
+    unsigned long ulVal;       //!< If the value is unsigned long
+    unsigned long long ullVal; //!< If the value is unsigned long long
+    signed long long sllVal;   //!< If the value is signed long long
+} nvmlValue_t;
+
+/**
+ * Field Identifiers.
+ *
+ * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
+ */
+
+/* NVLink Speed */
+#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90       //!< Common NVLink Speed in MBps for active links
+#define NVML_FI_DEV_NVLINK_LINK_COUNT 91              //!< Number of NVLinks present on the device
+
+/**
+ * Remote device NVLink ID
+ *
+ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
+ */
+#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146       //!< Remote device NVLink ID
+
+/**
+ * NVSwitch: connected NVLink count
+ */
+#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch
+
+#define NVML_FI_DEV_NVLINK_GET_SPEED 164   //!< NOTE(review): undocumented here; presumably a per-link speed query — confirm against nvml.h
+#define NVML_FI_DEV_NVLINK_GET_STATE 165   //!< NOTE(review): undocumented here; presumably a per-link state query — confirm against nvml.h
+#define NVML_FI_DEV_NVLINK_GET_VERSION 166 //!< NOTE(review): undocumented here; presumably a per-link version query — confirm against nvml.h
+#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
+
+/**
+ * Information for a Field Value Sample.
+ *
+ * The caller fills in fieldId (and scopeId where the field needs one); the remaining
+ * members describe the sample returned for that field.
+ */
+typedef struct nvmlFieldValue_st {
+    unsigned int
+        fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
+    unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId
+                          //!< can represent linkId.
+    long long timestamp;  //!< CPU Timestamp of this value in microseconds since 1970
+    long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by
+                           //!< the same driver call.
+    nvmlValueType_t valueType; //!< Type of the value stored in value
+    nvmlReturn_t nvmlReturn;   //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn !=
+                               //!< NVML_SUCCESS
+    nvmlValue_t value;         //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
+} nvmlFieldValue_t;
+
+/* End of nvml.h */
+#endif                  // SCCL_NVML_DIRECT
+
+// Capacity of the global device tables declared below.
+constexpr int scclNvmlMaxDevices = 32;
+// Per-device entry: the NVML handle plus the device's compute capability (major/minor).
+struct scclNvmlDeviceInfo {
+    nvmlDevice_t handle;
+    int computeCapabilityMajor, computeCapabilityMinor;
+};
+// Per-device-pair entry: P2P status for the read and write capability indices.
+struct scclNvmlDevicePairInfo {
+    nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite;
+};
+// Global tables, defined in another translation unit.
+// NOTE(review): presumably scclNvmlDeviceCount is the number of valid entries — confirm in the implementation.
+extern int scclNvmlDeviceCount;
+extern scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
+extern scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];
+
+// All scclNvmlFoo() functions call scclNvmlEnsureInitialized() implicitly.
+// Outsiders need only call it if they want to inspect the scclNvml global
+// tables above.
+scclResult_t scclNvmlEnsureInitialized();
+
+// SCCL-side entry points, each returning scclResult_t and writing its result through the
+// trailing pointer argument(s).
+// NOTE(review): presumably each wraps the NVML call of the same name — confirm in the implementation.
+scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device);
+scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive);
+scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci);
+scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult);
+scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
+scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
+scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values);
+
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
+
+#endif // End include guard
--- a/src/hardware/topo_bak/rocm_smi_wrap.cc
+++ b/src/hardware/topo_bak/rocm_smi_wrap.cc
+#include "rocm_smi_wrap.h"
+
+namespace sccl {
+
+// Evaluate a ROCm SMI call exactly once. On any status other than RSMI_STATUS_SUCCESS,
+// log the library's description of that status and make the ENCLOSING function return
+// scclInternalError, so this is only usable inside functions returning scclResult_t.
+//
+// The argument is parenthesized, and the macro's locals carry a distinctive suffix so an
+// expression passed as `cmd` can mention names such as `ret` or `err` without being
+// captured. The message pointer falls back to a fixed string when the description lookup
+// fails, so WARN never receives an uninitialized or NULL `%s` argument.
+#define ROCMSMICHECK(cmd)                                                           \
+    do {                                                                            \
+        rsmi_status_t rsmiStatus_ = (cmd);                                          \
+        if(rsmiStatus_ != RSMI_STATUS_SUCCESS) {                                    \
+            const char* rsmiMsg_ = NULL;                                            \
+            if(rsmi_status_string(rsmiStatus_, &rsmiMsg_) != RSMI_STATUS_SUCCESS || \
+               rsmiMsg_ == NULL)                                                    \
+                rsmiMsg_ = "unknown error";                                         \
+            WARN("ROCm SMI init failure %s", rsmiMsg_);                             \
+            return scclInternalError;                                               \
+        }                                                                           \
+    } while(false)
+
+/**
+ * Initialize the ROCm SMI library and log its version.
+ *
+ * Calls rsmi_init(0), then queries the library version and writes it to the
+ * SCCL_LOG_TOPO log as "major.minor.patch.build".
+ *
+ * @return scclSuccess when both calls succeed; scclInternalError (via ROCMSMICHECK)
+ *         as soon as either ROCm SMI call reports a failure.
+ */
+scclResult_t rocm_smi_init() {
+    rsmi_version_t libVersion;
+
+    ROCMSMICHECK(rsmi_init(0));
+    ROCMSMICHECK(rsmi_version_get(&libVersion));
+
+    INFO(SCCL_LOG_TOPO, "rocm_smi_lib: version %d.%d.%d.%s", libVersion.major, libVersion.minor, libVersion.patch, libVersion.build);
+    return scclSuccess;
+}
+
+/**
+ * Query how many devices ROCm SMI monitors.
+ *
+ * @param num_devs Output: receives the count written by rsmi_num_monitor_devices()
+ * @return scclSuccess on success; scclInternalError (via ROCMSMICHECK) if the ROCm SMI call fails
+ */
+scclResult_t rocm_smi_getNumDevice(uint32_t* num_devs) {
+    ROCMSMICHECK(rsmi_num_monitor_devices(num_devs));
+    return scclSuccess;
+}
+
+/**
+ * Write the PCI bus ID of a ROCm SMI device into a caller-supplied buffer, formatted as
+ * "domain:bus:device.function" in hexadecimal (e.g. "0000:03:00.0").
+ *
+ * The previous implementation called printf() with `busId` as the format string, so the
+ * output buffer was never written and its uninitialized contents were interpreted as a
+ * format. The text is now produced with snprintf(), bounded by `len`.
+ *
+ * @param deviceIndex ROCm SMI device index
+ * @param busId       Output buffer receiving the NUL-terminated string
+ * @param len         Size of `busId` in bytes
+ * @return scclSuccess on success; scclInternalError if the buffer is NULL/empty, the
+ *         ROCm SMI query fails, or the formatted ID does not fit in `len` bytes
+ */
+scclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* busId, size_t len) {
+    if(busId == NULL || len == 0) {
+        WARN("rocm_smi_lib: invalid bus ID buffer for device %u", deviceIndex);
+        return scclInternalError;
+    }
+
+    uint64_t id;
+    ROCMSMICHECK(rsmi_dev_pci_id_get(deviceIndex, &id));
+    /** rocm_smi's bus ID format
+     *  | Name     | Field   |
+     *  ---------- | ------- |
+     *  | Domain   | [64:32] |
+     *  | Reserved | [31:16] |
+     *  | Bus      | [15: 8] |
+     *  | Device   | [ 7: 3] |
+     *  | Function | [ 2: 0] |
+     **/
+    // Cast to unsigned long long so the arguments match %llx regardless of how uint64_t is defined.
+    const unsigned long long domain   = (unsigned long long)(id >> 32);
+    const unsigned long long bus      = (unsigned long long)((id & 0xff00) >> 8);
+    const unsigned long long device   = (unsigned long long)((id & 0xf8) >> 3);
+    const unsigned long long function = (unsigned long long)(id & 0x7);
+
+    const int written = snprintf(busId, len, "%04llx:%02llx:%02llx.%01llx", domain, bus, device, function);
+    if(written < 0 || (size_t)written >= len) {
+        // A truncated bus ID would silently name a different (or no) device.
+        WARN("rocm_smi_lib: bus ID buffer too small for device %u", deviceIndex);
+        return scclInternalError;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Find the ROCm SMI device index whose PCI ID matches a textual PCI bus ID.
+ *
+ * The string is parsed with busIdToInt64(), repacked into ROCm SMI's 64-bit layout, and
+ * compared against rsmi_dev_pci_id_get() for every monitored device.
+ *
+ * The parse result is now checked: previously a failed parse left `busid` uninitialized
+ * and the search compared devices against garbage.
+ *
+ * @param pciBusId    PCI bus ID string to look up
+ * @param deviceIndex Output: index of the matching device (written only on success)
+ * @return scclSuccess if a device matches; scclInternalError if the string cannot be
+ *         parsed, a ROCm SMI call fails, or no device matches
+ */
+scclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) {
+    int64_t busid = 0;
+    if(busIdToInt64(pciBusId, &busid) != scclSuccess) {
+        WARN("rocm_smi_lib: could not parse PCI bus ID %s", pciBusId);
+        return scclInternalError;
+    }
+    /** convert to rocm_smi's bus ID format
+     *  | Name     | Field   |
+     *  ---------- | ------- |
+     *  | Domain   | [64:32] |
+     *  | Reserved | [31:16] |
+     *  | Bus      | [15: 8] |
+     *  | Device   | [ 7: 3] |
+     *  | Function | [ 2: 0] |
+     **/
+    busid = ((busid & 0xffff00000L) << 12) + ((busid & 0xff000L) >> 4) + ((busid & 0xff0L) >> 1) + (busid & 0x7L);
+    // ROCm SMI reports IDs as uint64_t; convert once so the comparison below is unsigned on both sides.
+    const uint64_t wanted = (uint64_t)busid;
+
+    uint32_t num_devs = 0;
+    ROCMSMICHECK(rsmi_num_monitor_devices(&num_devs));
+    for(uint32_t i = 0; i < num_devs; i++) {
+        uint64_t bdfid;
+        ROCMSMICHECK(rsmi_dev_pci_id_get(i, &bdfid));
+        if(bdfid == wanted) {
+            *deviceIndex = i;
+            return scclSuccess;
+        }
+    }
+
+    WARN("rocm_smi_lib: %s device index not found", pciBusId);
+    return scclInternalError;
+}
+
+/**
+ * Query the link between two ROCm devices.
+ *
+ * @param srcIndex  Source device index
+ * @param dstIndex  Destination device index
+ * @param rsmi_type [out] Link type reported by rsmi_topo_get_link_type()
+ * @param hops      [out] 2 by default; 1 when the link type is XGMI and its weight is 15
+ * @param count     [out] 1 by default; for the XGMI/weight-15 case, max_bw / min_bw when
+ *                  both bandwidths are reported as non-zero
+ *
+ * @return scclSuccess on success; scclInternalError (via ROCMSMICHECK) if a ROCm SMI call fails
+ *
+ * @note The min/max bandwidth query is only issued when the ROCm SMI major version is >= 2.
+ */
+scclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE* rsmi_type, int* hops, int* count) {
+    uint64_t linkHops;
+    uint64_t linkWeight;
+    ROCMSMICHECK(rsmi_topo_get_link_type(srcIndex, dstIndex, &linkHops, rsmi_type));
+    ROCMSMICHECK(rsmi_topo_get_link_weight(srcIndex, dstIndex, &linkWeight));
+
+    // Defaults, kept for every link that is not XGMI with weight 15.
+    *hops  = 2;
+    *count = 1;
+    const bool xgmiWeight15 = (*rsmi_type == RSMI_IOLINK_TYPE_XGMI) && (linkWeight == 15);
+    if(!xgmiWeight15) {
+        return scclSuccess;
+    }
+
+    *hops = 1;
+// #if defined USE_ROCM_SMI64CONFIG && rocm_smi_VERSION_MAJOR >= 2
+#if 1
+    uint64_t bwMin = 0;
+    uint64_t bwMax = 0;
+    rsmi_version_t libVersion;
+    ROCMSMICHECK(rsmi_version_get(&libVersion));
+    if(libVersion.major >= 2) {
+        ROCMSMICHECK(rsmi_minmax_bandwidth_get(srcIndex, dstIndex, &bwMin, &bwMax));
+    }
+    // Both bandwidths stay 0 when the query above was skipped, leaving *count at 1.
+    if(bwMax && bwMin) {
+        *count = bwMax / bwMin;
+    }
+
+    INFO(SCCL_LOG_GRAPH, "rocm smi srcIndex:%d dstIndex:%d min_bw:%ld max_bw:%ld count:%d", srcIndex, dstIndex, bwMin, bwMax, *count);
+#endif
+    return scclSuccess;
+}
+
+} // namespace sccl
--- a/src/hardware/topo_bak/rocm_smi_wrap.h
+++ b/src/hardware/topo_bak/rocm_smi_wrap.h
+/*
+Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ROCM_SMI_WRAP_H_
+#define ROCM_SMI_WRAP_H_
+
+#include "rocm_smi/rocm_smi.h"
+#ifdef USE_ROCM_SMI64CONFIG
+#include "rocm_smi/rocm_smi64Config.h"
+#endif
+#include "base.h"
+
+namespace sccl {
+
+// Initialize the ROCm SMI library.
+scclResult_t rocm_smi_init();
+
+// Get the number of devices.
+scclResult_t rocm_smi_getNumDevice(uint32_t* num_devs);
+
+// Get the PCI bus ID string of the device with the given index.
+scclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
+
+// Get the device index matching a PCI bus ID string.
+scclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
+
+// Get link information between two devices: link type, hop count and link count.
+scclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int* hops, int* count);
+
+} // namespace sccl
+#endif
--- a/src/hardware/topo_bak/topo.cc
+++ b/src/hardware/topo_bak/topo.cc
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <string.h>
+#include <unistd.h>
+#include <algorithm>
+
+#include "topo.h"
+#include "utils.h"
+#include "cpuset.h"
+#include "nvmlwrap.h"
+// #include "net.h"
+// #include "graph.h"
+// #include "comm.h"
+// #include "net.h"
+// #include "coll_net.h"
+// #include "cpuset.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace topo {
+
+// Display names for node, link and path types; empty strings are unnamed slots.
+// NOTE(review): presumably indexed by the corresponding type constants (GPU, LINK_*, PATH_*),
+// so entry order must follow those definitions — confirm in topo.h.
+const char* topoNodeTypeStr[] = {"GPU", "PCI", "NVS", "CPU", "NIC", "NET"};
+const char* topoLinkTypeStr[] = {"LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET"};
+const char* topoPathTypeStr[] = {"LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS"};
+
+namespace topo_basic {
+
+// Maps a PCI "class" attribute string to a topology node type; the NULL-keyed last entry
+// carries the fallback value (PCI).
+// NOTE(review): short keys such as "0x03" presumably match by prefix inside kvConvertToInt — confirm.
+struct kvDict kvDictPciClass[] = {{"0x060400", PCI},
+                                  {"0x068000", NVS},
+                                  {"0x068001", CPU},
+                                  {"0x03", GPU},
+                                  {"0x02", NIC},
+                                  {"0x120000", GPU},
+                                  {"0x0b4000", GPU},
+                                  {NULL, PCI /* Default fallback value */}};
+// Maps a PCI "link_speed" attribute string to a per-lane speed in units of 100 Mbps;
+// both the short spellings and the "... PCIe" spellings are listed.
+struct kvDict kvDictPciGen[]   = {{"2.5 GT/s", 15},
+                                  {"5 GT/s", 30},
+                                  {"8 GT/s", 60},
+                                  {"16 GT/s", 120},
+                                  {"32 GT/s", 240}, /* Kernel 5.6 and earlier */
+                                  {"2.5 GT/s PCIe", 15},
+                                  {"5.0 GT/s PCIe", 30},
+                                  {"8.0 GT/s PCIe", 60},
+                                  {"16.0 GT/s PCIe", 120},
+                                  {"32.0 GT/s PCIe", 240},
+                                  {"64.0 GT/s PCIe", 480},
+                                  {NULL, 60 /* Default fallback */}}; // x100 Mbps per lane
+
+// Parameter TopoDumpFileRank ("TOPO_DUMP_FILE_RANK"): selects the rank for the topology dump file; default 0.
+SCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
+
+// Parameter IgnoreCpuAffinity ("IGNORE_CPU_AFFINITY"): whether to ignore CPU affinity; default 0 (do not ignore).
+SCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * @brief Create a NET node from a <net> XML element and attach it to its NIC.
+ *
+ * Reads the "dev", "guid", "speed", "latency", "port", "gdr" and "coll" attributes,
+ * fills the new node, then links NIC and NET in both directions with LINK_NET.
+ *
+ * @param xmlNet XML element describing the network device
+ * @param system Topology system that receives the node
+ * @param nic    NIC node the network device hangs off
+ * @param busId  Bus id stored in the new node
+ * @return scclSuccess, or the first error propagated by SCCLCHECK
+ */
+scclResult_t scclTopoAddNet(struct scclXmlNode* xmlNet, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
+    // "dev" is read without a default and becomes the id of the new NET node.
+    int devId;
+    SCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &devId));
+    struct scclTopoNode* netNode;
+    SCCLCHECK(scclTopoCreateNode(system, &netNode, NET, devId));
+
+    // Without a "guid" attribute the device id stands in for the ASIC identifier.
+    const char* guidText;
+    SCCLCHECK(xmlGetAttr(xmlNet, "guid", &guidText));
+    if(guidText == NULL) {
+        netNode->net.asic = devId;
+    } else {
+        sscanf(guidText, "0x%lx", &netNode->net.asic);
+    }
+
+    int speedMbps;
+    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &speedMbps, 0));
+    if(speedMbps <= 0) {
+        speedMbps = 10000; // Some NICs define speed = -1
+    }
+    netNode->net.bw = speedMbps / 8000.0;
+
+    // A failed latency lookup is tolerated and replaced by 0.
+    if(xmlGetAttrFloat(xmlNet, "latency", &netNode->net.latency) != scclSuccess) {
+        netNode->net.latency = 0;
+    }
+    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &netNode->net.port, 0));
+    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &netNode->net.gdrSupport, 0));
+    // SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
+    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &netNode->net.collSupport, 0));
+    netNode->net.busId = busId;
+
+    // Link both directions with the same bandwidth.
+    SCCLCHECK(scclTopoConnectNodes(nic, netNode, LINK_NET, netNode->net.bw));
+    SCCLCHECK(scclTopoConnectNodes(netNode, nic, LINK_NET, netNode->net.bw));
+    return scclSuccess;
+}
+
+/**
+ * @brief Add every usable <net> child of a <nic> XML element to the topology.
+ *
+ * Children not named "net" are ignored, as are "net" children for which
+ * xmlGetAttrIndex() reports index -1 for the "dev" attribute.
+ *
+ * @param xmlNic XML element of the NIC
+ * @param system Topology system that receives the nodes
+ * @param nic    NIC node the network devices are attached to
+ * @param busId  Bus id forwarded to scclTopoAddNet()
+ * @return scclSuccess, or the first error propagated by SCCLCHECK
+ */
+scclResult_t scclTopoAddNic(struct scclXmlNode* xmlNic, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
+    for(int sub = 0; sub < xmlNic->nSubs; sub++) {
+        struct scclXmlNode* child = xmlNic->subs[sub];
+        if(strcmp(child->name, "net") == 0) {
+            int devAttrIndex;
+            SCCLCHECK(xmlGetAttrIndex(child, "dev", &devAttrIndex));
+            if(devAttrIndex != -1) {
+                SCCLCHECK(scclTopoAddNet(child, system, nic, busId));
+            }
+        }
+    }
+    return scclSuccess;
+}
+
+/**
+ * @brief Fill a GPU topology node from its <gpu> XML element.
+ *
+ * Attributes read, in this order:
+ * - "sm"   -> gpu.cudaCompCap
+ * - "gcn"  -> converted with convertGcnArchToGcnArchName() and stored as a strdup() copy
+ * - "arch" -> read as an int and copied into gpu.arch
+ * - "rank" -> gpu.rank
+ * - "dev"  -> gpu.dev
+ * - "gdr"  -> gpu.gdrSupport
+ *
+ * @param xmlGpu XML element describing the GPU
+ * @param system Target topology system (not used by this function)
+ * @param gpu    GPU node to fill
+ * @return scclSuccess, or the first error propagated by SCCLCHECK
+ *
+ * @note Only the basic attributes are handled here; links are added in a later pass.
+ */
+scclResult_t scclTopoAddGpu(struct scclXmlNode* xmlGpu, struct scclTopoSystem* system, struct scclTopoNode* gpu) {
+    SCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
+
+    // The node keeps its own heap copy of the converted architecture name.
+    const char* gcnAttr;
+    SCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gcnAttr));
+    const char* gcnName;
+    convertGcnArchToGcnArchName(gcnAttr, &gcnName);
+    gpu->gpu.gcn = strdup(gcnName);
+
+    // The architecture arrives as a plain int and is copied out through the helper type's other member.
+    scclHipDeviceArch_t archBits;
+    SCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &archBits.value));
+    memcpy(&gpu->gpu.arch, &archBits.arch, sizeof(hipDeviceArch_t));
+
+    SCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
+    SCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
+    SCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
+    // Do not go any further, nvlinks will be added in a second pass
+    return scclSuccess;
+}
+
+/**
+ * @brief Add a PCI device described by an XML element to the topology system.
+ *
+ * The node type comes from the "class" attribute (via kvDictPciClass) and is overridden
+ * to GPU or NIC when the element has a <gpu> or <nic> child:
+ * - GPU: a GPU node is created and filled by scclTopoAddGpu(), unless the <gpu> element's
+ *   "rank" attribute index is -1, in which case the function returns without adding anything.
+ * - NIC: the low 4 bits of the bus id are cleared so multi-port NICs share one node; the
+ *   node is created only if none exists yet for that bus id.
+ * - plain PCI: a PCI node is created, vendor/device/subsystem ids are packed into
+ *   pci.device, and every child element is added recursively.
+ * Any node created here is finally linked to `parent` in both directions with LINK_PCI.
+ *
+ * @param xmlPci XML element describing the PCI device
+ * @param system Target topology system
+ * @param parent Parent topology node
+ * @return scclResult_t scclSuccess, or the first error propagated by SCCLCHECK
+ */
+scclResult_t scclTopoAddPci(struct scclXmlNode* xmlPci, struct scclTopoSystem* system, struct scclTopoNode* parent) {
+    const char* str;
+
+    // Node type as derived from the PCI class; may be overridden below.
+    int type;
+    SCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
+    SCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));
+
+    int64_t busId;
+    SCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
+    SCCLCHECK(busIdToInt64(str, &busId));
+
+    // `node` stays NULL unless this call creates a node that must be linked to `parent`.
+    struct scclTopoNode* node  = NULL;
+    struct scclXmlNode* xmlGpu = NULL;
+    SCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
+    if(xmlGpu != NULL) {
+        type = GPU;
+        int index;
+        SCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
+        // No "rank" attribute on the GPU element: skip this device entirely.
+        if(index == -1)
+            return scclSuccess;
+        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
+        SCCLCHECK(scclTopoAddGpu(xmlGpu, system, node));
+    }
+    struct scclXmlNode* xmlNic = NULL;
+    SCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
+    if(xmlNic != NULL) {
+        type = NIC;
+        // Ignore sub device ID and merge multi-port NICs into one PCI device.
+        busId &= 0xfffffffffffffff0;
+        struct scclTopoNode* nicNode = NULL;
+        SCCLCHECK(scclTopoGetNode(system, &nicNode, type, busId));
+        if(nicNode == NULL) {
+            SCCLCHECK(scclTopoCreateNode(system, &nicNode, type, busId));
+            node = nicNode; // Connect it to parent later on
+        }
+        SCCLCHECK(scclTopoAddNic(xmlNic, system, nicNode, busId));
+    } else if(type == PCI) {
+        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
+        // Pack the four 16-bit ids into pci.device: vendor | device | subsystem_vendor | subsystem_device.
+        SCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
+        if(str)
+            node->pci.device += strtol(str, NULL, 0) << 48;
+        SCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
+        if(str)
+            node->pci.device += strtol(str, NULL, 0) << 32;
+        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
+        if(str)
+            node->pci.device += strtol(str, NULL, 0) << 16;
+        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
+        if(str)
+            node->pci.device += strtol(str, NULL, 0);
+
+        // Recurse into every child element, with this node as their parent.
+        for(int s = 0; s < xmlPci->nSubs; s++) {
+            struct scclXmlNode* xmlSubPci = xmlPci->subs[s];
+            SCCLCHECK(scclTopoAddPci(xmlSubPci, system, node));
+        }
+    }
+
+    if(node) {
+        int width, speed;
+        SCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
+        SCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
+
+        // Manage cases where speed was not indicated in /sys
+        if(width == 0)
+            width = 16;
+        SCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
+
+        // lanes * (100 Mbps units per lane) / 80 gives the link bandwidth used for both directions.
+        SCCLCHECK(scclTopoConnectNodes(node, parent, LINK_PCI, width * speed / 80.0));
+        SCCLCHECK(scclTopoConnectNodes(parent, node, LINK_PCI, width * speed / 80.0));
+    }
+    return scclSuccess;
+}
+
+// Maps the CPU "arch" and "vendor" attribute strings to SCCL_TOPO_CPU_* constants.
+// Each table ends with a NULL-keyed entry carrying the value 0; two vendor strings
+// ("CentaurHauls" and "  Shanghai  ") both map to the Zhaoxin vendor id.
+struct kvDict kvDictCpuArch[]   = {{"x86_64", SCCL_TOPO_CPU_ARCH_X86}, {"arm64", SCCL_TOPO_CPU_ARCH_ARM}, {"ppc64", SCCL_TOPO_CPU_ARCH_POWER}, {NULL, 0}};
+struct kvDict kvDictCpuVendor[] = {{"GenuineIntel", SCCL_TOPO_CPU_VENDOR_INTEL},
+                                   {"AuthenticAMD", SCCL_TOPO_CPU_VENDOR_AMD},
+                                   {"CentaurHauls", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
+                                   {"  Shanghai  ", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
+                                   {NULL, 0}};
+
+/**
+ * @brief Add a CPU (NUMA node) and everything below it to the topology system.
+ *
+ * Creates a CPU node whose id is the "numaid" attribute, records its optional affinity
+ * mask and its architecture, and for x86 also its vendor and a vendor-specific model.
+ * Afterwards every <pci> child is added through scclTopoAddPci() and every <nic> child
+ * is attached to a NIC node with id 0, created on first use and linked to this CPU.
+ *
+ * @param xmlCpu XML element describing the CPU
+ * @param system Target topology system
+ * @return scclResult_t scclSuccess, or the first error propagated by SCCLCHECK
+ */
+scclResult_t scclTopoAddCpu(struct scclXmlNode* xmlCpu, struct scclTopoSystem* system) {
+    // The NUMA id is used as the id of the CPU node.
+    int numaId;
+    SCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
+    struct scclTopoNode* cpu;
+    SCCLCHECK(scclTopoCreateNode(system, &cpu, CPU, numaId));
+
+    // The affinity mask is only parsed when the attribute is present.
+    const char* attr;
+    SCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &attr));
+    if(attr != NULL) {
+        SCCLCHECK(scclStrToCpuset(attr, &cpu->cpu.affinity));
+    }
+
+    SCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &attr));
+    SCCLCHECK(kvConvertToInt(attr, &cpu->cpu.arch, kvDictCpuArch));
+
+    // Vendor and model are only resolved for x86.
+    if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86) {
+        SCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &attr));
+        SCCLCHECK(kvConvertToInt(attr, &cpu->cpu.vendor, kvDictCpuVendor));
+
+        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
+            int family, model;
+            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
+            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
+            // Family 6 with model >= 0x55 is classified as SKL, everything else as BDW.
+            const bool isSkl = (family == 6) && (model >= 0x55);
+            if(isSkl) {
+                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_SKL;
+            } else {
+                cpu->cpu.model = SCCL_TOPO_CPU_INTEL_BDW;
+            }
+        } else if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+            int family, model;
+            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
+            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
+            // Only family 7 / model 0x5B sets a model; other Zhaoxin parts leave it untouched.
+            if(family == 7 && model == 0x5B) {
+                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_YONGFENG;
+            }
+        }
+
+        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_AMD) {
+            int family, model;
+            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
+            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
+            // Treat "Milan" as "Rome" as well.
+            const bool isRome = ((family == 143 && model >= 49) || family == 175);
+            if(isRome) {
+                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_ROME;
+            } else {
+                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_ZEN;
+            }
+        }
+    }
+
+    // Walk the children: PCI sub-trees hang off the CPU, NIC children share NIC node 0.
+    for(int i = 0; i < xmlCpu->nSubs; i++) {
+        struct scclXmlNode* child = xmlCpu->subs[i];
+        if(strcmp(child->name, "pci") == 0) {
+            SCCLCHECK(scclTopoAddPci(child, system, cpu));
+        }
+        if(strcmp(child->name, "nic") != 0) {
+            continue;
+        }
+
+        struct scclTopoNode* sharedNic = NULL;
+        SCCLCHECK(scclTopoGetNode(system, &sharedNic, NIC, 0));
+        if(sharedNic == NULL) {
+            // First NIC child seen: create the node and link it to this CPU in both directions.
+            SCCLCHECK(scclTopoCreateNode(system, &sharedNic, NIC, 0));
+            SCCLCHECK(scclTopoConnectNodes(cpu, sharedNic, LINK_PCI, LOC_BW));
+            SCCLCHECK(scclTopoConnectNodes(sharedNic, cpu, LINK_PCI, LOC_BW));
+        }
+        SCCLCHECK(scclTopoAddNic(child, system, sharedNic, 0));
+    }
+    return scclSuccess;
+}
+
+// scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
+//     char* str = path + offset;
+//     // Remove trailing "/"
+//     if(*str == '/')
+//         str--;
+//     // Find next /
+//     while(*str != '/')
+//         str--;
+//     str++;
+//     int64_t numid;
+//     SCCLCHECK(busIdToInt64(str, &numid));
+//     // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
+//     numid -= numid & 0xf;
+//     *id = numid;
+//     return scclSuccess;
+// }
+
+// Depth-first search for a CPU node reachable from `node` through LINK_PCI links.
+// *cpu is reset to NULL on entry and set to the first CPU node found; the function
+// returns scclSuccess whether or not a CPU was found, so callers must inspect *cpu.
+static scclResult_t findLocalCpu(struct scclTopoNode* node, struct scclTopoNode** cpu) {
+    *cpu = NULL;
+    if(node->type == CPU) {
+        *cpu = node;
+        return scclSuccess;
+    }
+
+    for(int i = 0; i < node->nlinks; i++) {
+        // Only PCI links are followed; every other link type is skipped.
+        if(node->links[i].type != LINK_PCI) {
+            continue;
+        }
+        SCCLCHECK(findLocalCpu(node->links[i].remNode, cpu));
+        if(*cpu != NULL) {
+            return scclSuccess;
+        }
+    }
+    return scclSuccess;
+}
+
+// Select the bandwidth used for links between CPU nodes, based on the CPU's
+// architecture and, for x86, its vendor and model:
+//   POWER -> P9_BW, ARM -> ARM_BW,
+//   x86 Intel   -> SKL_QPI_BW for the SKL model, QPI_BW otherwise,
+//   x86 Zhaoxin -> YONGFENG_ZPI_BW for the YONGFENG model, ZPI_BW otherwise,
+//   anything else -> LOC_BW.
+// Always returns scclSuccess.
+static scclResult_t scclTopoGetInterCpuBw(struct scclTopoNode* cpu, float* bw) {
+    *bw = LOC_BW;
+
+    if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_POWER) {
+        *bw = P9_BW;
+    } else if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_ARM) {
+        *bw = ARM_BW;
+    } else if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86) {
+        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
+            if(cpu->cpu.model == SCCL_TOPO_CPU_TYPE_SKL) {
+                *bw = SKL_QPI_BW;
+            } else {
+                *bw = QPI_BW;
+            }
+        }
+        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+            if(cpu->cpu.model == SCCL_TOPO_CPU_TYPE_YONGFENG) {
+                *bw = YONGFENG_ZPI_BW;
+            } else {
+                *bw = ZPI_BW;
+            }
+        }
+    }
+    return scclSuccess;
+}
+
+// BCM Gen4 Switches present themselves as a two-level hierarchical switch
+// even though they're supposed to sustain full BW across all ports.
+// Flatten the switch as this extra level can break the search and make
+// SCCL take wrong topology decisions.
+//
+// For every PCI node whose device id matches the PEX Gen4 pattern, the sub switches with
+// the same device id are unlinked, their remaining links are moved to the parent switch,
+// and the sub switch nodes are removed from the system.
+// Returns scclSuccess, or the first error propagated by SCCLCHECK.
+scclResult_t scclTopoFlattenBcmSwitches(struct scclTopoSystem* system) {
+    for(int s = 0; s < system->nodes[PCI].count; s++) {
+        struct scclTopoNode* pciSwitch = system->nodes[PCI].nodes + s;
+        uint64_t device                = pciSwitch->pci.device;
+        // Only flatten PEX Gen 4 switches in base mode
+        if((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
+            // Find sub switches with the same device ID.
+            // NOTE(review): subSwIds is released only at the free() below; if SCCLCHECK returns
+            // early on failure (presumably it does), the buffer leaks — confirm.
+            int64_t* subSwIds;
+            SCCLCHECK(scclCalloc(&subSwIds, pciSwitch->nlinks));
+            int subs = 0;
+            for(int l = 0; l < pciSwitch->nlinks; l++) {
+                struct scclTopoNode* sub = pciSwitch->links[l].remNode;
+                // Only fuse sub switches with the same device ID.
+                if(sub->type != PCI || sub->pci.device != device)
+                    continue;
+                // Save sub switch for later
+                subSwIds[subs++] = sub->id;
+                // Remove link to that sub switch
+                memmove(pciSwitch->links + l, pciSwitch->links + l + 1, (pciSwitch->nlinks - l - 1) * (sizeof(struct scclTopoLink)));
+                pciSwitch->nlinks--;
+                // Don't increase l for the next iteration as we just shifted all links by one.
+                l--;
+            }
+
+            // This inner `s` shadows the outer loop index for the duration of this loop.
+            for(int s = 0; s < subs; s++) {
+                // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
+                int index;
+                SCCLCHECK(scclTopoIdToIndex(system, PCI, subSwIds[s], &index));
+                struct scclTopoNode* sub = system->nodes[PCI].nodes + index;
+                // Connect all sub PCI devices to the parent switch
+                for(int l = 0; l < sub->nlinks; l++) {
+                    struct scclTopoNode* remNode = sub->links[l].remNode;
+                    if(remNode == pciSwitch)
+                        continue;
+                    // Add link from parent PCI switch -> PCI device
+                    // NOTE(review): no capacity check on pciSwitch->links is visible here — confirm the array is large enough.
+                    memcpy(pciSwitch->links + pciSwitch->nlinks, sub->links + l, sizeof(struct scclTopoLink));
+                    pciSwitch->nlinks++;
+                    // Update link from PCI device -> parent PCI switch
+                    for(int rl = 0; rl < remNode->nlinks; rl++) {
+                        if(remNode->links[rl].remNode == sub) {
+                            remNode->links[rl].remNode = pciSwitch;
+                            break;
+                        }
+                    }
+                }
+                // NOTE(review): pciSwitch was computed before this removal; if removing a node
+                // shifts the node array, the pointer may no longer refer to the same switch — confirm.
+                SCCLCHECK(scclTopoRemoveNode(system, PCI, index));
+            }
+            // Set subdevice to 0x0000 to make sure we don't merge this switch again.
+            pciSwitch->pci.device = 0x1000c01010000000;
+            free(subSwIds);
+            // Restart, as system->nodes[PCI].nodes has changed.
+            // The loop's s++ still runs after this assignment, so the scan resumes at index 1.
+            s = 0;
+        }
+    }
+    return scclSuccess;
+}
+
+// Link every ordered pair of distinct CPU nodes with a LINK_SYS connection.
+// The bandwidth of each link is chosen from the source CPU by scclTopoGetInterCpuBw().
+// Returns scclSuccess, or the first error propagated by SCCLCHECK.
+scclResult_t scclTopoConnectCpus(struct scclTopoSystem* system) {
+    for(int src = 0; src < system->nodes[CPU].count; src++) {
+        struct scclTopoNode* from = system->nodes[CPU].nodes + src;
+        for(int dst = 0; dst < system->nodes[CPU].count; dst++) {
+            if(dst != src) {
+                float linkBw;
+                SCCLCHECK(scclTopoGetInterCpuBw(from, &linkBw));
+                SCCLCHECK(scclTopoConnectNodes(from, system->nodes[CPU].nodes + dst, LINK_SYS, linkBw));
+            }
+        }
+    }
+    return scclSuccess;
+}
+
+// Reorder the links of `node` so that the link pointing back to `upNode` comes last,
+// then recurse into every PCI-linked neighbour other than `upNode`.
+// Returns scclSuccess, or the first error propagated by SCCLCHECK.
+static scclResult_t scclTopoSort(struct scclTopoNode* node, struct scclTopoNode* upNode) {
+    // If there is an upstream node, reorder this node's links so the link to it is last
+    if(upNode) {
+        int l = 0;
+        // Find the link pointing to upNode
+        // NOTE(review): no bound check — this assumes such a link always exists.
+        while(node->links[l].remNode != upNode)
+            l++;
+        struct scclTopoLink upLink;
+        // Save a copy of that link in upLink
+        memcpy(&upLink, node->links + l, sizeof(struct scclTopoLink));
+        // Shift the following links left by one until the next slot has no remote node
+        // NOTE(review): relies on a slot with a NULL remNode following the last used link — confirm.
+        while(node->links[l + 1].remNode) {
+            memcpy(node->links + l, node->links + l + 1, sizeof(struct scclTopoLink));
+            l++;
+        }
+        // Store upLink in the slot freed at the end of the list
+        memcpy(node->links + l, &upLink, sizeof(struct scclTopoLink));
+    }
+
+    // Recursively sort the PCI tree
+    for(int l = 0; l < node->nlinks; l++) {
+        struct scclTopoLink* link = node->links + l;
+        // Recurse only through PCI links whose remote node is not the upstream node
+        if(link->type == LINK_PCI && link->remNode != upNode)
+            SCCLCHECK(scclTopoSort(link->remNode, node));
+    }
+    return scclSuccess;
+}
+
+// Sort the links of the whole topology so traversal meets them in this order:
+// 1. NVLinks (already the case)
+// 2. PCI down
+// 3. PCI up
+// 4. SYS (already the case)
+// Each CPU node is used as the root of one scclTopoSort() pass.
+scclResult_t scclTopoSortSystem(struct scclTopoSystem* system) {
+    for(int cpuIdx = 0; cpuIdx < system->nodes[CPU].count; cpuIdx++) {
+        struct scclTopoNode* root = system->nodes[CPU].nodes + cpuIdx;
+        SCCLCHECK(scclTopoSort(root, NULL));
+    }
+    return scclSuccess;
+}
+
+// Pick the XGMI width constant for a GPU architecture name:
+// "gfx90a" -> MI200_XGMI_WIDTH, "gfx94" -> GFX94X_XGMI_WIDTH, anything else -> VEGA_XGMI_WIDTH.
+// Matching is delegated to IsArchMatch(), tested in that order.
+float scclTopoXGMISpeed(const char* gcn) {
+    if(IsArchMatch(gcn, "gfx90a")) {
+        return MI200_XGMI_WIDTH;
+    }
+    if(IsArchMatch(gcn, "gfx94")) {
+        return GFX94X_XGMI_WIDTH;
+    }
+    return VEGA_XGMI_WIDTH;
+}
+
+/**
+ * @brief Add the XGMI links described in an XML tree to the topology.
+ *
+ * Walks the XML tree. Every "xgmi" element connects the GPU identified by the
+ * inherited bus id to its target with a LINK_NVL link; every other element is
+ * traversed recursively.
+ *
+ * @param node        XML node to process.
+ * @param system      Topology system holding the nodes to connect.
+ * @param parentBusId Bus id string inherited from the closest ancestor that has a
+ *                    "busid" attribute; NULL above the first such ancestor.
+ *
+ * @return scclSuccess on success; scclInternalError when an "xgmi" element has no
+ *         inherited bus id or its GPU is not part of the system; otherwise the
+ *         error of the failing helper call.
+ *
+ * @note 1. The "tclass" attribute selects the target: another GPU (looked up by the
+ *          "target" bus id), the local CPU, or - for any other class - the NVS
+ *          node, which is created on first use.
+ *       2. Link bandwidth is "count" times the width returned by scclTopoXGMISpeed().
+ *       3. When the target is not a GPU the reverse link is added as well.
+ */
+scclResult_t scclTopoAddXGMI(struct scclXmlNode* node, struct scclTopoSystem* system, const char* parentBusId) {
+    if(strcmp(node->name, "xgmi") != 0) {
+        // Not an XGMI element: descend, handing down this node's bus id when it has one.
+        const char* busId;
+        SCCLCHECK(xmlGetAttr(node, "busid", &busId));
+        for(int s = 0; s < node->nSubs; s++) {
+            SCCLCHECK(scclTopoAddXGMI(node->subs[s], system, busId ? busId : parentBusId));
+        }
+        return scclSuccess;
+    }
+
+    // An xgmi element outside any element carrying a bus id cannot be attached to a GPU.
+    if(parentBusId == NULL) {
+        WARN("Add XGMI error : xgmi element has no parent bus id");
+        return scclInternalError;
+    }
+
+    struct scclTopoNode* gpu = NULL;
+    int64_t pBusId;
+    SCCLCHECK(busIdToInt64(parentBusId, &pBusId));
+    SCCLCHECK(scclTopoGetNode(system, &gpu, GPU, pBusId));
+    if(gpu == NULL) {
+        WARN("Add XGMI error : could not find GPU %lx\n", pBusId);
+        return scclInternalError;
+    }
+    int count;
+    SCCLCHECK(xmlGetAttrInt(node, "count", &count));
+    const char* targetClass;
+    SCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
+    int targetType;
+    SCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
+    struct scclTopoNode* remote = NULL;
+    if(targetType == GPU) {
+        // NVL P2P connection to another GPU
+        const char* target;
+        SCCLCHECK(xmlGetAttrStr(node, "target", &target));
+        int64_t busId;
+        SCCLCHECK(busIdToInt64(target, &busId));
+        SCCLCHECK(scclTopoGetNode(system, &remote, GPU, busId));
+    } else if(targetType == CPU) {
+        // NVL connection to the local CPU
+        SCCLCHECK(findLocalCpu(gpu, &remote));
+    } else {
+        // Any other class shares one NVS node, created the first time it is needed.
+        if(system->nodes[NVS].count == 0) {
+            SCCLCHECK(scclTopoCreateNode(system, &remote, NVS, 0));
+        } else {
+            remote = system->nodes[NVS].nodes;
+        }
+    }
+    // A GPU target that is not in the system leaves remote NULL and is skipped silently.
+    if(remote) {
+        float nvlSpeed = scclTopoXGMISpeed(gpu->gpu.gcn);
+        SCCLCHECK(scclTopoConnectNodes(gpu, remote, LINK_NVL, count * nvlSpeed));
+        if(remote->type != GPU) {
+            SCCLCHECK(scclTopoConnectNodes(remote, gpu, LINK_NVL, count * nvlSpeed));
+        }
+    }
+    return scclSuccess;
+}
+
+/**
+ * @brief Compute the bit mask of the NET nodes that are "local" to one GPU.
+ *
+ * Scans every NET node and keeps those whose path to GPU index g has the highest
+ * bandwidth and, among equal bandwidths, the lowest path type. The ids of the
+ * kept NET nodes are returned as bits of a 64-bit mask.
+ *
+ * @param system       Topology system.
+ * @param g            Index of the GPU inside system->nodes[GPU].
+ * @param localNetMask [out] Bit i is set when NET id i is local to the GPU.
+ * @param type         [out] Optional; receives the path type of the best NET nodes.
+ *                     Left untouched when no NET node has a bandwidth above 0
+ *                     or a type below PATH_DIS.
+ * @return scclSuccess, or scclInternalError when a selected NET id does not fit
+ *         in the mask (outside [0, 64)); allocation errors are propagated.
+ */
+static scclResult_t getLocalNetMask(struct scclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
+    int minType = PATH_DIS;
+    float maxBw = 0;
+    int count   = 0;
+    int* nets;
+    SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
+    for(int n = 0; n < system->nodes[NET].count; n++) {
+        struct scclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU] + g;
+        // A strictly better path restarts the candidate list.
+        if(path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
+            maxBw   = path->bw;
+            minType = path->type;
+            if(type)
+                *type = minType;
+            count = 0;
+        }
+        if(path->bw == maxBw && path->type == minType)
+            nets[count++] = system->nodes[NET].nodes[n].id;
+    }
+
+    scclResult_t ret = scclSuccess;
+    *localNetMask    = 0ULL;
+    for(int n = 0; n < count; n++) {
+        // Ids outside [0, 64) cannot be represented (and would make the shift
+        // undefined); fall through to the common exit so nets is always released.
+        if(nets[n] < 0 || nets[n] >= 64) {
+            ret = scclInternalError;
+            break;
+        }
+        *localNetMask |= 1ULL << nets[n];
+    }
+    free(nets);
+    return ret;
+}
+
+// Log one node and, recursively, everything reachable from it through PCI links.
+// `line` is a caller-provided scratch buffer; the description of `node` is written
+// at `line + offset`, logged, and the first `offset` characters are then blanked so
+// that the link lines printed below appear indented under it.
+// `prevNode` is the node the recursion came from; the PCI link back to it is skipped.
+static scclResult_t scclTopoPrintRec(struct scclTopoNode* node, struct scclTopoNode* prevNode, char* line, int offset) {
+    char* cursor = line + offset;
+    switch(node->type) {
+        case GPU: sprintf(cursor, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank); break;
+        case CPU: sprintf(cursor, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model); break;
+        case PCI: sprintf(cursor, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device); break;
+        default: sprintf(cursor, "%s/%lX", topoNodeTypeStr[node->type], node->id); break;
+    }
+    INFO(SCCL_LOG_TOPO, "%s", line);
+
+    // Turn the prefix into indentation for the lines that follow.
+    for(int col = 0; col < offset; col++)
+        line[col] = ' ';
+
+    for(int idx = 0; idx < node->nlinks; idx++) {
+        struct scclTopoLink* link = node->links + idx;
+        // Self links are never shown.
+        if(link->type == LINK_LOC)
+            continue;
+        // The PCI link leading back to where we came from is not shown either.
+        if(link->type == LINK_PCI && link->remNode == prevNode)
+            continue;
+
+        sprintf(cursor, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
+        int nextOffset = strlen(line);
+
+        if(link->type == LINK_PCI) {
+            // PCI links continue the tree: the child prints itself after the link text.
+            SCCLCHECK(scclTopoPrintRec(link->remNode, node, line, nextOffset));
+            continue;
+        }
+
+        // Any other link is a leaf: print the remote node on the same line.
+        struct scclTopoNode* peer = link->remNode;
+        if(peer->type == NET) {
+            sprintf(line + nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[peer->type], peer->id, peer->net.asic, peer->net.port, peer->net.bw);
+        } else {
+            sprintf(line + nextOffset, "%s/%lX", topoNodeTypeStr[peer->type], peer->id);
+        }
+        INFO(SCCL_LOG_TOPO, "%s", line);
+    }
+    return scclSuccess;
+}
+
+} // namespace topo_basic
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+// Report whether /sys/bus/pci/drivers has an entry whose name starts with "hsw".
+// Returns false when the directory cannot be opened.
+bool isHswDriverExist() {
+    DIR* driversDir = opendir("/sys/bus/pci/drivers");
+    if(driversDir == nullptr)
+        return false;
+
+    bool hswPresent = false;
+    for(struct dirent* ent = readdir(driversDir); ent != nullptr; ent = readdir(driversDir)) {
+        // A name with the "hsw" prefix can never be "." or "..", so the prefix test alone suffices.
+        if(strncmp(ent->d_name, "hsw", 3) == 0) {
+            hswPresent = true;
+            break;
+        }
+    }
+    closedir(driversDir);
+    return hswPresent;
+}
+
+// Count the entries of /sys/class/infiniband whose name starts with "mlx5".
+// Returns 0 when the directory cannot be opened.
+int getIBNum() {
+    DIR* ibDir = opendir("/sys/class/infiniband");
+    if(ibDir == nullptr)
+        return 0;
+
+    int mlx5Count = 0;
+    struct dirent* ent;
+    while((ent = readdir(ibDir)) != nullptr) {
+        // "." and ".." do not start with "mlx5", so they are filtered by the prefix test.
+        const ::std::string devName(ent->d_name);
+        if(devName.compare(0, 4, "mlx5") == 0)
+            mlx5Count++;
+    }
+    closedir(ibDir);
+    return mlx5Count;
+}
+
+/**
+ * Pick the NET id a given rank should use for a given channel.
+ *
+ * GPUs that share the same local-NET mask form a group. The NET ids of that mask
+ * are handed out round-robin, first across the GPUs of the group and then across
+ * channels, and the id that falls on (rank, channelId) is returned.
+ *
+ * @param system    Topology system.
+ * @param rank      Rank whose GPU is looked up with scclTopoRankToIndex().
+ * @param channelId Channel number; must be >= 0.
+ * @param id        [out] Selected NET id (a bit position of the local-NET mask).
+ * @return scclSuccess, or scclInternalError when channelId is negative, the rank
+ *         is unknown, or the GPU has no local NET.
+ */
+scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id) {
+    // The round-robin search below only counts channels upwards from 0, so a
+    // negative channel could never be matched and the loop would not terminate.
+    if(channelId < 0) {
+        WARN("Get local net error : invalid channel id %d", channelId);
+        return scclInternalError;
+    }
+
+    // The GPU count is bounded by the capacity of the node array, so a fixed-size
+    // array is enough and nothing has to be released on the early-return paths.
+    uint64_t localNetMasks[SCCL_TOPO_MAX_NODES];
+    int ngpus = system->nodes[GPU].count;
+
+    // Fill localNetMasks for all GPUs.
+    for(int g = 0; g < ngpus; g++) {
+        SCCLCHECK(topo_basic::getLocalNetMask(system, g, localNetMasks + g, NULL));
+    }
+
+    // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
+    int gpu;
+    SCCLCHECK(scclTopoRankToIndex(system, rank, &gpu));
+    int netLocalGpus = 0, netLocalGpu = 0;
+    for(int g = 0; g < ngpus; g++) {
+        if(localNetMasks[g] == localNetMasks[gpu]) {
+            // Position of our GPU inside its group.
+            if(g == gpu)
+                netLocalGpu = netLocalGpus;
+            netLocalGpus++;
+        }
+    }
+    uint64_t localNetMask = localNetMasks[gpu];
+    if(localNetMask == 0)
+        return scclInternalError;
+
+    // Round robin on GPUs and channels
+    int gIndex = 0, cId = 0, n = 0;
+    while(1) {
+        if(1ULL << n & localNetMask) {
+            if(gIndex == netLocalGpu && cId == channelId) {
+                *id = n;
+                return scclSuccess;
+            }
+            gIndex++;
+            if(gIndex == netLocalGpus) {
+                gIndex = 0;
+                cId++;
+            }
+        }
+        n = (n + 1) % 64;
+    }
+}
+
+/**
+ * Pick the GPU index that should be associated with a given NET id.
+ *
+ * Among the GPUs whose local-NET mask contains `net`, those with the lowest path
+ * type are kept. The NET ids of their (common) mask are then assigned round-robin
+ * to these GPUs, and the GPU that falls on `net` is returned.
+ *
+ * @param system   Topology system.
+ * @param net      NET id; must be in [0, 64) to be representable in the mask.
+ * @param gpuIndex [out] Index into system->nodes[GPU], or -1 when no GPU has a
+ *                 usable local-NET mask containing `net`.
+ * @return scclSuccess; scclInternalError when `net` is out of range or when GPUs
+ *         with the same path type disagree on their mask.
+ */
+scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex) {
+    // Shifting by a negative amount or by 64 and more is undefined: reject such ids.
+    if(net < 0 || net >= 64) {
+        WARN("Get local GPU error : net %d is outside the supported mask range", net);
+        return scclInternalError;
+    }
+
+    // The GPU count is bounded by the capacity of the node array, so a fixed-size
+    // array is enough and nothing has to be released on the early-return paths.
+    int gpus[SCCL_TOPO_MAX_NODES];
+    int ngpus = system->nodes[GPU].count;
+
+    // Find localNetMask which includes net with the most local GPUs.
+    int netLocalGpus = 0, minType = PATH_DIS;
+    uint64_t localNetMask = 0ULL;
+    for(int g = 0; g < ngpus; g++) {
+        int type = PATH_DIS;
+        uint64_t mask;
+        SCCLCHECK(topo_basic::getLocalNetMask(system, g, &mask, &type));
+        if((1ULL << net) & mask) {
+            // A strictly better path type restarts the list of candidate GPUs.
+            if(type < minType) {
+                localNetMask = mask;
+                netLocalGpus = 0;
+                minType      = type;
+            }
+            if(type == minType) {
+                if(localNetMask && mask != localNetMask) {
+                    WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n",
+                         g,
+                         gpus[netLocalGpus - 1],
+                         minType,
+                         net,
+                         mask,
+                         localNetMask);
+                    return scclInternalError;
+                }
+                gpus[netLocalGpus] = g;
+                netLocalGpus++;
+            }
+        }
+    }
+    if(localNetMask == 0ULL) {
+        *gpuIndex = -1;
+        return scclSuccess;
+    }
+
+    // Round robin on GPUs: walk the set bits of the mask, advancing to the next
+    // candidate GPU on each one, until the bit of `net` is reached.
+    int gIndex = 0, n = 0;
+    while(1) {
+        if(1ULL << n & localNetMask) {
+            if(n == net) {
+                *gpuIndex = gpus[gIndex];
+                return scclSuccess;
+            }
+            gIndex++;
+            if(gIndex == netLocalGpus) {
+                gIndex = 0;
+            }
+        }
+        n = (n + 1) % 64;
+    }
+}
+
+// Report the arch / vendor / model recorded on the first CPU node of the system.
+scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model) {
+    const struct scclTopoNode* firstCpu = &system->nodes[CPU].nodes[0];
+
+    *arch   = firstCpu->cpu.arch;
+    *vendor = firstCpu->cpu.vendor;
+    *model  = firstCpu->cpu.model;
+    return scclSuccess;
+}
+
+/**
+ * Compute the CPU affinity set to use for the process driving a given rank.
+ *
+ * The CPU node with the fewest hops to the rank's GPU is selected. Its affinity
+ * set is intersected with the affinity of the calling process, unless
+ * scclParamIgnoreCpuAffinity() is set, in which case it is used as is.
+ *
+ * @param system   Topology system.
+ * @param rank     Rank whose GPU is searched in system->nodes[GPU].
+ * @param affinity [out] Resulting CPU set (may be empty).
+ * @return scclSuccess, or scclInternalError when no GPU has this rank or the
+ *         system has no CPU node; sched_getaffinity failures are reported
+ *         through SYSCHECK.
+ */
+scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity) {
+    struct scclTopoNode *cpu = NULL, *gpu = NULL;
+    for(int g = 0; g < system->nodes[GPU].count; g++) {
+        if(system->nodes[GPU].nodes[g].gpu.rank == rank) {
+            gpu = system->nodes[GPU].nodes + g;
+            // Find closer CPU
+            int cpuIndex = -1, minHops = 0;
+            for(int c = 0; c < system->nodes[CPU].count; c++) {
+                int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
+                if(cpuIndex == -1 || nHops < minHops) {
+                    cpuIndex = c;
+                    minHops  = nHops;
+                }
+            }
+            // With no CPU node cpuIndex stays -1; keep cpu NULL instead of forming
+            // a pointer one element before the node array.
+            if(cpuIndex >= 0)
+                cpu = system->nodes[CPU].nodes + cpuIndex;
+        }
+    }
+    if(cpu == NULL) {
+        WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
+        return scclInternalError;
+    }
+
+    // Query the CPU affinity set we were provided
+    cpu_set_t mask;
+    SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+    // Get the affinity of the CPU close to our GPU.
+    cpu_set_t cpuMask = cpu->cpu.affinity;
+    cpu_set_t finalMask;
+    if(topo_basic::scclParamIgnoreCpuAffinity())
+        // Ignore the CPU affinity set and use the GPU one instead
+        finalMask = cpuMask;
+    else
+        // Use a subset of the GPU affinity set
+        CPU_AND(&finalMask, &mask, &cpuMask);
+
+    memcpy(affinity, &finalMask, sizeof(cpu_set_t));
+
+    // Only log when the resulting set is not empty.
+    if(CPU_COUNT(&finalMask)) {
+        char affinityStr[sizeof(cpu_set_t) * 2];
+        SCCLCHECK(scclCpusetToStr(&finalMask, affinityStr));
+        INFO(SCCL_LOG_TOPO, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
+    }
+    return scclSuccess;
+}
+
+// Return the number of GPU nodes in the system through *count.
+scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count) {
+    const struct scclTopoNodeSet* gpuSet = system->nodes + GPU;
+    *count                               = gpuSet->count;
+    return scclSuccess;
+}
+
+// Return the number of NET nodes in the system through *count.
+scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count) {
+    const struct scclTopoNodeSet* netSet = system->nodes + NET;
+    *count                               = netSet->count;
+    return scclSuccess;
+}
+
+// Return the number of NVS nodes in the system through *count.
+scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count) {
+    const struct scclTopoNodeSet* nvsSet = system->nodes + NVS;
+    *count                               = nvsSet->count;
+    return scclSuccess;
+}
+
+// Translate a rank into its local rank, i.e. the index of the GPU node carrying
+// that rank. Fails with scclInternalError when no GPU node has the rank.
+scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank) {
+    const struct scclTopoNodeSet* gpuSet = &system->nodes[GPU];
+    int idx                              = 0;
+    while(idx < gpuSet->count && gpuSet->nodes[idx].gpu.rank != rank)
+        idx++;
+
+    if(idx == gpuSet->count) {
+        WARN("Could not find local GPU with rank %d", rank);
+        return scclInternalError;
+    }
+    *localRank = idx;
+    return scclSuccess;
+}
+
+// Log the whole topology: a header with the system bandwidths, then one tree per
+// CPU node, then a closing separator.
+scclResult_t scclTopoPrint(struct scclTopoSystem* s) {
+    INFO(SCCL_LOG_TOPO, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw);
+
+    // Scratch buffer shared by all recursion levels of scclTopoPrintRec.
+    char line[1024];
+    for(int c = 0; c < s->nodes[CPU].count; c++) {
+        struct scclTopoNode* root = &s->nodes[CPU].nodes[c];
+        SCCLCHECK(topo_basic::scclTopoPrintRec(root, NULL, line, 0));
+    }
+
+    INFO(SCCL_LOG_TOPO, "==========================================");
+    return scclSuccess;
+}
+
+// Look up the node of the given type and id.
+// On a match *node points at it; when nothing matches *node is left untouched
+// and scclSuccess is still returned, so callers pre-set *node to detect a miss.
+scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
+    struct scclTopoNodeSet* set = system->nodes + type;
+    for(int i = 0; i < set->count; i++) {
+        struct scclTopoNode* candidate = set->nodes + i;
+        if(candidate->id != id)
+            continue;
+        *node = candidate;
+        break;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Append a new node of the given type to the system.
+ *
+ * @param system Topology system.
+ * @param node   [out] Receives the address of the new node.
+ * @param type   Node type (index into system->nodes).
+ * @param id     Identifier stored in the node.
+ * @return scclSuccess, or scclInternalError when the node array of that type is full.
+ */
+scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
+    if(system->nodes[type].count == SCCL_TOPO_MAX_NODES) {
+        WARN("Error : tried to create too many nodes of type %d", type);
+        return scclInternalError;
+    }
+    struct scclTopoNode* n = system->nodes[type].nodes + system->nodes[type].count;
+    system->nodes[type].count++;
+    // Start from a clean slot: scclTopoRemoveNode compacts the array by moving
+    // nodes down, so a reused slot may still hold the links, link count and path
+    // pointers of a node that used to live there.
+    memset(n, 0, sizeof(struct scclTopoNode));
+    n->type = type;
+    n->id   = id;
+    if(type == GPU) {
+        // Create link to itself (used in some corner cases)
+        n->nlinks           = 1;
+        n->links[0].type    = LINK_LOC;
+        n->links[0].remNode = n;
+        n->links[0].bw      = LOC_BW;
+        n->gpu.dev          = SCCL_TOPO_UNDEF;
+        n->gpu.rank         = SCCL_TOPO_UNDEF;
+        n->gpu.cudaCompCap  = SCCL_TOPO_UNDEF;
+    } else if(type == CPU) {
+        n->cpu.arch   = SCCL_TOPO_UNDEF;
+        n->cpu.vendor = SCCL_TOPO_UNDEF;
+        n->cpu.model  = SCCL_TOPO_UNDEF;
+    } else if(type == NET) {
+        n->net.asic    = 0ULL;
+        n->net.port    = SCCL_TOPO_UNDEF;
+        n->net.bw      = 0.0;
+        n->net.latency = 0.0;
+    }
+    *node = n;
+    return scclSuccess;
+}
+
+/**
+ * Remove one node of the given type from the topology system.
+ *
+ * @param system Topology system.
+ * @param type   Type of the node to remove.
+ * @param index  Index of the node inside system->nodes[type].
+ * @return scclResult_t Always scclSuccess.
+ *
+ * The function:
+ * 1. frees every path array owned by the removed node;
+ * 2. deletes, on all other nodes, the links that point at the removed node, and
+ *    shifts down by one the link targets of the same type stored after it
+ *    (they are about to move one slot down);
+ * 3. compacts the node array over the removed entry;
+ * 4. decrements the node count of that type.
+ * Link and node slots vacated by the compaction are zeroed, so the link lists
+ * stay terminated by a NULL remNode and no stale path pointer is left behind.
+ */
+scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int index) {
+    struct scclTopoNode* delNode = system->nodes[type].nodes + index;
+    for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
+        free(delNode->paths[t]);
+        for(int n = 0; n < system->nodes[t].count; n++) {
+            struct scclTopoNode* node = system->nodes[t].nodes + n;
+            if(node == delNode)
+                continue;
+            for(int l = 0; l < node->nlinks; l++) {
+                // Drop every link to delNode found at position l (several may follow each other).
+                while(l < node->nlinks && node->links[l].remNode == delNode) {
+                    memmove(node->links + l, node->links + l + 1, (node->nlinks - l - 1) * sizeof(struct scclTopoLink));
+                    node->nlinks--;
+                    // The former last slot still holds a copy of the last link: clear it.
+                    memset(node->links + node->nlinks, 0, sizeof(struct scclTopoLink));
+                }
+                // Targets of the same type located after delNode will move down by one element.
+                if(l < node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
+                    node->links[l].remNode--;
+                }
+            }
+        }
+    }
+    memmove(delNode, delNode + 1, (system->nodes[type].count - index - 1) * sizeof(struct scclTopoNode));
+    system->nodes[type].count--;
+    // Clear the slot left behind by the compaction (it duplicates the last node,
+    // or is delNode itself with its already freed path pointers).
+    memset(system->nodes[type].nodes + system->nodes[type].count, 0, sizeof(struct scclTopoNode));
+    return scclSuccess;
+}
+
+/**
+ * Add a directed link from node to remNode, or add bandwidth to an existing one.
+ *
+ * When node already has a link of the same type to remNode, bw is added to it;
+ * otherwise a new link is appended. The touched link is then moved towards the
+ * front so that the list stays sorted by decreasing bandwidth.
+ *
+ * @param node    Source node (its link list is modified).
+ * @param remNode Target node.
+ * @param type    Link type.
+ * @param bw      Bandwidth to add.
+ * @return scclSuccess, or scclInternalError when node already holds
+ *         SCCL_TOPO_MAX_LINKS links and a new one would be needed.
+ */
+scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw) {
+    // Aggregate links into higher bw for NVLink
+    struct scclTopoLink* link;
+    for(link = node->links; link - node->links != SCCL_TOPO_MAX_LINKS && link->remNode; link++) {
+        if(link->remNode == remNode && link->type == type)
+            break;
+    }
+    // Every slot is in use and none matches: there is no room for a new link.
+    if(link - node->links == SCCL_TOPO_MAX_LINKS) {
+        WARN("Error : too many Topo links (max %d)", SCCL_TOPO_MAX_LINKS);
+        return scclInternalError;
+    }
+    // The search stopped on the first unused slot: this is a new link.
+    if(link->remNode == NULL)
+        node->nlinks++;
+    link->type    = type;
+    link->remNode = remNode;
+    link->bw += bw;
+
+    // Sort links in BW descending order
+    struct scclTopoLink linkSave;
+    memcpy(&linkSave, link, sizeof(struct scclTopoLink));
+    while(link != node->links) {
+        if((link - 1)->bw >= linkSave.bw)
+            break;
+        memcpy(link, link - 1, sizeof(struct scclTopoLink));
+        link--;
+    }
+    memcpy(link, &linkSave, sizeof(struct scclTopoLink));
+    return scclSuccess;
+}
+
+/**
+ * Build a topology system from a parsed XML description.
+ *
+ * Allocates *topoSystem, adds every "cpu" element found under the "system" tag,
+ * then adds XGMI links, flattens BCM switches, connects the CPUs and sorts links.
+ *
+ * @param xml        Parsed XML topology.
+ * @param topoSystem [out] Newly allocated system. It is not released here when a
+ *                   later step fails.
+ * @return scclSuccess, or the error of the first failing step.
+ */
+scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem) {
+    SCCLCHECK(scclCalloc(topoSystem, 1));
+    struct scclXmlNode* topNode;
+    SCCLCHECK(xmlFindTag(xml, "system", &topNode));
+
+    // Diagnostic output goes through the topology log channel rather than stdout.
+    INFO(SCCL_LOG_TOPO, "topNode->nSubs=%d", topNode->nSubs);
+    for(int s = 0; s < topNode->nSubs; s++) {
+        struct scclXmlNode* node = topNode->subs[s];
+        if(strcmp(node->name, "cpu") == 0)
+            SCCLCHECK(topo_basic::scclTopoAddCpu(node, *topoSystem));
+    }
+    SCCLCHECK(topo_basic::scclTopoAddXGMI(topNode, *topoSystem, NULL));
+    SCCLCHECK(topo_basic::scclTopoFlattenBcmSwitches(*topoSystem));
+    SCCLCHECK(topo_basic::scclTopoConnectCpus(*topoSystem));
+    SCCLCHECK(topo_basic::scclTopoSortSystem(*topoSystem));
+
+    return scclSuccess;
+}
+
+/**
+ * Report the range of compute capabilities found on the GPU nodes.
+ *
+ * @param system Topology system.
+ * @param ccMin  [out] Optional; lowest cudaCompCap among the GPU nodes.
+ * @param ccMax  [out] Optional; highest cudaCompCap among the GPU nodes.
+ * @return scclResult_t scclSuccess, or scclInternalError when there is no GPU node.
+ */
+scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax) {
+    const struct scclTopoNodeSet* gpuSet = &system->nodes[GPU];
+    if(gpuSet->count == 0)
+        return scclInternalError;
+
+    int lowest  = gpuSet->nodes[0].gpu.cudaCompCap;
+    int highest = lowest;
+    for(int g = 1; g < gpuSet->count; g++) {
+        const int cc = gpuSet->nodes[g].gpu.cudaCompCap;
+        if(cc < lowest)
+            lowest = cc;
+        if(cc > highest)
+            highest = cc;
+    }
+
+    if(ccMin)
+        *ccMin = lowest;
+    if(ccMax)
+        *ccMax = highest;
+    return scclSuccess;
+}
+
+// Find the index of the node of the given type carrying `id`.
+// *index is -1 and scclInternalError is returned when there is no such node.
+scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index) {
+    const struct scclTopoNodeSet* set = &system->nodes[type];
+    *index                            = -1;
+    for(int pos = 0; pos < set->count; pos++) {
+        if(set->nodes[pos].id != id)
+            continue;
+        *index = pos;
+        return scclSuccess;
+    }
+    return scclInternalError;
+}
+
+// Find the index of the GPU node carrying `rank`.
+// *index is -1 and scclInternalError is returned when there is no such GPU.
+scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index) {
+    const struct scclTopoNodeSet* gpuSet = &system->nodes[GPU];
+    *index                               = -1;
+    for(int pos = 0; pos < gpuSet->count; pos++) {
+        if(gpuSet->nodes[pos].gpu.rank != rank)
+            continue;
+        *index = pos;
+        return scclSuccess;
+    }
+    return scclInternalError;
+}
+
+// Find the rank of the GPU node whose device number is `dev`.
+// *rank is -1 and scclInternalError is returned when there is no such GPU.
+scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank) {
+    const struct scclTopoNodeSet* gpuSet = &system->nodes[GPU];
+    *rank                                = -1;
+    for(int pos = 0; pos < gpuSet->count; pos++) {
+        const struct scclTopoNode* gpuNode = gpuSet->nodes + pos;
+        if(gpuNode->gpu.dev != dev)
+            continue;
+        *rank = gpuNode->gpu.rank;
+        return scclSuccess;
+    }
+    return scclInternalError;
+}
+
+/**
+ * @brief 获取系统拓扑结构
+ *
+ * 该函数用于获取系统的拓扑结构信息，包括GPU和NIC设备。
+ * 首先尝试从环境变量SCCL_TOPO_FILE指定的XML文件加载拓扑，
+ * 若未指定则尝试加载默认拓扑文件（根据IB设备数量选择不同文件）。
+ * 自动检测本地GPU和NIC设备信息并填充到拓扑结构中。
+ *
+ * @param comm 通信上下文指针
+ * @param system 输出参数，返回创建的拓扑系统指针
+ * @return scclResult_t 返回操作结果，scclSuccess表示成功
+ */
+// scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system) {
+//     struct scclXml* xml;
+//     SCCLCHECK(scclCalloc(&xml, 1));
+//     char* xmlTopoFile = getenv("SCCL_TOPO_FILE");
+//     if(xmlTopoFile) {
+//         INFO(SCCL_LOG_TOPO, "SCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
+//         SCCLCHECK(scclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
+//     } else {
+//         bool useDefaultTopo = true;
+//         bool HswExist       = topo_basic::isHswDriverExist();
+//         if(HswExist == true) {
+//             char* rocmPath = getenv("ROCM_PATH");
+//             if(rocmPath != NULL) {
+//                 ::std::string xmlPath;
+//                 int IBNum = topo_basic::getIBNum();
+//                 if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
+//                     xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-BW-topo-input.xml";
+//                     if(access(xmlPath.c_str(), F_OK) == 0) {
+//                         SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
+//                         useDefaultTopo = false;
+//                     }
+//                 } else if(IBNum == 4 || IBNum == 5 || IBNum == 6) {
+//                     xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-508-topo-input.xml";
+//                     if(access(xmlPath.c_str(), F_OK) == 0) {
+//                         SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
+//                         useDefaultTopo = false;
+//                     }
+//                 }
+//             }
+//         }
+//         if(useDefaultTopo) {
+//             INFO(SCCL_LOG_TOPO, "No default topo for now, please provide your own topo xml file");
+//         }
+//     }
+
+//     if(xml->maxIndex == 0) {
+//         // Create top tag
+//         struct scclXmlNode* top;
+//         SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
+//         SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
+//     }
+
+//     // Auto-detect GPUs if needed
+//     for(int r = 0; r < comm->nRanks; r++) {
+//         if(comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
+//             char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+//             SCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
+//             struct scclXmlNode* node;
+//             SCCLCHECK(scclTopoFillGpu(xml, busId, &node));
+//             if(node == NULL)
+//                 continue;
+//             SCCLCHECK(xmlSetAttrInt(node, "keep", 1));
+//             SCCLCHECK(xmlSetAttrInt(node, "rank", r));
+//             SCCLCHECK(topo_basic::xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
+//         }
+//     }
+
+//     // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
+//     // so we start with collnet so that it has precedence.
+//     int netDevCount = 0;
+//     if(netDevCount == 0) {
+//         SCCLCHECK(comm->scclNet->devices(&netDevCount));
+//     }
+
+//     for(int n = 0; n < netDevCount; n++) {
+//         sccl::hardware::net::scclNetProperties_t props;
+//         SCCLCHECK(comm->scclNet->getProperties(n, &props));
+//         struct scclXmlNode* netNode;
+//         SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+//         SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
+//         SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "speed", props.speed));
+//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "port", props.port));
+//         SCCLCHECK(topo_basic::xmlInitAttrFloat(netNode, "latency", props.latency));
+//         SCCLCHECK(topo_basic::xmlInitAttrUint64(netNode, "guid", props.guid));
+//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+//         bool gdrSupport =
+//             (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF));
+//         INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+//         SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "gdr", gdrSupport));
+//     }
+
+//     // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
+//     SCCLCHECK(scclTopoTrimXml(xml));
+
+//     xmlTopoFile = getenv("SCCL_TOPO_DUMP_FILE");
+//     if(xmlTopoFile && comm->rank == topo_basic::scclParamTopoDumpFileRank()) {
+//         INFO(SCCL_LOG_TOPO, "SCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
+//         SCCLCHECK(scclTopoDumpXmlToFile(xmlTopoFile, xml));
+//     }
+
+//     SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
+//     free(xml);
+
+//     return scclSuccess;
+// }
+
+// Fill `xml` with the topology description and build the system from it.
+// Split out of scclTopoGetSystem so that the caller can release `xml` on every
+// exit path, including the early returns taken by SCCLCHECK.
+static scclResult_t scclTopoFillXmlAndBuildSystem(struct scclXml* xml, struct scclTopoSystem** system) {
+    using namespace sccl;
+    // Load the built-in topology file only when an "hsw" PCI driver is present
+    // and 8 to 10 mlx5 InfiniBand devices are found.
+    if(isHswDriverExist()) {
+        int IBNum = getIBNum();
+        if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
+            ::std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
+            SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
+        }
+    }
+    if(xml->maxIndex == 0) {
+        // Create top tag
+        struct scclXmlNode* top;
+        SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
+        SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
+    }
+
+    // NOTE(review): GPU auto-detection is not performed here; an older,
+    // commented-out variant of this function did it from per-rank peer info.
+
+    // Auto-detect NICs if needed.
+    // NOTE(review): the result of initNet() is used unchecked - confirm it cannot fail.
+    int netDevCount = 0;
+    auto scclNet    = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB);
+    SCCLCHECK(scclNet->devices(&netDevCount));
+
+    for(int n = 0; n < netDevCount; n++) {
+        sccl::hardware::net::scclNetProperties_t props;
+        SCCLCHECK(scclNet->getProperties(n, &props));
+        struct scclXmlNode* netNode;
+        SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+        SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
+        SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+        SCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+        SCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+        SCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
+        SCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+        SCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+        // NOTE(review): GDR is reported only when BOTH pointer types are supported;
+        // the older commented-out variant used an OR with a dmaBufSupport flag - confirm intent.
+        bool gdrSupport = (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF);
+        INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+        SCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
+    }
+
+    // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
+    SCCLCHECK(scclTopoTrimXml(xml));
+    SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
+    return scclSuccess;
+}
+
+/**
+ * @brief Build the topology system of the local machine.
+ *
+ * Optionally loads a built-in XML topology, creates the "system" top tag when the
+ * XML is still empty, adds one XML node per network device reported by the IB
+ * network backend, trims the XML and converts it into a topology system.
+ *
+ * @param system [out] Receives the newly created topology system.
+ * @return scclResult_t scclSuccess, or the error of the first failing step. The
+ *         temporary XML buffer is released in both cases.
+ */
+scclResult_t scclTopoGetSystem(struct scclTopoSystem** system) {
+    struct scclXml* xml;
+    SCCLCHECK(scclCalloc(&xml, 1));
+    scclResult_t res = scclTopoFillXmlAndBuildSystem(xml, system);
+    free(xml);
+    return res;
+}
+
+} // namespace topo
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/topo_bak/topo.h
+++ b/src/hardware/topo_bak/topo.h
+#ifndef SCCL_TOPO_H_
+#define SCCL_TOPO_H_
+
+#include <string.h>
+#include "base.h"
+#include "archinfo.h"
+#include "xml.h"
+#include "net.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace topo {
+
+// Number of node types; matches the six enumerators of topoNodeType.
+#define SCCL_TOPO_NODE_TYPES 6
+// Capacity of the per-type node array in scclTopoNodeSet.
+static constexpr int SCCL_TOPO_MAX_NODES = 256;
+// Capacity of the per-node link array in scclTopoNode.
+#define SCCL_TOPO_MAX_LINKS 32
+// Capacity of the hop list in scclTopoLinkList.
+#define SCCL_TOPO_MAX_HOPS (SCCL_TOPO_MAX_NODES * SCCL_TOPO_NODE_TYPES)
+
+// Hardware topology node types. The values index scclTopoSystem::nodes.
+typedef enum topoNodeType {
+    GPU = 0, // Graphics processing unit
+    PCI = 1, // Peripheral component interconnect
+    NVS = 2, // NOTE(review): the original comment said "non-volatile storage"; presumably an NVLink/XGMI switch node - confirm
+    CPU = 3, // Central processor; actually a NUMA domain
+    NIC = 4, // Network interface controller
+    NET = 5  // Network
+} topoNodeType_t;
+extern const char* topoNodeTypeStr[];
+
+// Link types. The values are chosen so that they line up with the path types of
+// topoPathType wherever both exist; the gaps are path types with no link equivalent.
+typedef enum topoLinkType {
+    LINK_LOC = 0, // Local link
+    LINK_NVL = 1, // NVLink link
+    // Value 2 is reserved for path type PATH_NVB; no link type is defined
+    LINK_PCI = 3, // PCI link
+    // Value 4 is reserved for path type PATH_PXB; no link type is defined
+    // Value 5 is reserved for path type PATH_PXN; no link type is defined
+    // Value 6 is reserved for path type PATH_PHB; no link type is defined
+    LINK_SYS = 7, // System link
+    LINK_NET = 8  // Network link
+} topoLinkType_t;
+extern const char* topoLinkTypeStr[];
+
+// Path types, describing how two nodes are connected.
+enum topoPathType {
+    PATH_LOC = 0, // Local path
+    PATH_NVL = 1, // Connected through NVLink
+    PATH_NVB = 2, // Connected through NVLink using an intermediate GPU
+    PATH_PIX = 3, // Connected through at most one PCIe bridge
+    PATH_PXB = 4, // Connected through several PCIe bridges (without going through the PCIe host bridge)
+    PATH_PXN = 5, // GPU and NIC connected through an intermediate GPU
+    PATH_PHB = 6, // Connected through PCIe and a PCIe host bridge
+    PATH_SYS = 7, // Connected through PCIe and the SMP interconnect between NUMA nodes
+    PATH_NET = 8, // Connected through the network
+    PATH_DIS = 9  // Disconnected
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct scclTopoNode;
+
+// One directed link from a node to a remote node.
+struct scclTopoLink {
+    int type;                     // Link type (LINK_* values)
+    float bw;                     // Link bandwidth
+    struct scclTopoNode* remNode; // Node at the other end; NULL marks an unused slot
+};
+
+// A path between two nodes, stored as a list of links.
+struct scclTopoLinkList {
+    int type;                                      // Path type (PATH_* values)
+    float bw;                                      // Path bandwidth
+    int count;                                     // Number of hops; presumably the number of used entries of list - confirm
+    struct scclTopoLink* list[SCCL_TOPO_MAX_HOPS]; // Links making up the path
+};
+
+// One node of the topology graph.
+struct scclTopoNode {
+    int type;   // Node type
+    int64_t id; // Node id
+    // Type-specific data
+    union {
+        struct {
+            int dev;              // NVML device number
+            int rank;             // Rank
+            int cudaCompCap;      // CUDA compute capability
+            int gdrSupport;       // GDR support
+            const char* gcn;      // GCN architecture name
+            hipDeviceArch_t arch; // HIP device architecture
+        } gpu;                    // GPU node
+        struct {
+            uint64_t asic;   // ASIC identifier
+            int port;        // Port number
+            float bw;        // Bandwidth
+            float latency;   // Latency
+            int gdrSupport;  // GDR support
+            int collSupport; // Collective operation support
+            int maxChannels; // Maximum number of channels
+            int64_t busId;   // Bus id
+        } net;               // Network node
+        struct {
+            int arch;           // Architecture
+            int vendor;         // Vendor
+            int model;          // Model
+            cpu_set_t affinity; // CPU affinity
+        } cpu;                  // CPU node
+        struct {
+            uint64_t device; // PCI device
+        } pci;               // PCI node
+    };
+    int nlinks;                                     // Number of links
+    struct scclTopoLink links[SCCL_TOPO_MAX_LINKS]; // Link list
+    // Precomputed paths to other nodes, one array per node type
+    struct scclTopoLinkList* paths[SCCL_TOPO_NODE_TYPES];
+    // Used during search
+    uint64_t used;
+};
+
+// All nodes of one type.
+struct scclTopoNodeSet {
+    int count;                                      // Number of nodes in use
+    struct scclTopoNode nodes[SCCL_TOPO_MAX_NODES]; // Node array; capacity is SCCL_TOPO_MAX_NODES
+};
+
+// The whole topology: one node set per node type plus system-wide settings.
+struct scclTopoSystem {
+    struct scclTopoNodeSet nodes[SCCL_TOPO_NODE_TYPES]; // Node sets, indexed by node type
+    float maxBw;                                        // Maximum bandwidth of the system
+    float baseBw;                                       // Base bandwidth
+    float totalBw;                                      // Total bandwidth of the system
+    int type;                                           // System type
+    int nRanks;                                         // Number of ranks in the system
+    int netGdrLevel;                                    // Network GDR level
+    int tuning;                                         // Tuning parameter
+
+    int pivotA2ANumBiRings; // Number of bidirectional rings in pivot A2A mode
+    bool pivotA2AEnabled;   // Whether the pivot A2A communication mode is enabled
+    bool treeDefined;       // Whether a tree structure is defined
+    bool ll128Enabled;      // Whether LL128 mode is enabled
+    bool mscclEnabled;      // Whether MSCCL mode is enabled
+};
+
+// Nominal bandwidth / width constants for the different interconnects.
+// NOTE(review): the unit is not stated here; "12.0 // 100Gbit" below suggests GB/s — confirm.
+#define LOC_BW 5000.0
+#define SM60_NVLINK_BW 18.0
+#define SM70_NVLINK_BW 20.0
+#define SM80_NVLINK_BW 20.0
+#define SM90_NVLINK_BW 20.0
+#define SM86_NVLINK_BW 12.0
+#define PCI_BW 12.0 // PCI Gen3 x16
+#define QPI_BW 6.0
+#define SKL_QPI_BW 10.0
+#define ZPI_BW 6.0
+#define YONGFENG_ZPI_BW 9.0
+#define P9_BW 32.0
+#define ARM_BW 6.0
+#define NET_BW 12.0 // 100Gbit
+#define VEGA_XGMI_WIDTH 24.0
+#define MI200_XGMI_WIDTH 36.0
+#define GFX94X_XGMI_WIDTH 48.0
+
+// Intel CPUs convert GPU P2P traffic into 64-byte PCI TLPs, so GPU-to-GPU traffic consumes more PCI bandwidth.
+// The argument is parenthesized so that a compound expression (e.g. a + b) is scaled as a whole.
+#define INTEL_P2P_OVERHEAD(bw) ((bw) * 6 / 5)
+
+// CPU architecture identifiers.
+enum topoCpuArch {
+    SCCL_TOPO_CPU_ARCH_X86   = 1,
+    SCCL_TOPO_CPU_ARCH_POWER = 2,
+    SCCL_TOPO_CPU_ARCH_ARM   = 3
+};
+
+// CPU vendor identifiers.
+enum topoCpuVendor {
+    SCCL_TOPO_CPU_VENDOR_INTEL   = 1,
+    SCCL_TOPO_CPU_VENDOR_AMD     = 2,
+    SCCL_TOPO_CPU_VENDOR_ZHAOXIN = 3
+};
+
+// CPU model / generation identifiers.
+enum topoCpuType {
+    SCCL_TOPO_CPU_TYPE_BDW      = 1,
+    SCCL_TOPO_CPU_TYPE_SKL      = 2,
+    SCCL_TOPO_CPU_TYPE_ZEN      = 3,
+    SCCL_TOPO_CPU_TYPE_ROME     = 4,
+    SCCL_TOPO_CPU_TYPE_YONGFENG = 5
+};
+
+// Pattern identifiers (tree variants, ring, NVLS).
+// NOTE(review): the enum is named "topoCpuPattern" although the enumerators are SCCL_TOPO_PATTERN_* — confirm the intended name.
+enum topoCpuPattern {
+    SCCL_TOPO_PATTERN_BALANCED_TREE = 1,
+    SCCL_TOPO_PATTERN_SPLIT_TREE    = 2,
+    SCCL_TOPO_PATTERN_TREE          = 3,
+    SCCL_TOPO_PATTERN_RING          = 4,
+    SCCL_TOPO_PATTERN_NVLS          = 5
+};
+
+// Capacity of scclTopoNodeSet::nodes.
+// NOTE(review): scclTopoNodeSet above already uses SCCL_TOPO_MAX_NODES, so a definition must be visible
+// before this point; confirm that defining it again here as a macro is intended.
+#define SCCL_TOPO_MAX_NODES 256
+
+// String table for path types; defined in another translation unit.
+extern const char* topoPathTypeStr[];
+
+// Intel CPU generation codes (same numeric values as SCCL_TOPO_CPU_TYPE_BDW / SCCL_TOPO_CPU_TYPE_SKL).
+#define SCCL_TOPO_CPU_INTEL_BDW 1
+#define SCCL_TOPO_CPU_INTEL_SKL 2
+
+// System type identifiers. Apart from SCCL_TOPO_UNDEF, each value is a distinct power of two,
+// so they can presumably be OR-ed together as flags — confirm against the users of scclTopoSystem::type.
+enum topoSysType {
+    SCCL_TOPO_UNDEF       = -1,
+    SCCL_TOPO_CR8G        = 1,
+    SCCL_TOPO_4P2H_ROME   = 2,
+    SCCL_TOPO_GDR_ALL     = 4,
+    SCCL_TOPO_16P1H       = 8,
+    SCCL_TOPO_FORCE_INTRA = 16,
+    SCCL_TOPO_XGMI_ALL    = 32
+};
+
+// struct scclTopoComm {
+//     int type;
+//     int id;
+
+//     int rank;
+//     int nRanks;
+//     int node;
+//     int nNodes;
+//     int localRank;
+//     int localRanks;
+//     bool dmaBufSupport;
+
+//     struct scclPeerInfo* peerInfo;
+//     sccl::hardware::net::scclNet_t* scclNet;
+// };
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+// Check whether the Hsw driver exists
+bool isHswDriverExist();
+// Get the number of InfiniBand (IB) devices
+int getIBNum();
+
+// Get a topology node
+scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id);
+// Create a topology node
+scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id);
+// Remove a topology node
+// NOTE(review): `id` is a plain int here, while Get/Create take uint64_t and scclTopoIdToIndex takes int64_t — confirm.
+scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int id);
+// Connect two topology nodes
+scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw);
+// Build the system topology from XML
+scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem);
+// Print the system paths
+scclResult_t scclTopoPrint(struct scclTopoSystem* system);
+// Get the compute capability (minimum and maximum)
+scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax);
+// Convert an ID to an index
+scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index);
+// Convert a rank to an index
+scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index);
+// Convert a device ID to a rank
+scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank);
+// Get the XGMI speed
+float scclTopoXGMISpeed(const char* gcn);
+// Get local network information
+scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
+// Get local GPU information
+scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
+// Get CPU type information
+scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
+// Look up the CPU affinity
+scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
+// Get the number of GPUs
+scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
+// Get the number of network interfaces
+scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count);
+// Get the number of NVS nodes
+// NOTE(review): the original comment expanded NVS as "non-volatile memory"; presumably NVSwitch — confirm.
+scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
+// Get the local rank
+scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
+
+// // Get the system topology
+// scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system);
+scclResult_t scclTopoGetSystem(struct scclTopoSystem** system);
+
+} // namespace topo
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
+
+#endif
--- a/src/hardware/topo_bak/topo_utils.h
+++ b/src/hardware/topo_bak/topo_utils.h
+#pragma once
+
+#include <string.h>
+#include "base.h"
+#include "archinfo.h"
+#include "xml.h"
+// #include "net.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace topo {
+
+// Number of node types (matches the six enumerators of topoNodeType below).
+#define SCCL_TOPO_NODE_TYPES 6
+// Capacity of scclTopoNodeSet::nodes.
+// NOTE(review): a macro with the same name and value is #defined further down in this header — confirm both are needed.
+static constexpr int SCCL_TOPO_MAX_NODES = 256;
+// Capacity of scclTopoNode::links.
+#define SCCL_TOPO_MAX_LINKS 32
+// Capacity of scclTopoLinkList::list.
+#define SCCL_TOPO_MAX_HOPS (SCCL_TOPO_MAX_NODES * SCCL_TOPO_NODE_TYPES)
+
+// Hardware topology node types
+typedef enum topoNodeType {
+    GPU = 0, // graphics processing unit
+    PCI = 1, // peripheral component interconnect
+    NVS = 2, // NOTE(review): original comment said "non-volatile memory"; presumably NVSwitch — confirm
+    CPU = 3, // central processing unit; actually a NUMA domain
+    NIC = 4, // network interface controller
+    NET = 5  // network
+} topoNodeType_t;
+// String table for node types; defined in another translation unit.
+extern const char* topoNodeTypeStr[];
+
+// Link types. Their values are chosen to match the path types (topoPathType) wherever possible.
+typedef enum topoLinkType {
+    LINK_LOC = 0, // local link
+    LINK_NVL = 1, // NVLink link
+    // value 2 is reserved for path type PATH_NVB; no link type is defined for it
+    LINK_PCI = 3, // PCI link
+    // value 4 is reserved for path type PATH_PXB; no link type is defined for it
+    // value 5 is reserved for path type PATH_PXN; no link type is defined for it
+    // value 6 is reserved for path type PATH_PHB; no link type is defined for it
+    LINK_SYS = 7, // system link
+    LINK_NET = 8  // network link
+} topoLinkType_t;
+// String table for link types; defined in another translation unit.
+extern const char* topoLinkTypeStr[];
+
+// Path types, describing how two nodes are connected.
+enum topoPathType {
+    PATH_LOC = 0, // local path
+    PATH_NVL = 1, // connected through NVLink
+    PATH_NVB = 2, // connected through NVLink via an intermediate GPU
+    PATH_PIX = 3, // connected through at most one PCIe bridge
+    PATH_PXB = 4, // connected through multiple PCIe bridges (without crossing the PCIe host bridge)
+    PATH_PXN = 5, // GPU and NIC connected via an intermediate GPU
+    PATH_PHB = 6, // connected through PCIe and a PCIe host bridge
+    PATH_SYS = 7, // connected through PCIe and the SMP interconnect between NUMA nodes
+    PATH_NET = 8, // connected through the network
+    PATH_DIS = 9  // disconnected
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct scclTopoNode;
+
+// One link leaving a topology node and pointing at a remote node.
+struct scclTopoLink {
+    // Link type; presumably one of the topoLinkType values — confirm.
+    int type;
+    // Bandwidth of this link (unit not stated in this file).
+    float bw;
+    // Node at the far end of the link.
+    struct scclTopoNode* remNode;
+};
+
+// A path expressed as an ordered list of links, holding at most SCCL_TOPO_MAX_HOPS entries.
+struct scclTopoLinkList {
+    // Type of the path as a whole; presumably one of the topoPathType values — confirm.
+    int type;
+    // Bandwidth associated with the path (unit not stated in this file).
+    float bw;
+    // Number of entries of `list` that are in use.
+    int count;
+    // Pointers to the links making up the path.
+    struct scclTopoLink* list[SCCL_TOPO_MAX_HOPS];
+};
+
+// One node of the hardware topology graph. The anonymous union carries the data specific to the node type.
+struct scclTopoNode {
+    int type; // node type
+    int id;   // node ID
+    // Type-specific data
+    union {
+        struct {
+            int dev;              // NVML device number
+            int rank;             // rank
+            int cudaCompCap;      // CUDA compute capability
+            int gdrSupport;       // GDR support
+            const char* gcn;      // GCN architecture name
+            hipDeviceArch_t arch; // HIP device architecture
+        } gpu;                    // GPU node
+        struct {
+            uint64_t asic;   // ASIC identifier
+            int port;        // port number
+            float bw;        // bandwidth
+            float latency;   // latency
+            int gdrSupport;  // GDR support
+            int collSupport; // collective operation support
+            int maxChannels; // maximum number of channels
+            int64_t busId;   // bus ID
+        } net;               // network node
+        struct {
+            int arch;           // architecture
+            int vendor;         // vendor
+            int model;          // model
+            cpu_set_t affinity; // CPU affinity
+        } cpu;                  // CPU node
+        struct {
+            uint64_t device; // PCI device
+        } pci;               // PCI node
+    };
+    int nlinks;                                     // number of links in use
+    struct scclTopoLink links[SCCL_TOPO_MAX_LINKS]; // link list
+    // Precomputed paths to GPUs and NICs (one entry per node type)
+    struct scclTopoLinkList* paths[SCCL_TOPO_NODE_TYPES];
+    // Used during search
+    uint64_t used;
+};
+
+// Fixed-capacity collection of topology nodes.
+struct scclTopoNodeSet {
+    int count;                                      // number of nodes in use
+    struct scclTopoNode nodes[SCCL_TOPO_MAX_NODES]; // node array; capacity is SCCL_TOPO_MAX_NODES
+};
+
+// Whole-system topology: one node set per node type plus system-wide settings.
+struct scclTopoSystem {
+    struct scclTopoNodeSet nodes[SCCL_TOPO_NODE_TYPES]; // node sets, one per node type
+    float maxBw;                                        // maximum bandwidth of the system
+    float baseBw;                                       // base bandwidth
+    float totalBw;                                      // total bandwidth of the system
+    int type;                                           // system type
+    int nRanks;                                         // number of ranks in the system
+    int netGdrLevel;                                    // network GDR level
+    int tuning;                                         // tuning parameter
+
+    int pivotA2ANumBiRings; // number of bidirectional rings in pivot A2A mode
+    bool pivotA2AEnabled;   // whether pivot A2A communication is enabled
+    bool treeDefined;       // whether a tree structure has been defined
+    bool ll128Enabled;      // whether LL128 mode is enabled
+    bool mscclEnabled;      // whether MSCCL mode is enabled
+};
+
+// Nominal bandwidth / width constants for the different interconnects.
+// NOTE(review): the unit is not stated here; "12.0 // 100Gbit" below suggests GB/s — confirm.
+#define LOC_BW 5000.0
+#define SM60_NVLINK_BW 18.0
+#define SM70_NVLINK_BW 20.0
+#define SM80_NVLINK_BW 20.0
+#define SM90_NVLINK_BW 20.0
+#define SM86_NVLINK_BW 12.0
+#define PCI_BW 12.0 // PCI Gen3 x16
+#define QPI_BW 6.0
+#define SKL_QPI_BW 10.0
+#define ZPI_BW 6.0
+#define YONGFENG_ZPI_BW 9.0
+#define P9_BW 32.0
+#define ARM_BW 6.0
+#define NET_BW 12.0 // 100Gbit
+#define VEGA_XGMI_WIDTH 24.0
+#define MI200_XGMI_WIDTH 36.0
+#define GFX94X_XGMI_WIDTH 48.0
+
+// Intel CPUs convert GPU P2P traffic into 64-byte PCI TLPs, so GPU-to-GPU traffic consumes more PCI bandwidth.
+// The argument is parenthesized so that a compound expression (e.g. a + b) is scaled as a whole.
+#define INTEL_P2P_OVERHEAD(bw) ((bw) * 6 / 5)
+
+// CPU architecture identifiers.
+enum topoCpuArch {
+    SCCL_TOPO_CPU_ARCH_X86   = 1,
+    SCCL_TOPO_CPU_ARCH_POWER = 2,
+    SCCL_TOPO_CPU_ARCH_ARM   = 3
+};
+
+// CPU vendor identifiers.
+enum topoCpuVendor {
+    SCCL_TOPO_CPU_VENDOR_INTEL   = 1,
+    SCCL_TOPO_CPU_VENDOR_AMD     = 2,
+    SCCL_TOPO_CPU_VENDOR_ZHAOXIN = 3
+};
+
+// CPU model / generation identifiers.
+enum topoCpuType {
+    SCCL_TOPO_CPU_TYPE_BDW      = 1,
+    SCCL_TOPO_CPU_TYPE_SKL      = 2,
+    SCCL_TOPO_CPU_TYPE_ZEN      = 3,
+    SCCL_TOPO_CPU_TYPE_ROME     = 4,
+    SCCL_TOPO_CPU_TYPE_YONGFENG = 5
+};
+
+// Pattern identifiers (tree variants, ring, NVLS).
+// NOTE(review): the enum is named "topoCpuPattern" although the enumerators are SCCL_TOPO_PATTERN_* — confirm the intended name.
+enum topoCpuPattern {
+    SCCL_TOPO_PATTERN_BALANCED_TREE = 1,
+    SCCL_TOPO_PATTERN_SPLIT_TREE    = 2,
+    SCCL_TOPO_PATTERN_TREE          = 3,
+    SCCL_TOPO_PATTERN_RING          = 4,
+    SCCL_TOPO_PATTERN_NVLS          = 5
+};
+
+// NOTE(review): this header already declares `static constexpr int SCCL_TOPO_MAX_NODES = 256;` near its top.
+// From here on the preprocessor replaces the identifier with 256, so any later declaration using that
+// name would no longer parse — confirm whether this macro is still needed.
+#define SCCL_TOPO_MAX_NODES 256
+
+// String table for path types; defined in another translation unit.
+extern const char* topoPathTypeStr[];
+
+// Intel CPU generation codes (same numeric values as SCCL_TOPO_CPU_TYPE_BDW / SCCL_TOPO_CPU_TYPE_SKL).
+#define SCCL_TOPO_CPU_INTEL_BDW 1
+#define SCCL_TOPO_CPU_INTEL_SKL 2
+
+// System type identifiers. Apart from SCCL_TOPO_UNDEF, each value is a distinct power of two,
+// so they can presumably be OR-ed together as flags — confirm against the users of scclTopoSystem::type.
+enum topoSysType {
+    SCCL_TOPO_UNDEF       = -1,
+    SCCL_TOPO_CR8G        = 1,
+    SCCL_TOPO_4P2H_ROME   = 2,
+    SCCL_TOPO_GDR_ALL     = 4,
+    SCCL_TOPO_16P1H       = 8,
+    SCCL_TOPO_FORCE_INTRA = 16,
+    SCCL_TOPO_XGMI_ALL    = 32
+};
+
+} // namespace topo
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/topo_bak/xml.cc
+++ b/src/hardware/topo_bak/xml.cc
+/*************************************************************************
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include "check.h"
+#include "nvmlwrap.h"
+#include "xml.h"
+#include "rocm_smi_wrap.h"
+#include "archinfo.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace topo {
+
+/**************/
+/* XML Struct */
+/* Functions  */
+/**************/
+
+/**
+ * Find the position of an attribute in a node's attribute table.
+ *
+ * @param node     XML node to search
+ * @param attrName key to look for (compared over at most MAX_STR_LEN characters)
+ * @param index    receives the position of the first match, or -1 when there is none
+ * @return always scclSuccess; a missing attribute is only reported through *index
+ */
+scclResult_t xmlGetAttrIndex(struct scclXmlNode* node, const char* attrName, int* index) {
+    *index = -1;
+    const int total = node->nAttrs;
+    int pos         = 0;
+    while(pos < total) {
+        const bool sameKey = strncmp(node->attrs[pos].key, attrName, MAX_STR_LEN) == 0;
+        if(sameKey) {
+            *index = pos;
+            break;
+        }
+        ++pos;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Fetch the string value of an attribute.
+ *
+ * @param node     XML node to search
+ * @param attrName key of the attribute
+ * @param value    receives a pointer to the value stored inside the node, or NULL when the key is absent
+ * @return scclSuccess when the lookup itself succeeded (a missing attribute is not an error here)
+ */
+scclResult_t xmlGetAttr(struct scclXmlNode* node, const char* attrName, const char** value) {
+    int pos;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &pos));
+    if(pos == -1) {
+        *value = NULL;
+    } else {
+        *value = node->attrs[pos].value;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Fetch the string value of a mandatory attribute.
+ *
+ * @param node     XML node to search
+ * @param attrName key of the attribute
+ * @param value    receives a pointer to the value stored inside the node
+ * @return scclSuccess when the attribute exists; scclInternalError (after a warning) when it does not
+ */
+scclResult_t xmlGetAttrStr(struct scclXmlNode* node, const char* attrName, const char** value) {
+    SCCLCHECK(xmlGetAttr(node, attrName, value));
+    if(*value != NULL)
+        return scclSuccess;
+
+    WARN("Attribute %s of node %s not found", attrName, node->name);
+    return scclInternalError;
+}
+/**
+ * Read a mandatory attribute and convert it to an integer.
+ *
+ * @param node     XML node to search
+ * @param attrName key of the attribute
+ * @param value    receives the converted value
+ * @return scclSuccess on success, or the error reported by xmlGetAttrStr when the attribute is missing
+ *
+ * @note The text is converted with strtol() using base 0, so "0x" and leading-zero prefixes are honoured.
+ *       The conversion result itself is not validated.
+ */
+scclResult_t xmlGetAttrInt(struct scclXmlNode* node, const char* attrName, int* value) {
+    const char* text;
+    SCCLCHECK(xmlGetAttrStr(node, attrName, &text));
+    const long parsed = strtol(text, NULL, 0);
+    *value            = parsed;
+    return scclSuccess;
+}
+
+/**
+ * Read an optional integer attribute, falling back to a default when it is absent.
+ *
+ * @param node         XML node to search
+ * @param attrName     key of the attribute
+ * @param value        receives the converted value, or defaultValue when the attribute does not exist
+ * @param defaultValue value to report for a missing attribute
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlGetAttrIntDefault(struct scclXmlNode* node, const char* attrName, int* value, int defaultValue) {
+    const char* text;
+    SCCLCHECK(xmlGetAttr(node, attrName, &text));
+    if(text == NULL) {
+        *value = defaultValue;
+    } else {
+        // Base 0 lets strtol() accept decimal, hexadecimal ("0x") and octal ("0") notation.
+        *value = strtol(text, NULL, 0);
+    }
+    return scclSuccess;
+}
+
+/**
+ * @brief Initialise an integer attribute: the value is only written when the attribute does not exist yet.
+ *
+ * An existing attribute is left untouched.
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute
+ * @param value    integer stored (formatted with "%d") when the attribute is created
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlInitAttrInt(struct scclXmlNode* node, const char* attrName, const int value) {
+    int index;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+    if(index == -1) {
+        index = node->nAttrs++;
+        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+        // strncpy() does not terminate the key when attrName has MAX_STR_LEN or more characters;
+        // terminate explicitly, exactly as xmlSetAttrInt() does.
+        node->attrs[index].key[MAX_STR_LEN] = '\0';
+        snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
+    }
+    return scclSuccess;
+}
+
+/**
+ * Initialise a uint64 attribute: the value is only written when the attribute does not exist yet.
+ *
+ * An existing attribute is left untouched (it is NOT overwritten).
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute
+ * @param value    value stored in hexadecimal ("0x..." form) when the attribute is created
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlInitAttrUint64(struct scclXmlNode* node, const char* attrName, const uint64_t value) {
+    int index;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+    if(index == -1) {
+        index = node->nAttrs++;
+        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+        // strncpy() does not terminate the key when attrName has MAX_STR_LEN or more characters;
+        // terminate explicitly, exactly as the xmlSetAttr*() functions do.
+        node->attrs[index].key[MAX_STR_LEN] = '\0';
+        // uint64_t is not `unsigned long` on every platform, so convert explicitly to match the format specifier.
+        snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%llx", (unsigned long long)value);
+    }
+    return scclSuccess;
+}
+
+/**
+ * Read a mandatory attribute and convert it to a float with strtof().
+ *
+ * @param node     XML node to search
+ * @param attrName key of the attribute
+ * @param value    receives the converted value
+ * @return scclSuccess on success, or the error reported by xmlGetAttrStr when the attribute is missing
+ */
+scclResult_t xmlGetAttrFloat(struct scclXmlNode* node, const char* attrName, float* value) {
+    const char* text;
+    SCCLCHECK(xmlGetAttrStr(node, attrName, &text));
+    const float parsed = strtof(text, NULL);
+    *value             = parsed;
+    return scclSuccess;
+}
+
+/**
+ * Initialise a float attribute: the value is only written when the attribute does not exist yet.
+ *
+ * An existing attribute is left untouched.
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute
+ * @param value    value stored (formatted with "%f") when the attribute is created
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlInitAttrFloat(struct scclXmlNode* node, const char* attrName, const float value) {
+    int index;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+    if(index == -1) {
+        index = node->nAttrs++;
+        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+        // strncpy() does not terminate the key when attrName has MAX_STR_LEN or more characters;
+        // terminate explicitly, exactly as xmlSetAttrFloat() does.
+        node->attrs[index].key[MAX_STR_LEN] = '\0';
+        snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value);
+    }
+    return scclSuccess;
+}
+
+/**
+ * Find the first node of the document whose element name equals tagName.
+ *
+ * @param xml     document to search (the first xml->maxIndex nodes are scanned in order)
+ * @param tagName element name to match exactly
+ * @param node    receives the first matching node, or NULL when none matches
+ * @return always scclSuccess
+ */
+scclResult_t xmlFindTag(struct scclXml* xml, const char* tagName, struct scclXmlNode** node) {
+    *node = NULL;
+    int i = 0;
+    while(i < xml->maxIndex) {
+        struct scclXmlNode* candidate = &xml->nodes[i];
+        if(strcmp(candidate->name, tagName) == 0) {
+            *node = candidate;
+            break;
+        }
+        i++;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Find the first node of the document with a given element name and a given attribute value.
+ *
+ * @param xml       document to search (the first xml->maxIndex nodes are scanned in order)
+ * @param tagName   element name to match exactly
+ * @param node      receives the first matching node, or NULL when none matches
+ * @param attrName  key of the attribute that must be present
+ * @param attrValue value the attribute must have (exact string comparison)
+ * @return scclSuccess unless an attribute lookup reports an error
+ */
+scclResult_t xmlFindTagKv(struct scclXml* xml, const char* tagName, struct scclXmlNode** node, const char* attrName, const char* attrValue) {
+    *node = NULL;
+    for(int i = 0; i < xml->maxIndex; i++) {
+        struct scclXmlNode* candidate = &xml->nodes[i];
+        if(strcmp(candidate->name, tagName) != 0)
+            continue;
+
+        const char* found;
+        SCCLCHECK(xmlGetAttr(candidate, attrName, &found));
+        if(found == NULL || strcmp(found, attrValue) != 0)
+            continue;
+
+        *node = candidate;
+        return scclSuccess;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Set a string attribute, creating it when it does not exist and overwriting it when it does.
+ *
+ * Key and value are each truncated to MAX_STR_LEN characters and always NUL-terminated.
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute
+ * @param value    new value
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlSetAttr(struct scclXmlNode* node, const char* attrName, const char* value) {
+    int slot;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
+    if(slot == -1) {
+        // Unknown key: take the next free slot and store the key there.
+        slot = node->nAttrs;
+        node->nAttrs += 1;
+        char* keyBuf = node->attrs[slot].key;
+        strncpy(keyBuf, attrName, MAX_STR_LEN);
+        keyBuf[MAX_STR_LEN] = '\0';
+    }
+    char* valueBuf = node->attrs[slot].value;
+    strncpy(valueBuf, value, MAX_STR_LEN);
+    valueBuf[MAX_STR_LEN] = '\0';
+    return scclSuccess;
+}
+
+/**
+ * Set a string attribute only when the node does not have it yet.
+ *
+ * Key and value are each truncated to MAX_STR_LEN characters and always NUL-terminated.
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute
+ * @param value    value stored when the attribute is created
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlSetAttrIfUnset(struct scclXmlNode* node, const char* attrName, const char* value) {
+    int slot;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
+    if(slot == -1) {
+        slot = node->nAttrs;
+        node->nAttrs += 1;
+
+        char* keyBuf = node->attrs[slot].key;
+        strncpy(keyBuf, attrName, MAX_STR_LEN);
+        keyBuf[MAX_STR_LEN] = '\0';
+
+        char* valueBuf = node->attrs[slot].value;
+        strncpy(valueBuf, value, MAX_STR_LEN);
+        valueBuf[MAX_STR_LEN] = '\0';
+    }
+    return scclSuccess;
+}
+
+/**
+ * Set an integer attribute, creating it when it does not exist and overwriting it when it does.
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute (truncated to MAX_STR_LEN characters)
+ * @param value    new value, stored formatted with "%d"
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlSetAttrInt(struct scclXmlNode* node, const char* attrName, const int value) {
+    int slot;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
+    if(slot == -1) {
+        // Unknown key: take the next free slot and store the key there.
+        slot = node->nAttrs;
+        node->nAttrs += 1;
+        char* keyBuf = node->attrs[slot].key;
+        strncpy(keyBuf, attrName, MAX_STR_LEN);
+        keyBuf[MAX_STR_LEN] = '\0';
+    }
+    char* valueBuf = node->attrs[slot].value;
+    snprintf(valueBuf, MAX_STR_LEN, "%d", value);
+    valueBuf[MAX_STR_LEN] = '\0';
+    return scclSuccess;
+}
+
+/**
+ * Set a float attribute, creating it when it does not exist and overwriting it when it does.
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute (truncated to MAX_STR_LEN characters)
+ * @param value    new value, stored formatted with "%g"
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlSetAttrFloat(struct scclXmlNode* node, const char* attrName, const float value) {
+    int slot;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
+    if(slot == -1) {
+        // Unknown key: take the next free slot and store the key there.
+        slot = node->nAttrs;
+        node->nAttrs += 1;
+        char* keyBuf = node->attrs[slot].key;
+        strncpy(keyBuf, attrName, MAX_STR_LEN);
+        keyBuf[MAX_STR_LEN] = '\0';
+    }
+    char* valueBuf = node->attrs[slot].value;
+    snprintf(valueBuf, MAX_STR_LEN, "%g", value);
+    valueBuf[MAX_STR_LEN] = '\0';
+    return scclSuccess;
+}
+
+/**
+ * Remove an attribute from a node, keeping the remaining attributes in their original order.
+ *
+ * @param node     XML node to update
+ * @param attrName key of the attribute to remove; nothing happens when it is absent
+ * @return scclSuccess when the lookup itself succeeded
+ */
+scclResult_t xmlUnsetAttr(struct scclXmlNode* node, const char* attrName) {
+    int pos;
+    SCCLCHECK(xmlGetAttrIndex(node, attrName, &pos));
+    if(pos != -1) {
+        // Close the gap by sliding every later attribute down by one slot.
+        for(int dst = pos; dst + 1 < node->nAttrs; dst++) {
+            strcpy(node->attrs[dst].key, node->attrs[dst + 1].key);
+            strcpy(node->attrs[dst].value, node->attrs[dst + 1].value);
+        }
+        node->nAttrs--;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Find the first direct child of a node with a given element name.
+ *
+ * @param node    parent node
+ * @param subName element name to match exactly
+ * @param sub     receives the first matching child, or NULL when none matches
+ * @return always scclSuccess
+ */
+scclResult_t xmlGetSub(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub) {
+    *sub      = NULL;
+    int child = 0;
+    while(child < node->nSubs) {
+        struct scclXmlNode* candidate = node->subs[child];
+        if(strcmp(candidate->name, subName) == 0) {
+            *sub = candidate;
+            break;
+        }
+        child++;
+    }
+    return scclSuccess;
+}
+
+/**
+ * Find the first direct child of a node with a given element name and a given attribute value.
+ *
+ * @param node      parent node
+ * @param subName   element name to match exactly
+ * @param sub       receives the first matching child, or NULL when none matches
+ * @param attrName  key of the attribute that must be present
+ * @param attrValue value the attribute must have (exact string comparison)
+ * @return scclSuccess unless an attribute lookup reports an error
+ */
+scclResult_t xmlGetSubKv(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const char* attrValue) {
+    *sub = NULL;
+    for(int child = 0; child < node->nSubs; child++) {
+        struct scclXmlNode* candidate = node->subs[child];
+        if(strcmp(candidate->name, subName) != 0)
+            continue;
+
+        const char* found;
+        SCCLCHECK(xmlGetAttr(candidate, attrName, &found));
+        if(found == NULL || strcmp(found, attrValue) != 0)
+            continue;
+
+        *sub = candidate;
+        return scclSuccess;
+    }
+    return scclSuccess;
+}
+/**
+ * Find the first direct child with a given element name whose attribute equals an integer value.
+ *
+ * The integer is formatted with "%d" and compared as a string, so the attribute must be stored in
+ * plain decimal form to match.
+ *
+ * @param node      parent node
+ * @param subName   element name to match exactly
+ * @param sub       receives the first matching child, or NULL when none matches
+ * @param attrName  key of the attribute that must be present
+ * @param attrValue integer value the attribute must have
+ * @return scclSuccess unless the underlying lookup reports an error
+ */
+scclResult_t xmlGetSubKvInt(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const int attrValue) {
+    // A 32-bit int needs up to 11 characters ("-2147483648") plus the terminating NUL; a 10-byte
+    // buffer silently truncated values with 10 or more characters and made the lookup fail.
+    char strValue[12];
+    snprintf(strValue, sizeof(strValue), "%d", attrValue);
+    SCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue));
+    return scclSuccess;
+}
+
+/**
+ * Append a new node to the document and, when a parent is given, register it as that parent's last child.
+ *
+ * @param xml     document that owns the node storage
+ * @param parent  parent node, or NULL for a node without parent
+ * @param subName element name of the new node (truncated to MAX_STR_LEN characters)
+ * @param sub     receives the new node
+ * @return scclSuccess, or scclInternalError when the document already holds MAX_NODES nodes
+ */
+scclResult_t xmlAddNode(struct scclXml* xml, struct scclXmlNode* parent, const char* subName, struct scclXmlNode** sub) {
+    if(xml->maxIndex == MAX_NODES) {
+        WARN("Error : too many XML nodes (max %d)", MAX_NODES);
+        return scclInternalError;
+    }
+
+    // Take the next unused slot of the node storage.
+    struct scclXmlNode* fresh = &xml->nodes[xml->maxIndex];
+    xml->maxIndex++;
+
+    fresh->nSubs  = 0;
+    fresh->nAttrs = 0;
+    *sub          = fresh;
+    fresh->parent = parent;
+    if(parent != NULL) {
+        parent->subs[parent->nSubs] = fresh;
+        parent->nSubs++;
+    }
+
+    strncpy(fresh->name, subName, MAX_STR_LEN);
+    fresh->name[MAX_STR_LEN] = '\0';
+    return scclSuccess;
+}
+
+/**
+ * Detach a node from its parent and mark it as NODE_TYPE_NONE.
+ *
+ * The node's slot in the document storage is not reclaimed; only the parent's child list is compacted.
+ *
+ * @param node node to remove
+ * @return always scclSuccess
+ */
+scclResult_t xmlRemoveNode(struct scclXmlNode* node) {
+    node->type                 = NODE_TYPE_NONE;
+    struct scclXmlNode* parent = node->parent;
+    if(parent == NULL)
+        return scclSuccess;
+    int shift = 0;
+    for(int s = 0; s < parent->nSubs; s++) {
+        if(parent->subs[s] == node)
+            shift = 1;
+        else if(shift)
+            parent->subs[s - 1] = parent->subs[s];
+    }
+    // Only shrink the child list when the node was actually found in it; otherwise an unrelated
+    // (last) child would be dropped from the parent.
+    if(shift)
+        parent->nSubs--;
+    return scclSuccess;
+}
+
+/**
+ * Translate a string into an integer using a dictionary.
+ *
+ * The dictionary is scanned until the entry whose `str` is NULL; that terminating entry supplies the
+ * fallback value. An entry matches when `str` starts with the entry's string (prefix comparison).
+ *
+ * @param str   text to translate
+ * @param value receives the value of the first matching entry, or the fallback value
+ * @param dict  dictionary terminated by an entry with a NULL string
+ * @return always scclSuccess
+ */
+scclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) {
+    struct kvDict* entry = dict;
+    for(; entry->str != NULL; entry++) {
+        const size_t keyLen = strlen(entry->str);
+        if(strncmp(str, entry->str, keyLen) == 0) {
+            *value = entry->value;
+            return scclSuccess;
+        }
+    }
+    // `entry` now points at the terminating element, whose value is the fallback.
+    INFO(SCCL_LOG_GRAPH, "KV Convert to int : could not find value of '%s' in dictionary, falling back to %d", str, entry->value);
+    *value = entry->value;
+    return scclSuccess;
+}
+/**
+ * Translate an integer into its string using a dictionary.
+ *
+ * @param value integer to translate
+ * @param str   receives the string of the first entry with that value; left untouched when none matches
+ * @param dict  dictionary terminated by an entry with a NULL string
+ * @return scclSuccess on a match; scclInternalError (after a warning) otherwise
+ */
+scclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) {
+    for(struct kvDict* entry = dict; entry->str != NULL; entry++) {
+        if(entry->value != value)
+            continue;
+        *str = entry->str;
+        return scclSuccess;
+    }
+    WARN("KV Convert to str : could not find value %d in dictionary", value);
+    return scclInternalError;
+}
+
+namespace xml {
+/*******************/
+/* XML File Parser */
+/*******************/
+
+/**
+ * Read exactly one character from the stream.
+ *
+ * @param file input stream
+ * @param c    receives the character
+ * @return scclSuccess, or scclInternalError (after a warning) when nothing could be read
+ */
+scclResult_t xmlGetChar(FILE* file, char* c) {
+    const size_t got = fread(c, 1, 1, file);
+    if(got != 0)
+        return scclSuccess;
+
+    WARN("XML Parse : Unexpected EOF");
+    return scclInternalError;
+}
+
+/**
+ * Read an attribute value that follows an '=' sign.
+ *
+ * The value must start with a double or single quote and runs up to the matching closing quote of
+ * the same kind. (When INT_OK is enabled, an unquoted run of digits is accepted as well.)
+ *
+ * @param file  input stream, positioned right after the '='
+ * @param value output buffer; at most MAX_STR_LEN characters plus the terminating NUL are stored.
+ *              NOTE(review): this assumes the buffer holds MAX_STR_LEN + 1 bytes, as the attribute
+ *              value buffers written by xmlSetAttr() do — confirm for any other caller.
+ * @param last  receives the character read right after the value
+ * @return scclSuccess, or scclInternalError on EOF, on a missing opening quote, or when the value is too long
+ */
+scclResult_t xmlGetValue(FILE* file, char* value, char* last) {
+    char c;
+    SCCLCHECK(xmlGetChar(file, &c));
+    if(c != '"' && c != '\'') {
+#if INT_OK
+        int o = 0;
+        do {
+            if(o == MAX_STR_LEN) {
+                WARN("XML Parse : value too long (max %d)", MAX_STR_LEN);
+                return scclInternalError;
+            }
+            value[o++] = c;
+            SCCLCHECK(xmlGetChar(file, &c));
+        } while(c >= '0' && c <= '9');
+        value[o] = '\0';
+        *last    = c;
+        return scclSuccess;
+#else
+        WARN("XML Parse : Expected (double) quote.");
+        return scclInternalError;
+#endif
+    }
+    // Remember which quote opened the value: a value opened with ' must be closed by ', not by ".
+    const char quote = c;
+    int o            = 0;
+    for(;;) {
+        SCCLCHECK(xmlGetChar(file, &c));
+        if(c == quote)
+            break;
+        // Refuse to write past the end of the output buffer.
+        if(o == MAX_STR_LEN) {
+            WARN("XML Parse : value too long (max %d)", MAX_STR_LEN);
+            return scclInternalError;
+        }
+        value[o++] = c;
+    }
+    value[o] = '\0';
+    SCCLCHECK(xmlGetChar(file, last));
+    return scclSuccess;
+}
+
+/**
+ * Read one token: an element name or an attribute key, optionally followed by ="value".
+ *
+ * Characters are copied into `name` until a space, '>', '/', newline or carriage return is read; that
+ * delimiter is overwritten by the terminating NUL and reported through `last`. When '=' is read
+ * first, `name` is terminated at that point and the value is parsed by xmlGetValue(), which sets `last`.
+ *
+ * @param file  input stream
+ * @param name  output buffer for the name / key
+ * @param value output buffer for an attribute value, or NULL when a "=value" part is not allowed
+ * @param last  receives the delimiter that ended the token (or the character following the value)
+ * @return scclSuccess; scclInternalError when '=' appears although value is NULL or when the name
+ *         fills MAX_STR_LEN - 1 characters without a delimiter; read failures go through SCCLCHECK
+ */
+scclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
+    char c;
+    char* ptr = name;
+    int o     = 0;
+    do {
+        SCCLCHECK(xmlGetChar(file, &c));
+        if(c == '=') {
+            // End of an attribute key: terminate it and hand over to the value parser.
+            ptr[o] = '\0';
+            if(value == NULL) {
+                WARN("XML Parse : Unexpected value with name %s", ptr);
+                return scclInternalError;
+            }
+            return xmlGetValue(file, value, last);
+        }
+        ptr[o] = c;
+        // Stop before the name buffer overflows; the partial name is terminated for the message.
+        if(o == MAX_STR_LEN - 1) {
+            ptr[o] = '\0';
+            WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN);
+            return scclInternalError;
+        }
+        o++;
+    } while(c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r');
+    // The delimiter was stored as the last character; replace it with the terminator.
+    ptr[o - 1] = '\0';
+    *last      = c;
+    return scclSuccess;
+}
+
+// Shift the 3-chars string by one char and append c at the end
+#define SHIFT_APPEND(s, c) \
+    do {                   \
+        (s)[0] = (s)[1];   \
+        (s)[1] = (s)[2];   \
+        (s)[2] = (c);      \
+    } while(0)
+/**
+ * Consume the rest of an XML comment, i.e. everything up to and including "-->".
+ *
+ * @param file  input stream, positioned after the token that started with "!--"
+ * @param start characters already read after the "!--" prefix
+ * @param next  delimiter that ended that token
+ * @return scclSuccess, or scclInternalError (after a warning) when the stream ends before "-->"
+ */
+scclResult_t xmlSkipComment(FILE* file, char* start, char next) {
+    // Start from something neutral with \0 at the end.
+    char end[4] = "...";
+
+    // Inject all trailing chars from previous reads. We don't need
+    // to check for --> here because there cannot be a > in the name.
+    const size_t startLen = strlen(start);
+    for(size_t i = 0; i < startLen; i++)
+        SHIFT_APPEND(end, start[i]);
+    SHIFT_APPEND(end, next);
+
+    // Stop when we find "-->"
+    while(strcmp(end, "-->") != 0) {
+        // Read into a char: reading a single byte into an int leaves the other bytes uninitialised
+        // and only yields the right character on little-endian machines.
+        char c;
+        if(fread(&c, 1, 1, file) != 1) {
+            WARN("XML Parse error : unterminated comment");
+            return scclInternalError;
+        }
+        SHIFT_APPEND(end, c);
+    }
+    return scclSuccess;
+}
+
+/**
+ * Parse the next tag from the stream into `node`.
+ *
+ * On return node->type tells what was read: NODE_TYPE_NONE (end of input before any tag),
+ * NODE_TYPE_CLOSE ("</name>"), NODE_TYPE_SINGLE ("<name ... />") or NODE_TYPE_OPEN ("<name ...>").
+ * Comments ("<!-- ... -->") are skipped and the following tag is returned instead.
+ *
+ * @param file input stream
+ * @param node node to fill in (name, attributes, type)
+ * @return scclSuccess, or scclInternalError on malformed input
+ */
+scclResult_t xmlGetNode(FILE* file, struct scclXmlNode* node) {
+    node->type = NODE_TYPE_NONE;
+    char c     = ' ';
+    // Skip leading blanks and line breaks; running out of input here is not an error.
+    while(c == ' ' || c == '\n' || c == '\r') {
+        if(fread(&c, 1, 1, file) == 0)
+            return scclSuccess;
+    }
+    if(c != '<') {
+        WARN("XML Parse error : expecting '<', got '%c'", c);
+        return scclInternalError;
+    }
+    // Read XML element name
+    SCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+
+    // Check for comments
+    if(strncmp(node->name, "!--", 3) == 0) {
+        SCCLCHECK(xmlSkipComment(file, node->name + 3, c));
+        return xmlGetNode(file, node);
+    }
+
+    // Check for closing tag
+    if(node->name[0] == '\0' && c == '/') {
+        node->type = NODE_TYPE_CLOSE;
+        // Re-read the name, we got '/' in the first call
+        SCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+        if(c != '>') {
+            WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
+            return scclInternalError;
+        }
+        return scclSuccess;
+    }
+
+    node->type = NODE_TYPE_OPEN;
+
+    // Get Attributes
+    int a = 0;
+    // Each token that ended on a space is followed by another key="value" pair.
+    while(c == ' ') {
+        SCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
+        if(a == MAX_ATTR_COUNT) {
+            INFO(SCCL_LOG_TOPO, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
+            // Actually we need to still consume the extra attributes so we have an extra one.
+        } else
+            a++;
+    }
+    node->nAttrs = a;
+    if(c == '/') {
+        // Self-closing element: consume what is left up to the closing '>'.
+        node->type = NODE_TYPE_SINGLE;
+        char str[MAX_STR_LEN];
+        SCCLCHECK(xmlGetToken(file, str, NULL, &c));
+    }
+    if(c != '>') {
+        WARN("XML Parse : expected >, got '%c'", c);
+        return scclInternalError;
+    }
+    return scclSuccess;
+}
+
+// Callback that loads the content of an element: receives the stream, the document and the element's node.
+typedef scclResult_t (*xmlHandlerFunc_t)(FILE*, struct scclXml*, struct scclXmlNode*);
+
+// Associates an element name with the callback that loads elements of that name.
+struct xmlHandler {
+    const char* name;      // element name this handler applies to
+    xmlHandlerFunc_t func; // callback invoked for a matching element
+};
+
+/**
+ * Load the children of `head` (or the top-level elements when head is NULL) from the stream.
+ *
+ * Each element whose name matches one of `handlers` is stored in the document, attached to `head`
+ * and passed to the handler's callback. Elements without a handler are parsed and discarded.
+ *
+ * @param file      input stream
+ * @param xml       document that receives the nodes
+ * @param head      element whose content is being loaded, or NULL at top level
+ * @param handlers  handlers for the child elements that should be kept (may be NULL when nHandlers is 0)
+ * @param nHandlers number of entries in `handlers`
+ * @return scclSuccess when the closing tag of `head` (or, at top level, the end of input) is reached;
+ *         scclInternalError on a full node table, an unterminated element or a mismatching closing tag
+ */
+scclResult_t xmlLoadSub(FILE* file, struct scclXml* xml, struct scclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
+    // A self-closing element has no content to load.
+    if(head && head->type == NODE_TYPE_SINGLE)
+        return scclSuccess;
+    while(1) {
+        if(xml->maxIndex == MAX_NODES) {
+            WARN("Error : XML parser is limited to 1024 nodes");
+            return scclInternalError;
+        }
+        // Parse into the next free slot; the slot is only kept (maxIndex advanced) when a handler matches.
+        struct scclXmlNode* node = xml->nodes + xml->maxIndex;
+        memset(node, 0, sizeof(struct scclXmlNode));
+        SCCLCHECK(xmlGetNode(file, node));
+        if(node->type == NODE_TYPE_NONE) {
+            if(head) {
+                WARN("XML Parse : unterminated %s", head->name);
+                return scclInternalError;
+            } else {
+                // All done
+                return scclSuccess;
+            }
+        }
+        if(head && node->type == NODE_TYPE_CLOSE) {
+            if(strcmp(node->name, head->name) != 0) {
+                WARN("XML Mismatch : %s / %s", head->name, node->name);
+                return scclInternalError;
+            }
+            return scclSuccess;
+        }
+        int found = 0;
+        for(int h = 0; h < nHandlers; h++) {
+            if(strcmp(node->name, handlers[h].name) == 0) {
+                if(head)
+                    head->subs[head->nSubs++] = node;
+                node->parent = head;
+                node->nSubs  = 0;
+                xml->maxIndex++;
+                SCCLCHECK(handlers[h].func(file, xml, node));
+                found = 1;
+                break;
+            }
+        }
+        if(!found) {
+            if(nHandlers)
+                INFO(SCCL_LOG_TOPO, "Ignoring element %s", node->name);
+            // NOTE(review): maxIndex was not advanced, so the nested call parses into the very slot
+            // that `node` (its head) occupies and clears it with memset — confirm this reuse is intended.
+            SCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0));
+        }
+    }
+}
+
+/**************/
+/* XML Writer */
+/**************/
+
+/**
+ * Write a node and, recursively, all of its children as XML text.
+ *
+ * A node without children is written as a self-closing tag; children are indented by two more spaces.
+ *
+ * @param indent number of spaces written in front of the node's tags
+ * @param file   output stream
+ * @param node   node to write
+ * @return scclSuccess unless a recursive call reports an error
+ */
+scclResult_t scclTopoDumpXmlRec(int indent, FILE* file, struct scclXmlNode* node) {
+    int pad = indent;
+    while(pad-- > 0)
+        fputc(' ', file);
+    fprintf(file, "<%s", node->name);
+
+    int a = 0;
+    while(a < node->nAttrs) {
+        fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value);
+        a++;
+    }
+
+    if(node->nSubs == 0) {
+        fprintf(file, "/>\n");
+        return scclSuccess;
+    }
+
+    fprintf(file, ">\n");
+    const int childIndent = indent + 2;
+    for(int child = 0; child < node->nSubs; child++) {
+        SCCLCHECK(scclTopoDumpXmlRec(childIndent, file, node->subs[child]));
+    }
+    pad = indent;
+    while(pad-- > 0)
+        fputc(' ', file);
+    fprintf(file, "</%s>\n", node->name);
+    return scclSuccess;
+}
+
+/****************************************/
+/* Parser rules for our specific format */
+/****************************************/
+
+// Handler for a link element (registered under the name "xgmi" by scclTopoXmlLoadGpu):
+// consumes the element's content without keeping any child element.
+scclResult_t scclTopoXmlLoadNvlink(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    SCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+    return scclSuccess;
+}
+
+// Handler for a "gpu" element: keeps its "xgmi" children and ignores any other child element.
+scclResult_t scclTopoXmlLoadGpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler gpuChildren[] = {
+        {"xgmi", scclTopoXmlLoadNvlink},
+    };
+    const int nChildren = (int)(sizeof(gpuChildren) / sizeof(gpuChildren[0]));
+    SCCLCHECK(xmlLoadSub(file, xml, head, gpuChildren, nChildren));
+    return scclSuccess;
+}
+
+// Handler for a "net" element (registered by scclTopoXmlLoadNic):
+// consumes the element's content without keeping any child element.
+scclResult_t scclTopoXmlLoadNet(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    SCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+    return scclSuccess;
+}
+
+// Handler for a "nic" element: keeps its "net" children and ignores any other child element.
+scclResult_t scclTopoXmlLoadNic(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler nicChildren[] = {
+        {"net", scclTopoXmlLoadNet},
+    };
+    const int nChildren = (int)(sizeof(nicChildren) / sizeof(nicChildren[0]));
+    SCCLCHECK(xmlLoadSub(file, xml, head, nicChildren, nChildren));
+    return scclSuccess;
+}
+
+// Handler for a "pci" element: keeps nested "pci", "gpu" and "nic" children and ignores any other child element.
+scclResult_t scclTopoXmlLoadPci(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler pciChildren[] = {
+        {"pci", scclTopoXmlLoadPci},
+        {"gpu", scclTopoXmlLoadGpu},
+        {"nic", scclTopoXmlLoadNic},
+    };
+    const int nChildren = (int)(sizeof(pciChildren) / sizeof(pciChildren[0]));
+    SCCLCHECK(xmlLoadSub(file, xml, head, pciChildren, nChildren));
+    return scclSuccess;
+}
+
+// Handler for a "cpu" element: keeps its "pci" and "nic" children and ignores any other child element.
+scclResult_t scclTopoXmlLoadCpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler cpuChildren[] = {
+        {"pci", scclTopoXmlLoadPci},
+        {"nic", scclTopoXmlLoadNic},
+    };
+    const int nChildren = (int)(sizeof(cpuChildren) / sizeof(cpuChildren[0]));
+    SCCLCHECK(xmlLoadSub(file, xml, head, cpuChildren, nChildren));
+    return scclSuccess;
+}
+
+/**
+ * Handler for the system element: checks the "version" attribute, logs the optional "name"
+ * attribute and loads the "cpu" children.
+ *
+ * @param file input stream
+ * @param xml  document that receives the nodes
+ * @param head node of the system element
+ * @return scclSuccess; scclInvalidUsage when the version differs from SCCL_TOPO_XML_VERSION
+ */
+scclResult_t scclTopoXmlLoadSystem(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    int version;
+    SCCLCHECK(xmlGetAttrInt(head, "version", &version));
+    if(version != SCCL_TOPO_XML_VERSION) {
+        WARN("XML Topology has wrong version %d, %d needed", version, SCCL_TOPO_XML_VERSION);
+        return scclInvalidUsage;
+    }
+
+    const char* name;
+    SCCLCHECK(xmlGetAttr(head, "name", &name));
+    if(name == NULL) {
+        INFO(SCCL_LOG_TOPO, "Loading unnamed topology");
+    } else {
+        INFO(SCCL_LOG_TOPO, "Loading topology %s", name);
+    }
+
+    struct xmlHandler systemChildren[] = {
+        {"cpu", scclTopoXmlLoadCpu},
+    };
+    const int nChildren = (int)(sizeof(systemChildren) / sizeof(systemChildren[0]));
+    SCCLCHECK(xmlLoadSub(file, xml, head, systemChildren, nChildren));
+    return scclSuccess;
+}
+
+/**********************/
+/* XML creation       */
+/* from autodetection */
+/**********************/
+
+// Size in bytes, terminating NUL included, of a full PCI bus id "0000:00:00.0".
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+// Size in bytes, terminating NUL included, of the "domain:bus" prefix "0000:00".
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+// Copy exactly `size` bytes from src to dst, converting each one to lower case.
+// No NUL terminator is appended: the caller decides how many bytes are copied
+// and whether the terminator is part of them.
+static void memcpylower(char* dst, const char* src, const size_t size) {
+    for(size_t i = 0; i < size; i++) {
+        // tolower() is only defined for values representable as unsigned char
+        // (or EOF), so a plain char that may be negative must be cast first.
+        dst[i] = (char)tolower((unsigned char)src[i]);
+    }
+}
+// Resolve the canonical sysfs path of the PCI device identified by busId
+// ("DDDD:BB:DD.F", any case, trailing characters allowed).
+//
+// On success *path receives a buffer allocated by realpath(); the caller must
+// free() it. On any failure *path is set to NULL and an error code is returned,
+// so callers that ignore the return value can simply test *path.
+static scclResult_t getPciPath(const char* busId, char** path) {
+    *path = NULL;
+    // The two copies below read a fixed number of bytes from busId, so it must
+    // hold at least a complete "0000:00:00.0".
+    if(busId == NULL || strlen(busId) < BUSID_SIZE - 1) {
+        WARN("Invalid PCI bus id %s", busId ? busId : "(null)");
+        return scclInvalidArgument;
+    }
+    // Template whose "0000:00" and "0000:00:00.0" placeholders are overwritten in place.
+    char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+    memcpylower(busPath + sizeof("/sys/class/pci_bus/") - 1, busId, BUSID_REDUCED_SIZE - 1);
+    memcpylower(busPath + sizeof("/sys/class/pci_bus/0000:00/../../") - 1, busId, BUSID_SIZE - 1);
+    *path = realpath(busPath, NULL);
+    if(*path == NULL) {
+        WARN("Could not find real path of %s", busPath);
+        return scclSystemError;
+    }
+    return scclSuccess;
+}
+
+// Read the file <path>/<fileName> through scclTopoGetStrFromSys and, when the
+// value read is non-empty, store it on pciNode as attribute attrName.
+// The value read (possibly empty) is logged in both cases.
+scclResult_t scclTopoSetAttrFromSys(struct scclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) {
+    char sysValue[MAX_STR_LEN];
+    SCCLCHECK(scclTopoGetStrFromSys(path, fileName, sysValue));
+
+    const bool gotValue = (sysValue[0] != '\0');
+    if(gotValue) {
+        SCCLCHECK(xmlSetAttr(pciNode, attrName, sysValue));
+    }
+
+    INFO(SCCL_LOG_TOPO, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, sysValue);
+    return scclSuccess;
+}
+
+// Complete a <cpu> node with the attributes it does not have yet
+// (xmlGetAttrIndex reporting -1 is treated as "attribute not set"):
+//   - "affinity": read from /sys/devices/system/node/node<numaid>/cpumap
+//   - "arch":     fixed at compile time (ppc64 / arm64 / x86_64)
+//   - "vendor", "familyid", "modelid": x86_64 only, obtained through CPUID
+// Returns scclInternalError when "affinity" is missing and the node has no
+// "numaid" attribute. The xml parameter is not used.
+scclResult_t scclTopoGetXmlFromCpu(struct scclXmlNode* cpuNode, struct scclXml* xml) {
+    int index;
+    SCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index));
+    if(index == -1) {
+        const char* numaId;
+        SCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId));
+        if(numaId == NULL) {
+            WARN("GetXmlFromCpu : could not find CPU numa ID.");
+            return scclInternalError;
+        }
+        // Set affinity. The buffer is sized for the longest attribute value the
+        // XML can hold, since numaId may come from a user-provided file.
+        char cpumaskPath[sizeof("/sys/devices/system/node/node") + MAX_STR_LEN];
+        snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId);
+        SCCLCHECK(scclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
+    }
+
+    SCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index));
+    if(index == -1) {
+        // Fill CPU type / vendor / model
+#if defined(__PPC__)
+        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64"));
+#elif defined(__aarch64__)
+        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64"));
+#elif defined(__x86_64__)
+        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64"));
+#endif
+    }
+
+#if defined(__x86_64__)
+    SCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index));
+    if(index == -1) {
+        union {
+            struct {
+                // CPUID 0 String register order
+                uint32_t ebx;
+                uint32_t edx;
+                uint32_t ecx;
+            };
+            char vendor[12];
+        } cpuid0;
+
+        // CPUID overwrites EAX as well as EBX/ECX/EDX, so EAX must be declared
+        // as read-write ("+a"), not as a pure input.
+        uint32_t leaf0Eax = 0;
+        asm volatile("cpuid" : "+a"(leaf0Eax), "=b"(cpuid0.ebx), "=c"(cpuid0.ecx), "=d"(cpuid0.edx) : : "memory");
+        (void)leaf0Eax;
+        // The 12 vendor bytes carry no terminator; add one.
+        char vendor[13];
+        memcpy(vendor, cpuid0.vendor, sizeof(cpuid0.vendor));
+        vendor[12] = '\0';
+        SCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor));
+    }
+
+    SCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index));
+    if(index == -1) {
+        union {
+            struct {
+                unsigned steppingId : 4;
+                unsigned modelId : 4;
+                unsigned familyId : 4;
+                unsigned processorType : 2;
+                unsigned resv0 : 2;
+                unsigned extModelId : 4;
+                unsigned extFamilyId : 8;
+                unsigned resv1 : 4;
+            };
+            uint32_t val;
+        } cpuid1;
+        // Only EAX is needed, but EBX/ECX/EDX are overwritten too and must be
+        // listed as outputs so the compiler does not keep live values in them.
+        uint32_t leaf1Ebx, leaf1Ecx, leaf1Edx;
+        asm volatile("cpuid" : "=a"(cpuid1.val), "=b"(leaf1Ebx), "=c"(leaf1Ecx), "=d"(leaf1Edx) : "a"(1) : "memory");
+        (void)leaf1Ebx;
+        (void)leaf1Ecx;
+        (void)leaf1Edx;
+        int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4);
+        int modelId  = cpuid1.modelId + (cpuid1.extModelId << 4);
+        SCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId));
+        SCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId));
+    }
+#endif
+    return scclSuccess;
+}
+
+// Find the "pci" node whose "busid" attribute equals busId; when none exists,
+// add a new "pci" node (with no parent) and set its "busid".
+// The node is returned through pciNode.
+scclResult_t scclTopoGetPciNode(struct scclXml* xml, const char* busId, struct scclXmlNode** pciNode) {
+    SCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));
+    if(*pciNode != NULL) {
+        return scclSuccess;
+    }
+
+    // Not known yet: create it.
+    SCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
+    SCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
+    return scclSuccess;
+}
+
+// Check whether a string is in BDF format or not.
+// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits.
+// There can be trailing chars.
+
+// Return non-zero when c is an ASCII hexadecimal digit, 0 otherwise.
+int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }
+
+// Return 1 when bdf starts with a well-formed "BBBB:BB:DD.F", 0 otherwise
+// (including when bdf is NULL or shorter than 12 characters).
+int checkBDFFormat(char* bdf) {
+    // 'h' stands for one hex digit; any other character must match literally.
+    static const char bdfTemplate[] = "hhhh:hh:hh.h";
+    if(bdf == NULL)
+        return 0;
+    for(int i = 0; bdfTemplate[i] != '\0'; i++) {
+        // Characters are checked in order and a NUL never matches, so a short
+        // string is rejected before anything past its terminator is read.
+        if(bdfTemplate[i] == 'h') {
+            if(isHex(bdf[i]) == 0)
+                return 0;
+        } else if(bdf[i] != bdfTemplate[i]) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+// The worker below recurses through the public entry point, which is defined after it.
+scclResult_t scclTopoGetXmlFromSys(struct scclXmlNode* pciNode, struct scclXml* xml);
+
+// Worker for scclTopoGetXmlFromSys.
+// `path` is the sysfs path of pciNode, or NULL when it could not be resolved.
+// It is truncated in place while walking up the PCI tree and stays owned by
+// the caller, which frees it whatever this function returns.
+static scclResult_t scclTopoFillPciNodeFromSys(struct scclXmlNode* pciNode, struct scclXml* xml, char* path) {
+    // When sysfs is available, "class" is always refreshed from it.
+    if(path) {
+        SCCLCHECK(scclTopoSetAttrFromSys(pciNode, path, "class", "class"));
+    }
+
+    int index;
+    // Identification attributes: read from sysfs only when not already set.
+    // Failures to read them are deliberately ignored (best effort).
+    static const char* const idAttrs[] = {"vendor", "device", "subsystem_vendor", "subsystem_device"};
+    for(size_t a = 0; a < sizeof(idAttrs) / sizeof(idAttrs[0]); a++) {
+        SCCLCHECK(xmlGetAttrIndex(pciNode, idAttrs[a], &index));
+        if(index == -1 && path)
+            (void)scclTopoSetAttrFromSys(pciNode, path, idAttrs[a], idAttrs[a]);
+    }
+
+    SCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
+    if(index == -1) {
+        if(path) {
+            char deviceSpeedStr[MAX_STR_LEN];
+            char portSpeedStr[MAX_STR_LEN];
+            // Initialised so that a missing or unparsable file never leads to
+            // reading an indeterminate value.
+            float deviceSpeed = 0.0f;
+            float portSpeed   = 0.0f;
+            SCCLCHECK(scclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr));
+            const bool haveDevice = (sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed) == 1);
+            SCCLCHECK(scclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr));
+            const bool havePort = (portSpeedStr[0] != '\0') && (sscanf(portSpeedStr, "%f GT/s", &portSpeed) == 1);
+            // Keep the slower of the two links; fall back to whichever one is known.
+            const bool usePort = havePort && (!haveDevice || portSpeed < deviceSpeed);
+            SCCLCHECK(xmlSetAttr(pciNode, "link_speed", usePort ? portSpeedStr : deviceSpeedStr));
+        } else {
+            SCCLCHECK(xmlSetAttr(pciNode, "link_speed", ""));
+        }
+    }
+    SCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index));
+    if(index == -1) {
+        if(path) {
+            char strValue[MAX_STR_LEN];
+            SCCLCHECK(scclTopoGetStrFromSys(path, "max_link_width", strValue));
+            int deviceWidth = strtol(strValue, NULL, 0);
+            SCCLCHECK(scclTopoGetStrFromSys(path, "../max_link_width", strValue));
+            int portWidth;
+            if(strValue[0])
+                portWidth = strtol(strValue, NULL, 0);
+            else
+                portWidth = deviceWidth;
+            SCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth, portWidth)));
+        } else {
+            SCCLCHECK(xmlSetAttr(pciNode, "link_width", ""));
+        }
+    }
+
+    struct scclXmlNode* parent = pciNode->parent;
+    if(parent == NULL) {
+        if(path) {
+            // Save that for later in case next step is a CPU
+            char numaIdStr[MAX_STR_LEN];
+            SCCLCHECK(scclTopoGetStrFromSys(path, "numa_node", numaIdStr));
+            // Workaround kernel bug for now
+            if(strcmp(numaIdStr, "-1") == 0)
+                strcpy(numaIdStr, "0");
+
+            // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI
+            // switch, or stop if we reach a CPU root complex.
+            int slashCount = 0;
+            int parentOffset;
+            for(parentOffset = strlen(path) - 1; parentOffset > 0; parentOffset--) {
+                if(path[parentOffset] == '/') {
+                    slashCount++;
+                    path[parentOffset] = '\0';
+                    int start          = parentOffset - 1;
+                    while(start > 0 && path[start] != '/')
+                        start--;
+                    // Check whether the parent path looks like "BBBB:BB:DD.F" or not.
+                    if(checkBDFFormat(path + start + 1) == 0) {
+                        // This a CPU root complex. Create a CPU tag and stop there.
+                        struct scclXmlNode* topNode;
+                        SCCLCHECK(xmlFindTag(xml, "system", &topNode));
+                        SCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
+                        if(parent == NULL) {
+                            SCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
+                            SCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
+                        }
+                    } else if(slashCount == 2) {
+                        // Continue on the upper PCI switch
+                        for(int i = strlen(path) - 1; i > 0; i--) {
+                            if(path[i] == '/') {
+                                SCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path + i + 1));
+                                if(parent == NULL) {
+                                    SCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
+                                    SCCLCHECK(xmlSetAttr(parent, "busid", path + i + 1));
+                                }
+                                break;
+                            }
+                        }
+                    }
+                }
+                if(parent)
+                    break;
+            }
+        } else {
+            // No information on /sys, attach GPU to unknown CPU
+            SCCLCHECK(xmlFindTagKv(xml, "cpu", &parent, "numaid", "-1"));
+            if(parent == NULL) {
+                struct scclXmlNode* topNode;
+                SCCLCHECK(xmlFindTag(xml, "system", &topNode));
+                SCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
+                SCCLCHECK(xmlSetAttr(parent, "numaid", "-1"));
+                SCCLCHECK(scclTopoGetXmlFromCpu(parent, xml));
+            }
+        }
+        // The walk above can end without finding anything to attach to;
+        // report it instead of dereferencing a NULL parent.
+        if(parent == NULL) {
+            WARN("GetXmlFromSys : could not find a parent for PCI node");
+            return scclInternalError;
+        }
+        if(parent->nSubs >= MAX_SUBS) {
+            WARN("GetXmlFromSys : XML node %s cannot hold more than %d sub-nodes", parent->name, MAX_SUBS);
+            return scclInternalError;
+        }
+        pciNode->parent               = parent;
+        parent->subs[parent->nSubs++] = pciNode;
+    }
+    // Keep filling upwards: a PCI parent is completed from sysfs, a CPU parent
+    // from the CPU detection code.
+    if(strcmp(parent->name, "pci") == 0) {
+        SCCLCHECK(scclTopoGetXmlFromSys(parent, xml));
+    } else if(strcmp(parent->name, "cpu") == 0) {
+        SCCLCHECK(scclTopoGetXmlFromCpu(parent, xml));
+    }
+    return scclSuccess;
+}
+
+// Complete a <pci> node from sysfs (class, ids, link speed/width), attach it
+// to its parent (upper PCI switch, or a <cpu> node when a root complex is
+// reached or sysfs is unavailable), then complete that parent the same way.
+// A bus id that cannot be resolved in sysfs is not an error: the node is then
+// attached to the CPU with numaid "-1".
+scclResult_t scclTopoGetXmlFromSys(struct scclXmlNode* pciNode, struct scclXml* xml) {
+    // Fill info, then parent
+    const char* busId;
+    SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+    char* path = NULL;
+    if(busId != NULL)
+        getPciPath(busId, &path); // best effort: path stays NULL on failure
+
+    // Single exit point so the realpath() buffer is released on every outcome.
+    scclResult_t res = scclTopoFillPciNodeFromSys(pciNode, xml, path);
+    free(path);
+    return res;
+}
+
+// Create (if needed) and complete the <gpu> child of pciNode.
+//
+// rocmDev is the device index to record in the "dev" attribute; the value
+// (uint32_t)-1 means "unknown", in which case the index is looked up from the
+// PCI node's "busid" through hipDeviceGetByPCIBusId.
+// Attributes "sm", "gcn" and "arch" are filled from hipGetDeviceProperties when
+// absent, xGMI links to other devices are added as <xgmi> children, and every
+// <xgmi> child lacking "tclass" gets the PCI class of its target.
+//
+// *gpuNodeRet receives the node, or NULL (with scclSuccess) when no device
+// index could be determined.
+scclResult_t scclTopoGetXmlFromGpu(struct scclXmlNode* pciNode, uint32_t rocmDev, struct scclXml* xml, struct scclXmlNode** gpuNodeRet) {
+    struct scclXmlNode* gpuNode = NULL;
+    SCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode));
+    if(gpuNode == NULL)
+        SCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode));
+
+    int index = -1;
+
+    int dev = -1;
+    SCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
+    if(index == -1) {
+        if(rocmDev == (uint32_t)-1) {
+            const char* busId;
+            SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+            if(busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess)
+                dev = -1;
+        } else {
+            dev = rocmDev;
+        }
+        SCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
+    }
+    SCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
+    if(dev == -1) {
+        *gpuNodeRet = NULL;
+        return scclSuccess;
+    }
+
+    // NOTE(review): the three property queries below ask for device 0, not
+    // `dev`; that is only equivalent when all GPUs are identical -- confirm.
+    SCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
+    if(index == -1) {
+        int hipMajor, hipMinor;
+        hipDeviceProp_t devProp;
+        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
+        hipMajor = devProp.major;
+        hipMinor = devProp.minor;
+        SCCLCHECK(xmlSetAttrInt(gpuNode, "sm", hipMajor * 10 + hipMinor));
+    }
+    int sm;
+    SCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm));
+
+    const char* gcn;
+    const char* gcnArchName;
+    SCCLCHECK(xmlGetAttrIndex(gpuNode, "gcn", &index));
+    if(index == -1) {
+        hipDeviceProp_t devProp;
+        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
+        // extract only the relevant info from the gcnArchName attribute
+        // e.g.: convert "gfx908:sramecc+:xnack-" to "gfx908"
+        // The output buffer is as large as the source string, so the extracted
+        // prefix always fits together with its terminator ("gfx908" alone
+        // already needs 7 bytes).
+        char gcnArchNameSubstr[sizeof(devProp.gcnArchName)];
+        GcnArchNameFormat(devProp.gcnArchName, gcnArchNameSubstr);
+        gcn = gcnArchNameSubstr;
+        SCCLCHECK(xmlSetAttr(gpuNode, "gcn", gcn));
+    }
+    SCCLCHECK(xmlGetAttr(gpuNode, "gcn", &gcn));
+    convertGcnArchToGcnArchName(gcn, &gcnArchName);
+    SCCLCHECK(xmlSetAttr(gpuNode, "gcn", gcnArchName));
+
+    scclHipDeviceArch_t arch;
+    SCCLCHECK(xmlGetAttrIndex(gpuNode, "arch", &index));
+    if(index == -1) {
+        hipDeviceProp_t devProp;
+        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
+        memcpy(&arch.arch, &devProp.arch, sizeof(hipDeviceArch_t));
+        SCCLCHECK(xmlSetAttrInt(gpuNode, "arch", arch.value));
+    }
+    SCCLCHECK(xmlGetAttrInt(gpuNode, "arch", &arch.value));
+
+    // NOTE(review): this looks for an "nvlink" child while the children added
+    // below are named "xgmi", so detection appears to run on every call
+    // (xmlGetSubKv prevents duplicates) -- confirm this is intended.
+    struct scclXmlNode* nvlNode = NULL;
+    SCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode));
+    if(nvlNode == NULL) {
+        const char* busId;
+        SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+        uint32_t deviceCnt;
+        SCCLCHECK(rocm_smi_getNumDevice(&deviceCnt));
+        for(int i = 0; i < deviceCnt; i++) {
+            if(i != dev) {
+                RSMI_IO_LINK_TYPE rsmi_type;
+                int hops, count;
+                if(rocm_smi_getLinkInfo(dev, i, &rsmi_type, &hops, &count) == scclSuccess) {
+                    if(rsmi_type >= RSMI_IOLINK_TYPE_XGMI && hops >= 1) {
+                        char busIdStr[] = "00000000:00:00.0";
+                        SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
+                        // Lower-case copy of the peer bus id, terminator included.
+                        char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+                        for(int c = 0; c < NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+                            lowerId[c] = tolower(busIdStr[c]);
+                            if(busIdStr[c] == 0)
+                                break;
+                        }
+                        SCCLCHECK(xmlGetSubKv(gpuNode, "xgmi", &nvlNode, "target", lowerId));
+                        if(nvlNode == NULL) {
+                            SCCLCHECK(xmlAddNode(xml, gpuNode, "xgmi", &nvlNode));
+                            SCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
+                            SCCLCHECK(xmlSetAttrInt(nvlNode, "count", count));
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // Fill target classes
+    for(int s = 0; s < gpuNode->nSubs; s++) {
+        struct scclXmlNode* sub = gpuNode->subs[s];
+        if(strcmp(sub->name, "xgmi") != 0)
+            continue;
+        SCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index));
+        if(index == -1) {
+            const char* busId;
+            SCCLCHECK(xmlGetAttr(sub, "target", &busId));
+            char* path = NULL;
+            if(busId != NULL)
+                getPciPath(busId, &path); // best effort: path stays NULL on failure
+            scclResult_t res;
+            if(path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) {
+                // Remote NVLink device is not visible inside this VM. Assume NVSwitch.
+                res = xmlSetAttr(sub, "tclass", "0x068000");
+            } else {
+                res = scclTopoSetAttrFromSys(sub, path, "class", "tclass");
+            }
+            // Release the realpath() buffer on every branch before reporting errors.
+            free(path);
+            SCCLCHECK(res);
+        }
+    }
+    *gpuNodeRet = gpuNode;
+    return scclSuccess;
+}
+
+// Returns the subsystem name of a path, i.e. the end of the path
+// where sysPath/subsystem points to.
+// subSys receives an empty string when the link cannot be resolved. The
+// function always returns scclSuccess. subSys must be large enough for the
+// last component of a resolved path (the visible caller passes PATH_MAX bytes).
+scclResult_t scclTopoGetSubsystem(const char* sysPath, char* subSys) {
+    subSys[0] = '\0';
+
+    char subSysPath[PATH_MAX];
+    const int len = snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath);
+    if(len < 0 || (size_t)len >= sizeof(subSysPath)) {
+        // Too long to be a usable path: treat it like an unresolvable link.
+        return scclSuccess;
+    }
+
+    char* path = realpath(subSysPath, NULL);
+    if(path != NULL) {
+        // Keep only what follows the last '/'.
+        const char* lastSlash = strrchr(path, '/');
+        strcpy(subSys, lastSlash ? lastSlash + 1 : path);
+        free(path);
+    }
+    return scclSuccess;
+}
+
+// Recursively trim the tree rooted at node.
+// A node whose "keep" attribute is "1" only has that attribute removed (its
+// children are not visited). Any other node has its children trimmed first and
+// is then removed itself if it has no children left.
+scclResult_t scclTopoTrimXmlRec(struct scclXmlNode* node) {
+    const char* keepFlag;
+    SCCLCHECK(xmlGetAttr(node, "keep", &keepFlag));
+
+    const bool keepNode = (keepFlag != NULL) && (strcmp(keepFlag, "1") == 0);
+    if(keepNode) {
+        SCCLCHECK(xmlUnsetAttr(node, "keep"));
+        return scclSuccess;
+    }
+
+    // Work on a snapshot of the child list: trimming a child may modify
+    // node->subs and node->nSubs under our feet.
+    const int childCount = node->nSubs;
+    struct scclXmlNode* children[MAX_SUBS];
+    memcpy(children, node->subs, node->nSubs * sizeof(struct scclXmlNode*));
+    for(int c = 0; c < childCount; c++) {
+        SCCLCHECK(scclTopoTrimXmlRec(children[c]));
+    }
+
+    if(node->nSubs == 0) {
+        SCCLCHECK(xmlRemoveNode(node));
+    }
+    return scclSuccess;
+}
+
+/**************************************************/
+/* Parser rules for the user-defined graph search */
+/**************************************************/
+
+// Graph-file loader for a "gpu" element: no sub-element handlers are registered.
+scclResult_t scclTopoXmlGraphLoadGpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler* const noHandlers = NULL;
+    SCCLCHECK(xmlLoadSub(file, xml, head, noHandlers, 0));
+    return scclSuccess;
+}
+
+// Graph-file loader for a "net" element: no sub-element handlers are registered.
+scclResult_t scclTopoXmlGraphLoadNet(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler* const noHandlers = NULL;
+    SCCLCHECK(xmlLoadSub(file, xml, head, noHandlers, 0));
+    return scclSuccess;
+}
+
+// Graph-file loader for a "channel" element: "net" and "gpu" sub-elements
+// have registered handlers.
+scclResult_t scclTopoXmlGraphLoadChannel(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler channelSubHandlers[] = {
+        {"net", scclTopoXmlGraphLoadNet},
+        {"gpu", scclTopoXmlGraphLoadGpu},
+    };
+    const int nHandlers = sizeof(channelSubHandlers) / sizeof(channelSubHandlers[0]);
+    SCCLCHECK(xmlLoadSub(file, xml, head, channelSubHandlers, nHandlers));
+    return scclSuccess;
+}
+
+// Graph-file loader for a "graph" element: "channel" sub-elements are handled
+// by scclTopoXmlGraphLoadChannel.
+scclResult_t scclTopoXmlGraphLoadGraph(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
+    struct xmlHandler graphSubHandlers[] = {
+        {"channel", scclTopoXmlGraphLoadChannel},
+    };
+    const int nHandlers = sizeof(graphSubHandlers) / sizeof(graphSubHandlers[0]);
+    SCCLCHECK(xmlLoadSub(file, xml, head, graphSubHandlers, nHandlers));
+    return scclSuccess;
+}
+
+// Loader for the "graphs" element of a graph file.
+// Rejects the file with scclInvalidUsage when its "version" attribute differs
+// from SCCL_GRAPH_XML_VERSION, logs the (optional) "name" attribute, then loads
+// the content with "graph" sub-elements handled by scclTopoXmlGraphLoadGraph.
+scclResult_t scclTopoXmlGraphLoadGraphs(FILE* file, struct scclXml* xmlGraph, struct scclXmlNode* head) {
+    int fileVersion;
+    SCCLCHECK(xmlGetAttrInt(head, "version", &fileVersion));
+    if(fileVersion != SCCL_GRAPH_XML_VERSION) {
+        WARN("XML Graph has wrong version %d, %d needed", fileVersion, SCCL_GRAPH_XML_VERSION);
+        return scclInvalidUsage;
+    }
+
+    const char* topoName;
+    SCCLCHECK(xmlGetAttr(head, "name", &topoName));
+    if(topoName == NULL) {
+        INFO(SCCL_LOG_TOPO, "Loading graphs");
+    } else {
+        INFO(SCCL_LOG_TOPO, "Loading graphs for topology %s", topoName);
+    }
+
+    struct xmlHandler graphsSubHandlers[] = {
+        {"graph", scclTopoXmlGraphLoadGraph},
+    };
+    const int nHandlers = sizeof(graphsSubHandlers) / sizeof(graphsSubHandlers[0]);
+    SCCLCHECK(xmlLoadSub(file, xmlGraph, head, graphsSubHandlers, nHandlers));
+    return scclSuccess;
+}
+
+} // namespace xml
+
+// Load an XML topology file into xml.
+// A file that cannot be opened is not an error: scclSuccess is returned and
+// xml is left untouched (a warning is logged only when warn is non-zero).
+// Parse errors are propagated to the caller.
+scclResult_t scclTopoGetXmlFromFile(const char* xmlTopoFile, struct scclXml* xml, int warn) {
+    FILE* file = fopen(xmlTopoFile, "r");
+    if(file == NULL) {
+        if(warn) {
+            WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
+        }
+        return scclSuccess;
+    }
+    INFO(SCCL_LOG_TOPO, "Loading topology file %s", xmlTopoFile);
+    struct xml::xmlHandler handlers[] = {{"system", xml::scclTopoXmlLoadSystem}};
+    xml->maxIndex                     = 0;
+    // Hold the result so the file is closed before any error is propagated.
+    scclResult_t res = xml::xmlLoadSub(file, xml, NULL, handlers, 1);
+    fclose(file);
+    SCCLCHECK(res);
+    return scclSuccess;
+}
+
+// Write the XML tree starting at xml->nodes to xmlTopoFile.
+// A file that cannot be opened for writing is not an error: a warning is
+// logged and scclSuccess is returned. Errors from the dump itself are
+// propagated to the caller.
+scclResult_t scclTopoDumpXmlToFile(const char* xmlTopoFile, struct scclXml* xml) {
+    FILE* file = fopen(xmlTopoFile, "w");
+    if(file == NULL) {
+        WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
+        return scclSuccess;
+    }
+    // Hold the result so the file is closed before any error is propagated.
+    scclResult_t res = xml::scclTopoDumpXmlRec(0, file, xml->nodes);
+    fclose(file);
+    SCCLCHECK(res);
+    return scclSuccess;
+}
+
+// Make sure the XML contains the PCI hierarchy and the <gpu> node of the GPU
+// at busId, and return that node through gpuNode (NULL when the device index
+// cannot be determined, see scclTopoGetXmlFromGpu).
+scclResult_t scclTopoFillGpu(struct scclXml* xml, const char* busId, struct scclXmlNode** gpuNode) {
+    struct scclXmlNode* node;
+    SCCLCHECK(xml::scclTopoGetPciNode(xml, busId, &node));
+    SCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03"));
+    SCCLCHECK(xml::scclTopoGetXmlFromSys(node, xml));
+
+    // (uint32_t)-1 is the "unknown device" value scclTopoGetXmlFromGpu tests
+    // for. It must be the default: when rocm_smi_init() failed, the lookup
+    // below is skipped and the value would otherwise be indeterminate.
+    uint32_t devIndex = (uint32_t)-1;
+    // rocm_smi_init() is attempted once; 1 = usable, 2 = failed.
+    // NOTE(review): this static is not protected against concurrent callers.
+    static int rocmsmiInit = 0;
+    if(rocmsmiInit == 0) {
+        rocmsmiInit = (rocm_smi_init() != scclSuccess) ? 2 : 1;
+    }
+    if(rocmsmiInit == 1) {
+        if(rocm_smi_getDeviceIndexByPciBusId(busId, &devIndex) != scclSuccess)
+            devIndex = (uint32_t)-1;
+    }
+    SCCLCHECK(xml::scclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode));
+    return scclSuccess;
+}
+
+// Make sure the XML contains a <net> node named netName and return it through
+// netNode. When the node has to be created it is placed under a <nic> node
+// attached either to the PCI device found at the end of pciPath, or -- when
+// pciPath is NULL or is not a PCI device -- to the first <cpu> node.
+// Returns scclInternalError when the bus id taken from pciPath does not fit the
+// bus id buffer, or when there is no <cpu> node to attach a non-PCI NIC to.
+scclResult_t scclTopoFillNet(struct scclXml* xml, const char* pciPath, const char* netName, struct scclXmlNode** netNode) {
+    SCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
+    if(*netNode != NULL)
+        return scclSuccess;
+
+    const char* pciSysPath = pciPath;
+    if(pciSysPath) {
+        char subSystem[PATH_MAX];
+        SCCLCHECK(xml::scclTopoGetSubsystem(pciSysPath, subSystem));
+        // This is not a PCI device (virtual, usb, ...).
+        if(strcmp(subSystem, "pci") != 0) {
+            INFO(SCCL_LOG_TOPO, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
+            pciSysPath = NULL;
+        }
+    }
+
+    struct scclXmlNode* parent = NULL;
+    if(pciSysPath) {
+        // The bus id is the last component of the sysfs path.
+        const char* lastSlash = strrchr(pciSysPath, '/');
+        const char* busIdStr  = lastSlash ? lastSlash + 1 : pciSysPath;
+        char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+        if(strlen(busIdStr) >= sizeof(busId)) {
+            WARN("Topology detection: PCI bus id in path %s is too long", pciSysPath);
+            return scclInternalError;
+        }
+        strcpy(busId, busIdStr);
+        SCCLCHECK(xml::scclTopoGetPciNode(xml, busId, &parent));
+        SCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02"));
+        SCCLCHECK(xml::scclTopoGetXmlFromSys(parent, xml));
+    } else {
+        // Virtual NIC, no PCI device, attach to first CPU
+        SCCLCHECK(xmlFindTag(xml, "cpu", &parent));
+        if(parent == NULL) {
+            WARN("Topology detection: no CPU node to attach network %s to", netName);
+            return scclInternalError;
+        }
+    }
+
+    struct scclXmlNode* nicNode = NULL;
+    SCCLCHECK(xmlGetSub(parent, "nic", &nicNode));
+    if(nicNode == NULL) {
+        SCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode));
+    }
+
+    // We know that this net does not exist yet (we searched for it at the
+    // beginning of this function), so we can add it.
+    SCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode));
+    SCCLCHECK(xmlSetAttr(*netNode, "name", netName));
+    return scclSuccess;
+}
+
+// Load an XML graph file into xml.
+// Unlike the topology loader, a file that cannot be opened is an error
+// (scclSystemError). Parse errors are propagated to the caller.
+scclResult_t scclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct scclXml* xml) {
+    FILE* file = fopen(xmlGraphFile, "r");
+    if(file == NULL) {
+        WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno));
+        return scclSystemError;
+    }
+    struct xml::xmlHandler handlers[] = {{"graphs", xml::scclTopoXmlGraphLoadGraphs}};
+    xml->maxIndex                     = 0;
+    // Hold the result so the file is closed before any error is propagated.
+    scclResult_t res = xml::xmlLoadSub(file, xml, NULL, handlers, 1);
+    fclose(file);
+    SCCLCHECK(res);
+    return scclSuccess;
+}
+
+// Trim the whole XML tree, starting from its first node (see scclTopoTrimXmlRec).
+scclResult_t scclTopoTrimXml(struct scclXml* xml) {
+    struct scclXmlNode* firstNode = xml->nodes;
+    SCCLCHECK(xml::scclTopoTrimXmlRec(firstNode));
+    return scclSuccess;
+}
+
+// Read the content of the file <path>/<fileName> into strValue as a string.
+// strValue must provide at least MAX_STR_LEN bytes; longer content is
+// truncated. A trailing newline is removed. When the file is missing, empty or
+// unreadable, strValue is set to "" and the fact is logged.
+// Always returns scclSuccess.
+scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
+    char filePath[PATH_MAX];
+    filePath[0] = '\0';
+    int offset  = 0;
+    const int pathLen = snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName);
+    // A truncated name would designate another file: treat it as unreadable.
+    if(pathLen >= 0 && (size_t)pathLen < sizeof(filePath)) {
+        FILE* file = fopen(filePath, "r");
+        if(file != NULL) {
+            while(feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
+                int len = fread(strValue + offset, 1, MAX_STR_LEN - offset, file);
+                offset += len;
+            }
+            fclose(file);
+        }
+    }
+    if(offset == 0) {
+        strValue[0] = '\0';
+        INFO(SCCL_LOG_TOPO, "Topology detection : could not read %s, ignoring", filePath);
+    } else if(strValue[offset - 1] == '\n' || offset == MAX_STR_LEN) {
+        // Replace the trailing newline; when the buffer is full, the last
+        // byte has to be given up for the terminator.
+        strValue[offset - 1] = '\0';
+    } else {
+        // No trailing newline: keep every byte that was read.
+        strValue[offset] = '\0';
+    }
+    return scclSuccess;
+}
+
+} // namespace topo
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/topo_bak/xml.h
+++ b/src/hardware/topo_bak/xml.h
+#ifndef XML_H_
+#define XML_H_
+
+#include <stdlib.h>
+
+#include "base.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace topo {
+
+///////////////////////////////////////// Basic structs /////////////////////////////////////////
+
+// A few constraints to make the implementation easy
+#define MAX_STR_LEN 255   // longest name / key / value string (buffers add 1 for the terminator)
+#define MAX_ATTR_COUNT 16 // attributes per node
+#define MAX_SUBS 32       // sub-nodes per node
+#define MAX_NODES 1024    // nodes per XML document
+
+// Kind of XML node.
+// NOTE(review): judging by the names, these presumably distinguish opening,
+// closing and self-closing tags; the parser that sets them is not visible
+// here -- confirm.
+typedef enum node_type {
+    NODE_TYPE_NONE   = 0,
+    NODE_TYPE_OPEN   = 1,
+    NODE_TYPE_CLOSE  = 2,
+    NODE_TYPE_SINGLE = 3
+} node_type_t;
+
+// Struct scclXmlNode: represents one XML node
+struct scclXmlNode {
+    char name[MAX_STR_LEN + 1]; // node name
+    struct {
+        char key[MAX_STR_LEN + 1];      // attribute key
+        char value[MAX_STR_LEN + 1];    // attribute value
+    } attrs[MAX_ATTR_COUNT + 1];        // one extra entry is needed to consume an excess attribute
+    int nAttrs;                         // number of attributes
+    int type;                           // node type
+    struct scclXmlNode* parent;         // pointer to the parent node
+    struct scclXmlNode* subs[MAX_SUBS]; // array of pointers to the sub-nodes
+    int nSubs;                          // number of sub-nodes
+};
+
+// Struct scclXml: represents the structure of an XML document
+struct scclXml {
+    struct scclXmlNode nodes[MAX_NODES]; // node array; each node is one element of the XML
+    int maxIndex;                        // largest node index currently used in this XML structure
+};
+
+// One entry of a string <-> integer conversion dictionary
+// (used by kvConvertToInt / kvConvertToStr).
+struct kvDict {
+    const char* str; // string form
+    int value;       // integer form
+};
+
+// View of a hipDeviceArch_t as a plain int, so that it can be stored in and
+// read back from an integer XML attribute. The static_assert guarantees that
+// both members have the same size.
+typedef union {
+    hipDeviceArch_t arch;
+    int value;
+    static_assert(sizeof(hipDeviceArch_t) == sizeof(int), "value must be the same size of hipDeviceArch_t.");
+} scclHipDeviceArch_t;
+
+///////////////////////////////////////// File functions /////////////////////////////////////////
+#define SCCL_TOPO_XML_VERSION 2
+#define SCCL_GRAPH_XML_VERSION 1
+
+// Load the XML topology from a file
+scclResult_t scclTopoGetXmlFromFile(const char* xmlTopoFile, struct scclXml* xml, int warn);
+// Save the XML topology to a file
+scclResult_t scclTopoDumpXmlToFile(const char* xmlTopoFile, struct scclXml* xml);
+
+// Load the XML graph structure from a file
+scclResult_t scclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct scclXml* xml);
+
+/* Auto-detection functions */
+// Fill GPU information into the XML structure, based on the bus ID
+scclResult_t scclTopoFillGpu(struct scclXml* xml, const char* busId, struct scclXmlNode** gpuNode);
+// Fill network information into the XML structure, based on the PCI path and the network name
+scclResult_t scclTopoFillNet(struct scclXml* xml, const char* pciPath, const char* netName, struct scclXmlNode** netNode);
+
+/* Remove unneeded parts */
+// Trim the XML structure, removing the parts that are not needed
+scclResult_t scclTopoTrimXml(struct scclXml* xml);
+
+// Read a string value from a system path
+scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue);
+
+/**************/
+/* XML Struct */
+/* Functions  */
+/**************/
+
+// Get the index of an attribute of an XML node
+scclResult_t xmlGetAttrIndex(struct scclXmlNode* node, const char* attrName, int* index);
+
+// Get the value of an attribute of an XML node, as a string
+scclResult_t xmlGetAttr(struct scclXmlNode* node, const char* attrName, const char** value);
+
+// Get the value of an attribute of an XML node, as a string (similar to xmlGetAttr)
+scclResult_t xmlGetAttrStr(struct scclXmlNode* node, const char* attrName, const char** value);
+
+// Get the value of an attribute of an XML node, as an integer
+scclResult_t xmlGetAttrInt(struct scclXmlNode* node, const char* attrName, int* value);
+
+// Get the value of an attribute of an XML node, as an integer; returns the default value if the attribute does not exist
+scclResult_t xmlGetAttrIntDefault(struct scclXmlNode* node, const char* attrName, int* value, int defaultValue);
+
+// Initialize an integer attribute of an XML node
+scclResult_t xmlInitAttrInt(struct scclXmlNode* node, const char* attrName, const int value);
+
+// Initialize an unsigned 64-bit integer attribute of an XML node
+scclResult_t xmlInitAttrUint64(struct scclXmlNode* node, const char* attrName, const uint64_t value);
+
+// Get the value of an attribute of an XML node, as a float
+scclResult_t xmlGetAttrFloat(struct scclXmlNode* node, const char* attrName, float* value);
+
+// Initialize a float attribute of an XML node
+scclResult_t xmlInitAttrFloat(struct scclXmlNode* node, const char* attrName, const float value);
+
+// Find the node with the given tag name in the XML
+scclResult_t xmlFindTag(struct scclXml* xml, const char* tagName, struct scclXmlNode** node);
+
+// Find the node with the given tag name and attribute value in the XML
+scclResult_t xmlFindTagKv(struct scclXml* xml, const char* tagName, struct scclXmlNode** node, const char* attrName, const char* attrValue);
+
+// Set the value of an attribute of an XML node
+scclResult_t xmlSetAttr(struct scclXmlNode* node, const char* attrName, const char* value);
+
+// Set the value of an attribute of an XML node if the attribute is not set yet
+scclResult_t xmlSetAttrIfUnset(struct scclXmlNode* node, const char* attrName, const char* value);
+
+// Set the value of an attribute of an XML node to an integer
+scclResult_t xmlSetAttrInt(struct scclXmlNode* node, const char* attrName, const int value);
+
+// Set the value of an attribute of an XML node to a float
+scclResult_t xmlSetAttrFloat(struct scclXmlNode* node, const char* attrName, const float value);
+
+// Remove an attribute from an XML node
+scclResult_t xmlUnsetAttr(struct scclXmlNode* node, const char* attrName);
+
+// Get a sub-node of an XML node
+scclResult_t xmlGetSub(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub);
+
+// Get a sub-node of an XML node; the sub-node must match the given attribute value
+scclResult_t xmlGetSubKv(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const char* attrValue);
+
+// Get a sub-node of an XML node; the sub-node must match the given integer attribute value
+scclResult_t xmlGetSubKvInt(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const int attrValue);
+
+// Add a new node to the XML
+scclResult_t xmlAddNode(struct scclXml* xml, struct scclXmlNode* parent, const char* subName, struct scclXmlNode** sub);
+
+// Remove a node from the XML
+scclResult_t xmlRemoveNode(struct scclXmlNode* node);
+
+// Dictionary for string <-> integer conversion; the str of the last element must be NULL
+// Convert a string to an integer
+scclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict);
+
+// Convert an integer to a string
+scclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict);
+
+} // namespace topo
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
+
+#endif
--- a/src/hardware/topology/bootstrap/bootstrap.h
+++ b/src/hardware/topology/bootstrap/bootstrap.h
+
+
+#pragma once
+
+#include <string.h>
+#include "base.h"
+#include "archinfo.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace bootstrap {
+
+///////////
+} // namespace bootstrap
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/topology/bootstrap/bootstrap_net.cpp
+++ b/src/hardware/topology/bootstrap/bootstrap_net.cpp
+#include <unistd.h>
+#include <sys/types.h>
+#include <string.h>
+#include "bootstrap_net.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace bootstrap {
+
+namespace bootstrap_net {
+/* Init functions */
+static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; // name of the interface selected by bootstrapNetInit
+static scclSocketAddress_t bootstrapNetIfAddr;        // address of that interface
+static int bootstrapNetInitDone  = 0;                 // set to 1 once bootstrapNetInit has succeeded
+pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; // taken by bootstrapNetInit around the initialization
+
+/**
+ * @brief Initialize the bootstrap network.
+ *
+ * Selects the network interface used for bootstrap. When the environment
+ * variable "SCCL_COMM_ID" is set, it is parsed as the remote address and an
+ * interface on a matching subnet is looked up; otherwise an available socket
+ * interface is searched for. The selection runs under bootstrapNetLock and is
+ * done at most once successfully; a failed attempt leaves the state
+ * uninitialized, so a later call tries again.
+ *
+ * NOTE(review): the first check of bootstrapNetInitDone is done without the
+ * lock and the flag is a plain int -- confirm this is acceptable for the
+ * targeted platforms.
+ *
+ * @return scclResult_t
+ *      - scclSuccess: initialization succeeded (or was already done)
+ *      - scclInvalidArgument: SCCL_COMM_ID has an invalid format
+ *      - scclSystemError: no interface matches the SCCL_COMM_ID subnet
+ *      - scclInternalError: no usable socket interface was found
+ */
+scclResult_t bootstrapNetInit() {
+    if(bootstrapNetInitDone != 0)
+        return scclSuccess;
+
+    // Every path below goes through the single unlock at the end, so an error
+    // can never leave the mutex locked.
+    scclResult_t res = scclSuccess;
+    pthread_mutex_lock(&bootstrapNetLock);
+    if(bootstrapNetInitDone == 0) {
+        char* env = getenv("SCCL_COMM_ID");
+        if(env) {
+            scclSocketAddress_t remoteAddr;
+            if(net::host::scclSocketGetAddrFromString(&remoteAddr, env) != scclSuccess) {
+                WARN("Invalid SCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+                res = scclInvalidArgument;
+            } else if(net::host::scclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+                WARN("NET/Socket : No usable listening interface found");
+                res = scclSystemError;
+            }
+        } else {
+            int nIfs = net::host::scclFindSocketInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
+            if(nIfs <= 0) {
+                WARN("Bootstrap : no socket interface found");
+                res = scclInternalError;
+            }
+        }
+        if(res == scclSuccess) {
+            // Log " <ifname>:<address>"
+            char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
+            snprintf(line, sizeof(line), " %s:", bootstrapNetIfName);
+            net::host::scclSocketToString(&bootstrapNetIfAddr, line + strlen(line));
+            INFO(SCCL_LOG_BOOTSTRAP, "Bootstrap : Using%s", line);
+            bootstrapNetInitDone = 1;
+        }
+    }
+    pthread_mutex_unlock(&bootstrapNetLock);
+    return res;
+}
+
+// Additional sync functions
+/**
+ * Send one length-prefixed message over a socket.
+ *
+ * @param sock socket to write to
+ * @param data payload to send
+ * @param size payload size in bytes
+ * @return scclResult_t scclSuccess when both writes succeed
+ *
+ * @note The wire format is the payload size as an int (sizeof(int) bytes)
+ *       followed by the payload itself.
+ */
+scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size) {
+    int payloadBytes = size;
+
+    // Header first: tells the receiver how many bytes follow.
+    SCCLCHECK(net::host::scclSocketSend(sock, &payloadBytes, sizeof(payloadBytes)));
+    // Then the payload.
+    SCCLCHECK(net::host::scclSocketSend(sock, data, payloadBytes));
+
+    return scclSuccess;
+}
+
+/**
+ * Receive one length-prefixed message from a socket.
+ *
+ * @param sock socket to read from
+ * @param data buffer that receives the payload
+ * @param size capacity of the buffer in bytes
+ * @return scclResult_t scclSuccess on success, otherwise an error code
+ *
+ * @note The size header (an int) is read first. If the announced size is
+ *       negative or larger than the buffer, no payload is read and
+ *       scclInternalError is returned; nothing is truncated into the buffer.
+ */
+scclResult_t bootstrapNetRecv(scclSocket_t* sock, void* data, int size) {
+    int recvSize = 0;
+    SCCLCHECK(net::host::scclSocketRecv(sock, &recvSize, sizeof(int)));
+    // The size comes from the peer: reject values that cannot be a valid length.
+    if(recvSize < 0) {
+        WARN("Invalid message size : received %d", recvSize);
+        return scclInternalError;
+    }
+    if(recvSize > size) {
+        WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
+        return scclInternalError;
+    }
+    // Here 0 <= recvSize <= size, so the announced size can be used directly.
+    SCCLCHECK(net::host::scclSocketRecv(sock, data, recvSize));
+    return scclSuccess;
+}
+
+} // namespace bootstrap_net
+
+/**
+ * Append a connection that nobody is waiting for yet to the unexpected queue.
+ *
+ * @param state bootstrap state owning the queue
+ * @param peer  rank that opened the connection
+ * @param tag   tag announced by that rank
+ * @param sock  socket of the connection; its contents are copied into the queue entry
+ * @return scclSuccess, or the error from the allocation
+ *
+ * @note The new entry is placed at the tail, so entries are kept in arrival order.
+ *       The entry's `next` link is not written here; this relies on scclCalloc
+ *       returning zeroed memory (presumably, as the name suggests).
+ */
+scclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock) {
+    // Build the new entry
+    struct unexConn* node;
+    SCCLCHECK(scclCalloc(&node, 1));
+    node->peer = peer;
+    node->tag  = tag;
+    memcpy(&node->sock, sock, sizeof(scclSocket_t));
+
+    // Walk to the first empty link (the head when the queue is empty) and hook the entry in
+    struct unexConn** link = &state->unexpectedConnections;
+    while(*link != NULL) {
+        link = &(*link)->next;
+    }
+    *link = node;
+    return scclSuccess;
+}
+
+/**
+ * Look up and remove the first queued unexpected connection for (peer, tag).
+ *
+ * @param state bootstrap state owning the queue
+ * @param peer  rank to match
+ * @param tag   tag to match
+ * @param sock  output: receives a copy of the queued socket when a match exists
+ * @param found output: 1 when a match was removed, 0 otherwise
+ * @return always scclSuccess
+ *
+ * @note On a match the entry is unlinked and freed; `sock` is left untouched
+ *       when nothing matches.
+ */
+scclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock, int* found) {
+    *found = 0;
+
+    // `link` always points at the pointer that references the current entry,
+    // so unlinking works the same for the head and for inner entries.
+    for(struct unexConn** link = &state->unexpectedConnections; *link != NULL; link = &(*link)->next) {
+        struct unexConn* cur = *link;
+        if(cur->peer != peer || cur->tag != tag) {
+            continue;
+        }
+
+        *link = cur->next;
+        memcpy(sock, &cur->sock, sizeof(scclSocket_t));
+        free(cur);
+        *found = 1;
+        return scclSuccess;
+    }
+
+    return scclSuccess;
+}
+
+/**
+ * Release every entry of the unexpected-connection queue.
+ *
+ * All queue entries are freed and the queue head in `state` is reset to NULL,
+ * so the state never keeps a pointer to freed memory and calling this again
+ * (or dequeuing afterwards) is safe.
+ *
+ * @param state bootstrap state owning the queue
+ *
+ * @note NOTE(review): the sockets stored in the entries are not closed here;
+ *       confirm whether the caller is expected to have closed them.
+ */
+static void unexpectedFree(struct bootstrapState* state) {
+    struct unexConn* elem = state->unexpectedConnections;
+    // Detach the list before freeing so the head never dangles.
+    state->unexpectedConnections = NULL;
+
+    while(elem != NULL) {
+        struct unexConn* next = elem->next;
+        free(elem);
+        elem = next;
+    }
+}
+
+/**
+ * Ring-based AllGather over the bootstrap sockets.
+ *
+ * @param commState bootstrap state (struct bootstrapState*)
+ * @param allData   buffer of nranks slices of `size` bytes; slice `rank` is the
+ *                  first one sent, so it must already hold this rank's data
+ * @param size      size of one slice in bytes
+ * @return scclSuccess, or the first error reported by a send/receive
+ *
+ * @note The loop runs nranks-1 steps. In step i this rank sends slice
+ *       (rank-i) on the ring send socket and receives slice (rank-i-1) on the
+ *       ring receive socket, i.e. it forwards what it received one step earlier.
+ */
+scclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
+    struct bootstrapState* state = static_cast<struct bootstrapState*>(commState);
+    char* base                   = static_cast<char*>(allData);
+    const int rank               = state->rank;
+    const int nranks             = state->nranks;
+
+    INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d size %d", rank, nranks, size);
+
+    const int steps = nranks - 1;
+    for(int step = 0; step < steps; ++step) {
+        // Slice forwarded in this step, and slice arriving from the left neighbour
+        const size_t sendIdx = (rank - step + nranks) % nranks;
+        const size_t recvIdx = (rank - step - 1 + nranks) % nranks;
+
+        // Push to the right neighbour, then pull from the left neighbour
+        SCCLCHECK(bootstrap_net::bootstrapNetSend(&state->ringSendSocket, base + sendIdx * size, size));
+        SCCLCHECK(bootstrap_net::bootstrapNetRecv(&state->ringRecvSocket, base + recvIdx * size, size));
+    }
+
+    INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d size %d - DONE", rank, nranks, size);
+    return scclSuccess;
+}
+
+/**
+ * Send a tagged message to a peer over a freshly opened socket.
+ *
+ * A socket is initialised with the peer's entry in peerCommAddresses and
+ * connected; then three length-prefixed messages are sent in order: this
+ * rank, the tag, and the payload. The socket is closed before returning,
+ * whether or not the sends succeeded.
+ *
+ * @param commState bootstrap state (struct bootstrapState*)
+ * @param peer      index of the destination rank
+ * @param tag       message tag
+ * @param data      payload to send
+ * @param size      payload size in bytes
+ * @return scclResult_t scclSuccess on success; otherwise the first failing
+ *         step's code, or the close error if closing fails
+ */
+scclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
+    struct bootstrapState* state = static_cast<struct bootstrapState*>(commState);
+    scclResult_t ret             = scclSuccess;
+    scclSocket_t sock;
+
+    // Open a connection to the peer's listening address
+    SCCLCHECKGOTO(net::host::scclSocketInit(&sock, &state->peerCommAddresses[peer], state->magic, net::host::scclSocketTypeBootstrap), ret, cleanup);
+    SCCLCHECKGOTO(net::host::scclSocketConnect(&sock), ret, cleanup);
+
+    // Identify ourselves (rank, tag), then ship the payload
+    SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, cleanup);
+    SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, &tag, sizeof(int)), ret, cleanup);
+    SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, data, size), ret, cleanup);
+
+cleanup:
+    // Shared by the success and the error paths
+    SCCLCHECK(net::host::scclSocketClose(&sock));
+    return ret;
+}
+
+/**
+ * @brief Receive a tagged message from a given peer.
+ *
+ * The unexpected-connection queue is searched first for a (peer, tag) entry.
+ * If none is queued, new connections are accepted on the listen socket one at
+ * a time; each announces its rank and tag. A connection that does not match is
+ * stored in the unexpected queue for a later call, and accepting continues.
+ * Once a matching connection is at hand, the payload is read from it and the
+ * socket is closed.
+ *
+ * @param commState bootstrap state (struct bootstrapState*)
+ * @param peer      rank the message must come from
+ * @param tag       tag the message must carry
+ * @param data      buffer that receives the payload
+ * @param size      capacity of the buffer in bytes
+ * @return scclResult_t scclSuccess on success; otherwise the first failing
+ *         step's code, or the close error if closing fails
+ */
+scclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
+    struct bootstrapState* state = static_cast<struct bootstrapState*>(commState);
+    scclResult_t ret             = scclSuccess;
+    scclSocket_t sock;
+    int srcPeer, srcTag;
+    int queued;
+
+    // A matching connection may already have arrived during an earlier call
+    SCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &queued));
+
+    if(!queued) {
+        // Accept connections until the expected (peer, tag) shows up
+        for(;;) {
+            SCCLCHECKGOTO(net::host::scclSocketInit(&sock), ret, cleanup);
+            SCCLCHECKGOTO(net::host::scclSocketAccept(&sock, &state->listenSock), ret, cleanup);
+            SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, &srcPeer, sizeof(int)), ret, cleanup);
+            SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, &srcTag, sizeof(int)), ret, cleanup);
+            if(srcPeer == peer && srcTag == tag) {
+                break;
+            }
+            // Someone else's message: park the connection for a later receive
+            SCCLCHECKGOTO(unexpectedEnqueue(state, srcPeer, srcTag, &sock), ret, cleanup);
+        }
+    }
+
+    // `sock` now refers to the matching connection: read the payload
+    SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, data, size), ret, cleanup);
+
+cleanup:
+    // Shared by the success and the error paths
+    SCCLCHECK(net::host::scclSocketClose(&sock));
+    return ret;
+}
+
+/**
+ * Placeholder for the bootstrap initialization.
+ *
+ * No initialization is performed. The function reports an error instead of
+ * falling off the end of a value-returning function, which is undefined
+ * behavior in C++.
+ *
+ * @return scclInternalError, always
+ */
+scclResult_t bootstrapInit() {
+    WARN("Bootstrap : bootstrapInit is not implemented");
+    return scclInternalError;
+}
+
+// /**
+//  * @brief 初始化bootstrap网络通信
+//  *
+//  * 该函数负责初始化bootstrap网络通信环境，包括：
+//  * 1. 创建监听socket供其他rank连接
+//  * 2. 与root节点交换连接信息
+//  * 3. 建立环形通信拓扑
+//  * 4. 收集所有peer的通信地址
+//  * 5. 创建并收集代理服务地址
+//  *
+//  * @param handle bootstrap句柄
+//  * @param comm bootstrap通信上下文
+//  * @return scclResult_t 返回操作结果，scclSuccess表示成功
+//  */
+// scclResult_t bootstrapInit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm) {
+//     int rank   = comm->rank;   // 当前进程的排名
+//     int nranks = comm->nRanks; // 进程的总数
+
+//     struct bootstrapState* state;      // 引导状态结构体
+//     scclSocket_t* proxySocket;         // 代理套接字
+//     scclSocketAddress_t nextAddr;      // 下一个地址
+//     scclSocket_t sock, listenSockRoot; // 套接字和根监听套接字
+//     struct extInfo info = {0};         // 扩展信息结构体
+
+//     SCCLCHECK(scclCalloc(&state, 1));           // 分配引导状态结构体
+//     state->rank      = rank;                    // 设置当前进程的排名
+//     state->nranks    = nranks;                  // 设置进程的总数
+//     state->abortFlag = comm->abortFlag;         // 设置中止标志
+//     comm->bootstrap  = state;                   // 将引导状态结构体赋值给通信结构体
+//     comm->magic = state->magic = handle->magic; // 设置魔术值
+
+//     INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d", rank, nranks); // 打印日志信息
+
+//     info.rank   = rank;   // 设置扩展信息结构体中的排名
+//     info.nranks = nranks; // 设置扩展信息结构体中的进程总数
+//     // 创建套接字供其他进程联系
+//     SCCLCHECK(
+//         net::host::scclSocketInit(&state->listenSock, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
+//     SCCLCHECK(net::host::scclSocketListen(&state->listenSock));                          // 监听套接字
+//     SCCLCHECK(net::host::scclSocketGetAddr(&state->listenSock, &info.extAddressListen)); // 获取监听套接字地址
+
+//     // 创建套接字供根进程联系
+//     SCCLCHECK(net::host::scclSocketInit(&listenSockRoot, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap,
+//     comm->abortFlag)); SCCLCHECK(net::host::scclSocketListen(&listenSockRoot));                              // 监听根进程套接字
+//     SCCLCHECK(net::host::scclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); // 获取根进程监听套接字地址
+
+//     // // 分散连接时间以避免根进程过载
+//     // if(nranks > 128) {
+//     //     long msec = rank;
+//     //     struct timespec tv;
+//     //     tv.tv_sec  = msec / 1000;
+//     //     tv.tv_nsec = 1000000 * (msec % 1000);
+//     //     TRACE(SCCL_LOG_BOOTSTRAP, "rank %d delaying connection to root by %ld msec", rank, msec);
+//     //     (void)nanosleep(&tv, NULL);
+//     // }
+
+//     // 向根进程发送我的监听套接字信息
+//     SCCLCHECK(net::host::scclSocketInit(&sock, &handle->addr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
+//     SCCLCHECK(net::host::scclSocketConnect(&sock));                         // 连接套接字
+//     SCCLCHECK(bootstrap_net::bootstrapNetSend(&sock, &info, sizeof(info))); // 发送扩展信息
+//     SCCLCHECK(net::host::scclSocketClose(&sock));                           // 关闭套接字
+
+//     // 从根进程获取我在引导环中的“下一个”进程的信息
+//     SCCLCHECK(net::host::scclSocketInit(&sock));                                               // 初始化套接字
+//     SCCLCHECK(net::host::scclSocketAccept(&sock, &listenSockRoot));                            // 接受根进程的连接
+//     SCCLCHECK(bootstrap_net::bootstrapNetRecv(&sock, &nextAddr, sizeof(scclSocketAddress_t))); // 接收下一个地址
+//     SCCLCHECK(net::host::scclSocketClose(&sock));                                              // 关闭套接字
+//     SCCLCHECK(net::host::scclSocketClose(&listenSockRoot));                                    // 关闭根监听套接字
+
+//     SCCLCHECK(net::host::scclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
+//     SCCLCHECK(net::host::scclSocketConnect(&state->ringSendSocket)); // 连接环发送套接字
+//     // 接受引导环中前一个进程的连接请求
+//     SCCLCHECK(net::host::scclSocketInit(&state->ringRecvSocket));                       // 初始化环接收套接字
+//     SCCLCHECK(net::host::scclSocketAccept(&state->ringRecvSocket, &state->listenSock)); // 接受连接
+
+//     // 全部收集所有监听处理器
+//     SCCLCHECK(scclCalloc(&state->peerCommAddresses, nranks));                                     // 分配对等通信地址
+//     SCCLCHECK(net::host::scclSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank)); // 获取监听套接字地址
+//     SCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(scclSocketAddress_t)));  // 全部收集地址
+
+//     // 创建服务代理
+//     SCCLCHECK(scclCalloc(&state->peerProxyAddresses, nranks)); // 分配对等代理地址
+
+//     // 代理通过消息中止；不要设置中止标志
+//     SCCLCHECK(scclCalloc(&proxySocket, 1)); // 分配代理套接字
+//     SCCLCHECK(net::host::scclSocketInit(proxySocket, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeProxy, comm->abortFlag));
+//     SCCLCHECK(net::host::scclSocketListen(proxySocket));                                          // 监听代理套接字
+//     SCCLCHECK(net::host::scclSocketGetAddr(proxySocket, state->peerProxyAddresses + rank));       // 获取代理套接字地址
+//     SCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(scclSocketAddress_t))); // 全部收集代理地址
+//     // SCCLCHECK(scclProxyInit(comm, proxySocket, state->peerProxyAddresses));
+
+//     INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks); // 打印完成日志信息
+
+//     return scclSuccess; // 返回成功
+// }
+
+// /**
+//  * @brief 在bootstrap通信中创建新的子通信域
+//  *
+//  * 该函数用于将当前通信域按照指定颜色和键值拆分为子通信域，并建立相应的环状通信拓扑。
+//  *
+//  * @param handle bootstrap句柄
+//  * @param comm 新创建的子通信域
+//  * @param parent 父通信域
+//  * @param color 用于划分通信域的颜色值
+//  * @param key 用于确定新通信域中进程排名的键值
+//  * @param parentRanks 父通信域中的进程排名映射
+//  *
+//  * @return scclResult_t 返回操作结果，成功返回scclSuccess
+//  *
+//  * @note 函数会建立环状通信拓扑，包括：
+//  *       1. 初始化监听socket和环形接收socket
+//  *       2. 与前后节点交换地址信息
+//  *       3. 执行AllGather收集所有节点的通信地址
+//  *       4. 根据配置决定是否共享代理状态或创建新的代理服务
+//  */
+// scclResult_t
+// bootstrapSplit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm, struct scclBootstrapComm* parent, int color, int key, int* parentRanks) {
+//     scclResult_t ret = scclSuccess;
+//     int rank         = comm->rank;
+//     int nranks       = comm->nRanks;
+//     int prev, next;
+//     scclSocketAddress_t listenAddr, tmpAddr;
+//     scclSocket_t* proxySocket;
+//     struct bootstrapState* state;
+
+//     // SCCLCHECKGOTO(scclCalloc(&state, 1), ret, fail);
+//     // state->rank      = rank;
+//     // state->nranks    = nranks;
+//     // state->abortFlag = comm->abortFlag;
+//     // comm->bootstrap  = state;
+//     // comm->magic = state->magic = handle->magic;
+
+//     // prev = parentRanks[(rank - 1 + nranks) % nranks];
+//     // next = parentRanks[(rank + 1) % nranks];
+
+//     // // Setup my sockets for the allgather ring and other p2p connections
+//     // SCCLCHECKGOTO(
+//     //     net::host::scclSocketInit(&state->listenSock, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap,
+//     comm->abortFlag,
+//     //     0), ret, fail);
+//     // SCCLCHECKGOTO(net::host::scclSocketInit(&state->ringRecvSocket, NULL, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag, 0), ret,
+//     fail);
+
+//     // // Create socket for other ranks to contact me
+//     // SCCLCHECKGOTO(net::host::scclSocketListen(&state->listenSock), ret, fail);
+
+//     // // Get addr from next rank
+//     // SCCLCHECKGOTO(net::host::scclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail);
+//     // SCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(scclSocketAddress_t)), ret, fail);
+//     // SCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(scclSocketAddress_t)), ret, fail);
+
+//     // SCCLCHECKGOTO(net::host::scclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag, 0), ret,
+//     // fail); SCCLCHECKGOTO(net::host::scclSocketConnect(&state->ringSendSocket), ret, fail);
+//     // // Accept the connect request from the previous rank in the AllGather ring
+//     // SCCLCHECKGOTO(net::host::scclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail);
+
+//     // // AllGather all listen handlers
+//     // SCCLCHECKGOTO(scclCalloc(&state->peerCommAddresses, nranks), ret, fail);
+//     // memcpy(state->peerCommAddresses + rank, &listenAddr, sizeof(scclSocketAddress_t));
+//     // SCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(scclSocketAddress_t)), ret, fail);
+
+//     // if(parent->splitShare) {
+//     //     /* map local rank to top parent local rank. */
+//     //     for(int i = 0; i < nranks; ++i) {
+//     //         comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
+//     //     }
+//     //     comm->proxyState = parent->sharedRes->proxyState;
+//     //     scclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
+//     // } else {
+//     //     // Create the service proxy
+//     //     SCCLCHECKGOTO(scclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
+//     //     SCCLCHECKGOTO(scclCalloc(&proxySocket, 1), ret, fail);
+//     //     SCCLCHECKGOTO(
+//     //         net::host::scclSocketInit(proxySocket, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeProxy, comm->abortFlag, 0),
+//     //         ret,
+//     //         fail);
+//     //     SCCLCHECKGOTO(net::host::scclSocketListen(proxySocket), ret, fail);
+//     //     SCCLCHECKGOTO(net::host::scclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
+//     //     memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(scclSocketAddress_t));
+//     //     SCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(scclSocketAddress_t)), ret, fail);
+//     //     // SCCLCHECKGOTO(scclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
+//     // }
+
+//     // INFO(sccl_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
+
+// exit:
+//     return ret;
+// fail:
+//     goto exit;
+// }
+
+} // namespace bootstrap
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/topology/bootstrap/bootstrap_net.h
+++ b/src/hardware/topology/bootstrap/bootstrap_net.h
+#pragma once
+
+#include <string.h>
+#include "base.h"
+#include "socket.h"
+#include "bootstrap_utils.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace bootstrap {
+
+// Atomically add one to the reference counter pointed to by `refs`.
+// Relaxed ordering: only the counter update itself is atomic; no ordering
+// with respect to other memory operations is requested.
+template <typename Int>
+inline void scclAtomicRefCountIncrement(Int* refs) {
+    // The updated value is not needed by callers, so it is discarded.
+    (void)__atomic_add_fetch(refs, 1, __ATOMIC_RELAXED);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+namespace bootstrap_net {
+// Send a length-prefixed message (int size header, then payload) over a socket
+scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size);
+// Receive a length-prefixed message from a socket into a buffer of `size` bytes
+scclResult_t bootstrapNetRecv(scclSocket_t* sock, void* data, int size);
+// One-time selection of the network interface used for bootstrap traffic
+scclResult_t bootstrapNetInit();
+} // namespace bootstrap_net
+
+// Append a not-yet-awaited connection (peer, tag, socket) to the unexpected queue
+scclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock);
+// Remove the first queued connection matching (peer, tag); *found tells whether one existed
+scclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock, int* found);
+// unexpectedFree() is a file-local (static) helper of bootstrap_net.cpp and is
+// therefore not declared here: a static declaration in a header would give every
+// including translation unit its own undefined function.
+
+// Gather one `size`-byte slice from every rank into allData (ring-based AllGather)
+scclResult_t bootstrapAllGather(void* commState, void* allData, int size);
+// Send a tagged message to the given peer
+scclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
+// Receive a tagged message from the given peer
+scclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
+
+////////////////////
+// Bootstrap initialization entry point.
+// NOTE(review): the bootstrap_net.cpp shown with this header only defines a
+// zero-argument bootstrapInit(); no definition of this two-argument overload
+// is visible — confirm it is defined elsewhere, otherwise callers fail to link.
+scclResult_t bootstrapInit(struct scclUniqueId* unique_id, struct scclBootstrapComm* comm);
+
+// scclResult_t bootstrapInit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm);
+// scclResult_t
+// bootstrapSplit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm, struct scclBootstrapComm* parent, int color, int key, int* parentRanks);
+
+} // namespace bootstrap
+} // namespace topology
+} // namespace hardware
+} // namespace sccl
--- a/src/hardware/topology/bootstrap/bootstrap_utils.cpp
+++ b/src/hardware/topology/bootstrap/bootstrap_utils.cpp
+
+
+#pragma once
+
+#include <string.h>
+#include "base.h"
+
+namespace sccl {
+namespace hardware {
+namespace topology {
+namespace bootstrap {
+//
+} // namespace bootstrap
+} // namespace topology
+} // namespace hardware
+} // namespace sccl