#include #include #include #include #include #include #include "topo.h" #include "utils.h" #include "cpuset.h" #include "nvmlwrap.h" // #include "net.h" // #include "graph.h" // #include "comm.h" // #include "net.h" // #include "coll_net.h" // #include "cpuset.h" namespace sccl { namespace hardware { namespace topology { namespace topo { const char* topoNodeTypeStr[] = {"GPU", "PCI", "NVS", "CPU", "NIC", "NET"}; const char* topoLinkTypeStr[] = {"LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET"}; const char* topoPathTypeStr[] = {"LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS"}; namespace topo_basic { struct kvDict kvDictPciClass[] = {{"0x060400", PCI}, {"0x068000", NVS}, {"0x068001", CPU}, {"0x03", GPU}, {"0x02", NIC}, {"0x120000", GPU}, {"0x0b4000", GPU}, {NULL, PCI /* Default fallback value */}}; struct kvDict kvDictPciGen[] = {{"2.5 GT/s", 15}, {"5 GT/s", 30}, {"8 GT/s", 60}, {"16 GT/s", 120}, {"32 GT/s", 240}, /* Kernel 5.6 and earlier */ {"2.5 GT/s PCIe", 15}, {"5.0 GT/s PCIe", 30}, {"8.0 GT/s PCIe", 60}, {"16.0 GT/s PCIe", 120}, {"32.0 GT/s PCIe", 240}, {"64.0 GT/s PCIe", 480}, {NULL, 60 /* Default fallback */}}; // x100 Mbps per lane // 定义一个参数 TopoDumpFileRank,用于指定拓扑结构转储文件的等级,默认值为0 SCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0); // 定义一个参数 IgnoreCpuAffinity,用于指定是否忽略CPU亲和性,默认值为0(不忽略) SCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); ////////////////////////////////////////////////////////////////////////////////////////////// scclResult_t scclTopoAddNet(struct scclXmlNode* xmlNet, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) { int dev; SCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); struct scclTopoNode* net; SCCLCHECK(scclTopoCreateNode(system, &net, NET, dev)); const char* str; SCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); if(str) sscanf(str, "0x%lx", &net->net.asic); else net->net.asic = dev; int mbps; SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if(mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.bw = mbps / 8000.0; if(xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != scclSuccess) net->net.latency = 0; SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); // SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); net->net.busId = busId; SCCLCHECK(scclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); SCCLCHECK(scclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); return scclSuccess; } scclResult_t scclTopoAddNic(struct scclXmlNode* xmlNic, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) { for(int s = 0; s < xmlNic->nSubs; s++) { struct scclXmlNode* xmlNet = xmlNic->subs[s]; if(strcmp(xmlNet->name, "net") != 0) continue; int index; SCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index)); if(index == -1) continue; SCCLCHECK(scclTopoAddNet(xmlNet, system, nic, busId)); } return scclSuccess; } /** * @brief 添加GPU拓扑节点到系统 * * 从XML节点中解析GPU属性并填充到拓扑节点结构中,包括: * - CUDA计算能力(sm) * - GCN架构名称(gcn) * - HIP设备架构(arch) * - 设备排名(rank) * - 设备号(dev) * - GDR支持标志(gdr) * * @param xmlGpu 包含GPU配置的XML节点 * @param system 目标拓扑系统 * @param gpu 待填充的GPU拓扑节点 * @return scclResult_t 操作结果,成功返回scclSuccess * * @note 此函数仅处理GPU基础属性,NVLink连接将在后续处理 */ scclResult_t scclTopoAddGpu(struct scclXmlNode* xmlGpu, struct scclTopoSystem* system, struct scclTopoNode* gpu) { SCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap)); const char* gcnArch; const char* gcnArchName; SCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gcnArch)); convertGcnArchToGcnArchName(gcnArch, &gcnArchName); gpu->gpu.gcn = strdup(gcnArchName); scclHipDeviceArch_t arch; SCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value)); memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t)); SCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank)); SCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev)); SCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport)); // Do not go any further, nvlinks will be added in a second pass return scclSuccess; } /** * @brief 添加PCI设备到拓扑系统 * * 解析XML节点中的PCI设备信息,并根据设备类型(GPU/NIC/普通PCI)创建对应的拓扑节点。 * 对于GPU设备,会进一步解析rank信息;对于NIC设备,会合并多端口设备;对于普通PCI设备, * 会解析vendor/device等属性并递归处理子设备。 * * @param xmlPci 包含PCI设备信息的XML节点 * @param system 目标拓扑系统 * @param parent 父拓扑节点 * @return scclResult_t 操作结果,成功返回scclSuccess */ scclResult_t scclTopoAddPci(struct scclXmlNode* xmlPci, struct scclTopoSystem* system, struct scclTopoNode* parent) { const char* str; int type; SCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str)); SCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass)); int64_t busId; SCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str)); SCCLCHECK(busIdToInt64(str, &busId)); struct scclTopoNode* node = NULL; struct scclXmlNode* xmlGpu = NULL; SCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu)); if(xmlGpu != NULL) { type = GPU; int index; SCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index)); if(index == -1) return scclSuccess; SCCLCHECK(scclTopoCreateNode(system, &node, type, busId)); SCCLCHECK(scclTopoAddGpu(xmlGpu, system, node)); } struct scclXmlNode* xmlNic = NULL; SCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic)); if(xmlNic != NULL) { type = NIC; // Ignore sub device ID and merge multi-port NICs into one PCI device. busId &= 0xfffffffffffffff0; struct scclTopoNode* nicNode = NULL; SCCLCHECK(scclTopoGetNode(system, &nicNode, type, busId)); if(nicNode == NULL) { SCCLCHECK(scclTopoCreateNode(system, &nicNode, type, busId)); node = nicNode; // Connect it to parent later on } SCCLCHECK(scclTopoAddNic(xmlNic, system, nicNode, busId)); } else if(type == PCI) { SCCLCHECK(scclTopoCreateNode(system, &node, type, busId)); SCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str)); if(str) node->pci.device += strtol(str, NULL, 0) << 48; SCCLCHECK(xmlGetAttr(xmlPci, "device", &str)); if(str) node->pci.device += strtol(str, NULL, 0) << 32; SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str)); if(str) node->pci.device += strtol(str, NULL, 0) << 16; SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str)); if(str) node->pci.device += strtol(str, NULL, 0); for(int s = 0; s < xmlPci->nSubs; s++) { struct scclXmlNode* xmlSubPci = xmlPci->subs[s]; SCCLCHECK(scclTopoAddPci(xmlSubPci, system, node)); } } if(node) { int width, speed; SCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width)); SCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str)); // Manage cases where speed was not indicated in /sys if(width == 0) width = 16; SCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end) SCCLCHECK(scclTopoConnectNodes(node, parent, LINK_PCI, width * speed / 80.0)); SCCLCHECK(scclTopoConnectNodes(parent, node, LINK_PCI, width * speed / 80.0)); } return scclSuccess; } struct kvDict kvDictCpuArch[] = {{"x86_64", SCCL_TOPO_CPU_ARCH_X86}, {"arm64", SCCL_TOPO_CPU_ARCH_ARM}, {"ppc64", SCCL_TOPO_CPU_ARCH_POWER}, {NULL, 0}}; struct kvDict kvDictCpuVendor[] = {{"GenuineIntel", SCCL_TOPO_CPU_VENDOR_INTEL}, {"AuthenticAMD", SCCL_TOPO_CPU_VENDOR_AMD}, {"CentaurHauls", SCCL_TOPO_CPU_VENDOR_ZHAOXIN}, {" Shanghai ", SCCL_TOPO_CPU_VENDOR_ZHAOXIN}, {NULL, 0}}; /** * @brief 添加CPU拓扑信息到系统拓扑结构中 * * 从XML节点中解析CPU信息,包括NUMA ID、CPU架构、厂商、型号等, * 并创建对应的拓扑节点。同时处理CPU关联的PCI设备和NIC设备。 * * @param xmlCpu 包含CPU配置信息的XML节点 * @param system 目标拓扑系统 * @return scclResult_t 操作结果,成功返回scclSuccess */ scclResult_t scclTopoAddCpu(struct scclXmlNode* xmlCpu, struct scclTopoSystem* system) { int numaId; // 从XML节点获取NUMA ID SCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId)); struct scclTopoNode* cpu; // 创建一个新的CPU节点 SCCLCHECK(scclTopoCreateNode(system, &cpu, CPU, numaId)); const char* str; // 获取CPU的亲和性属性 SCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str)); if(str != NULL) { SCCLCHECK(scclStrToCpuset(str, &cpu->cpu.affinity)); } // 获取CPU架构信息 SCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str)); SCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch)); if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86) { // 获取CPU供应商信息 SCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str)); SCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor)); if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) { int familyId, modelId; // 获取Intel CPU的家族ID和型号ID SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); // 根据家族ID和型号ID确定CPU型号 cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? SCCL_TOPO_CPU_TYPE_SKL : SCCL_TOPO_CPU_INTEL_BDW; } else if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) { int familyId, modelId; // 获取兆芯CPU的家族ID和型号ID SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); if(familyId == 7 && modelId == 0x5B) cpu->cpu.model = SCCL_TOPO_CPU_TYPE_YONGFENG; } if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_AMD) { int familyId, modelId; // 获取AMD CPU的家族ID和型号ID SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); // 将“Milan”也视为“Rome” cpu->cpu.model = ((familyId == 143 && modelId >= 49) || familyId == 175) ? SCCL_TOPO_CPU_TYPE_ROME : SCCL_TOPO_CPU_TYPE_ZEN; } } // 遍历CPU节点的子节点 for(int s = 0; s < xmlCpu->nSubs; s++) { struct scclXmlNode* node = xmlCpu->subs[s]; // 如果子节点是PCI设备,添加PCI节点 if(strcmp(node->name, "pci") == 0) SCCLCHECK(scclTopoAddPci(node, system, cpu)); // 如果子节点是NIC设备,添加NIC节点 if(strcmp(node->name, "nic") == 0) { struct scclTopoNode* nic = NULL; SCCLCHECK(scclTopoGetNode(system, &nic, NIC, 0)); if(nic == NULL) { SCCLCHECK(scclTopoCreateNode(system, &nic, NIC, 0)); SCCLCHECK(scclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW)); SCCLCHECK(scclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW)); } SCCLCHECK(scclTopoAddNic(node, system, nic, 0)); } } return scclSuccess; } // scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { // char* str = path + offset; // // Remove trailing "/" // if(*str == '/') // str--; // // Find next / // while(*str != '/') // str--; // str++; // int64_t numid; // SCCLCHECK(busIdToInt64(str, &numid)); // // Ignore subdevice because those should use the same PCI link so we want to merge nodes. // numid -= numid & 0xf; // *id = numid; // return scclSuccess; // } static scclResult_t findLocalCpu(struct scclTopoNode* node, struct scclTopoNode** cpu) { *cpu = NULL; if(node->type == CPU) { *cpu = node; return scclSuccess; } for(int l = 0; l < node->nlinks; l++) { if(node->links[l].type == LINK_PCI) SCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); if(*cpu != NULL) return scclSuccess; } return scclSuccess; } static scclResult_t scclTopoGetInterCpuBw(struct scclTopoNode* cpu, float* bw) { *bw = LOC_BW; if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_POWER) { *bw = P9_BW; return scclSuccess; } if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_ARM) { *bw = ARM_BW; return scclSuccess; } if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) { *bw = cpu->cpu.model == SCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; } if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) { *bw = cpu->cpu.model == SCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; } return scclSuccess; } // BCM Gen4 Switches present themselves as a two-level hierarchical switch // even though they're supposed to sustain full BW across all ports. // Flatten the switch as this extra level can break the search and make // SCCL take wrong topology decisions. scclResult_t scclTopoFlattenBcmSwitches(struct scclTopoSystem* system) { for(int s = 0; s < system->nodes[PCI].count; s++) { struct scclTopoNode* pciSwitch = system->nodes[PCI].nodes + s; uint64_t device = pciSwitch->pci.device; // Only flatten PEX Gen 4 switches in base mode if((device & 0xfffffffffffff000) == 0x1000c0101000a000) { // Find sub switches with the same device ID. int64_t* subSwIds; SCCLCHECK(scclCalloc(&subSwIds, pciSwitch->nlinks)); int subs = 0; for(int l = 0; l < pciSwitch->nlinks; l++) { struct scclTopoNode* sub = pciSwitch->links[l].remNode; // Only fuse sub switches with the same device ID. if(sub->type != PCI || sub->pci.device != device) continue; // Save sub switch for later subSwIds[subs++] = sub->id; // Remove link to that sub switch memmove(pciSwitch->links + l, pciSwitch->links + l + 1, (pciSwitch->nlinks - l - 1) * (sizeof(struct scclTopoLink))); pciSwitch->nlinks--; // Don't increase l for the next iteration as we just shifted all links by one. l--; } for(int s = 0; s < subs; s++) { // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node) int index; SCCLCHECK(scclTopoIdToIndex(system, PCI, subSwIds[s], &index)); struct scclTopoNode* sub = system->nodes[PCI].nodes + index; // Connect all sub PCI devices to the parent switch for(int l = 0; l < sub->nlinks; l++) { struct scclTopoNode* remNode = sub->links[l].remNode; if(remNode == pciSwitch) continue; // Add link from parent PCI switch -> PCI device memcpy(pciSwitch->links + pciSwitch->nlinks, sub->links + l, sizeof(struct scclTopoLink)); pciSwitch->nlinks++; // Update link from PCI device -> parent PCI switch for(int rl = 0; rl < remNode->nlinks; rl++) { if(remNode->links[rl].remNode == sub) { remNode->links[rl].remNode = pciSwitch; break; } } } SCCLCHECK(scclTopoRemoveNode(system, PCI, index)); } // Set subdevice to 0x0000 to make sure we don't merge this switch again. pciSwitch->pci.device = 0x1000c01010000000; free(subSwIds); // Restart, as system->nodes[PCI].nodes has changed. s = 0; } } return scclSuccess; } scclResult_t scclTopoConnectCpus(struct scclTopoSystem* system) { // And connect all CPU nodes together for(int n = 0; n < system->nodes[CPU].count; n++) { for(int p = 0; p < system->nodes[CPU].count; p++) { if(n == p) continue; float bw; SCCLCHECK(scclTopoGetInterCpuBw(system->nodes[CPU].nodes + n, &bw)); SCCLCHECK(scclTopoConnectNodes(system->nodes[CPU].nodes + n, system->nodes[CPU].nodes + p, LINK_SYS, bw)); } } return scclSuccess; } static scclResult_t scclTopoSort(struct scclTopoNode* node, struct scclTopoNode* upNode) { // 如果存在上级节点,则调整当前节点的链接顺序,使上级节点的链接位于最后 if(upNode) { int l = 0; // 找到指向upNode的链接 while(node->links[l].remNode != upNode) l++; struct scclTopoLink upLink; // 复制找到的链接到upLink memcpy(&upLink, node->links + l, sizeof(struct scclTopoLink)); // 将所有链接左移,直到upLink被移动到链接列表的末尾 while(node->links[l + 1].remNode) { memcpy(node->links + l, node->links + l + 1, sizeof(struct scclTopoLink)); l++; } // 将upLink放到链接列表的末尾 memcpy(node->links + l, &upLink, sizeof(struct scclTopoLink)); } // 递归地对PCI树进行排序 for(int l = 0; l < node->nlinks; l++) { struct scclTopoLink* link = node->links + l; // 如果链接类型是PCI且远端节点不是上级节点,则递归排序 if(link->type == LINK_PCI && link->remNode != upNode) SCCLCHECK(scclTopoSort(link->remNode, node)); } return scclSuccess; } // We want the graph to be organized to ease/accelerate traversal : // 1. NVLinks (already the case) // 2. PCI down // 3. PCI up // 4. SYS (already the case) scclResult_t scclTopoSortSystem(struct scclTopoSystem* system) { for(int n = 0; n < system->nodes[CPU].count; n++) SCCLCHECK(scclTopoSort(system->nodes[CPU].nodes + n, NULL)); return scclSuccess; } float scclTopoXGMISpeed(const char* gcn) { if(IsArchMatch(gcn, "gfx90a")) return MI200_XGMI_WIDTH; else if(IsArchMatch(gcn, "gfx94")) return GFX94X_XGMI_WIDTH; else return VEGA_XGMI_WIDTH; } /** * @brief 添加XGMI拓扑连接 * * 处理XML节点中的XGMI连接信息,建立GPU与其他设备(GPU/CPU/NVS)之间的NVL连接。 * * @param node XML节点指针,包含XGMI连接配置信息 * @param system 拓扑系统指针,用于存储和管理拓扑节点 * @param parentBusId 父设备的PCIe总线ID字符串 * * @return scclResult_t 返回操作结果状态码: * - scclSuccess: 操作成功 * - scclInternalError: 找不到指定GPU设备时返回错误 * * @note 1. 支持GPU-GPU、GPU-CPU、GPU-NVS三种连接类型 * 2. 连接带宽由GPU的GCN架构和连接数量共同决定 * 3. 递归处理子节点时保持总线ID传递 */ scclResult_t scclTopoAddXGMI(struct scclXmlNode* node, struct scclTopoSystem* system, const char* parentBusId) { if(strcmp(node->name, "xgmi") == 0) { struct scclTopoNode* gpu = NULL; int64_t pBusId; SCCLCHECK(busIdToInt64(parentBusId, &pBusId)); SCCLCHECK(scclTopoGetNode(system, &gpu, GPU, pBusId)); if(gpu == NULL) { WARN("Add XGMI error : could not find GPU %lx\n", pBusId); return scclInternalError; } int count; SCCLCHECK(xmlGetAttrInt(node, "count", &count)); const char* targetClass; SCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass)); int targetType; SCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass)); struct scclTopoNode* remote = NULL; if(targetType == GPU) { // NVL P2P connection to another GPU const char* target; SCCLCHECK(xmlGetAttrStr(node, "target", &target)); int64_t busId; SCCLCHECK(busIdToInt64(target, &busId)); SCCLCHECK(scclTopoGetNode(system, &remote, GPU, busId)); } else if(targetType == CPU) { // NVL connection to the local CPU SCCLCHECK(findLocalCpu(gpu, &remote)); } else { if(system->nodes[NVS].count == 0) { SCCLCHECK(scclTopoCreateNode(system, &remote, NVS, 0)); } else { remote = system->nodes[NVS].nodes; } } if(remote) { float nvlSpeed = scclTopoXGMISpeed(gpu->gpu.gcn); SCCLCHECK(scclTopoConnectNodes(gpu, remote, LINK_NVL, count * nvlSpeed)); if(remote->type != GPU) { SCCLCHECK(scclTopoConnectNodes(remote, gpu, LINK_NVL, count * nvlSpeed)); } } } else { const char* busId; SCCLCHECK(xmlGetAttr(node, "busid", &busId)); for(int s = 0; s < node->nSubs; s++) { SCCLCHECK(scclTopoAddXGMI(node->subs[s], system, busId ? busId : parentBusId)); } } return scclSuccess; } /** * @brief 获取指定GPU组的本地网络掩码 * * 遍历系统中所有网络节点,找到与指定GPU组(g)连接带宽最大且路径类型最优的网络节点, * 将这些网络节点的ID转换为位掩码形式输出。 * * @param system 拓扑系统指针 * @param g GPU组索引 * @param localNetMask [out] 输出的本地网络掩码(64位无符号整数) * @param type [out] 可选参数,输出最优路径类型 * @return scclResult_t 成功返回scclSuccess,失败返回错误码 */ static scclResult_t getLocalNetMask(struct scclTopoSystem* system, int g, uint64_t* localNetMask, int* type) { int minType = PATH_DIS; float maxBw = 0; int count = 0; int* nets; SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count)); for(int n = 0; n < system->nodes[NET].count; n++) { struct scclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU] + g; if(path->bw > maxBw || (path->bw == maxBw && path->type < minType)) { maxBw = path->bw; minType = path->type; if(type) *type = minType; count = 0; } if(path->bw == maxBw && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } *localNetMask = 0ULL; for(int n = 0; n < count; n++) { if(nets[n] >= 64) return scclInternalError; *localNetMask |= 1ULL << nets[n]; } free(nets); return scclSuccess; } static scclResult_t scclTopoPrintRec(struct scclTopoNode* node, struct scclTopoNode* prevNode, char* line, int offset) { if(node->type == GPU) { sprintf(line + offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank); } else if(node->type == CPU) { sprintf(line + offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model); } else if(node->type == PCI) { sprintf(line + offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device); } else { sprintf(line + offset, "%s/%lX", topoNodeTypeStr[node->type], node->id); } INFO(SCCL_LOG_TOPO, "%s", line); for(int i = 0; i < offset; i++) line[i] = ' '; for(int l = 0; l < node->nlinks; l++) { struct scclTopoLink* link = node->links + l; if(link->type == LINK_LOC) continue; if(link->type != LINK_PCI || link->remNode != prevNode) { sprintf(line + offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw); int nextOffset = strlen(line); if(link->type == LINK_PCI) { SCCLCHECK(scclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if(link->remNode->type == NET) { sprintf(line + nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { sprintf(line + nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); } INFO(SCCL_LOG_TOPO, "%s", line); } } } return scclSuccess; } } // namespace topo_basic //////////////////////////////////////////////////////////////////////////////////////////////// bool isHswDriverExist() { const ::std::string basePath = "/sys/bus/pci/drivers"; DIR* dir = opendir(basePath.c_str()); if(!dir) { return false; } struct dirent* entry; bool found = false; while((entry = readdir(dir)) != nullptr) { ::std::string name = entry->d_name; if(name != "." && name != ".." && name.compare(0, 3, "hsw") == 0) { found = true; break; } } closedir(dir); return found; } int getIBNum() { int count = 0; const ::std::string basePath = "/sys/class/infiniband"; DIR* dir = opendir(basePath.c_str()); if(!dir) { return count; } struct dirent* entry; while((entry = readdir(dir)) != nullptr) { if(strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) continue; if(strncmp(entry->d_name, "mlx5", 4) == 0) ++count; } closedir(dir); return count; } scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id) { uint64_t* localNetMasks; int ngpus = system->nodes[GPU].count; SCCLCHECK(scclCalloc(&localNetMasks, ngpus)); // Fill localNetMasks for all GPUs. for(int g = 0; g < ngpus; g++) { SCCLCHECK(topo_basic::getLocalNetMask(system, g, localNetMasks + g, NULL)); } // Find GPUs which have the same mask as rank, i.e. share the same local Nets. int gpu; SCCLCHECK(scclTopoRankToIndex(system, rank, &gpu)); int netLocalGpus = 0, netLocalGpu = 0; for(int g = 0; g < ngpus; g++) { if(localNetMasks[g] == localNetMasks[gpu]) { if(g == gpu) netLocalGpu = netLocalGpus; netLocalGpus++; } } uint64_t localNetMask = localNetMasks[gpu]; free(localNetMasks); if(localNetMask == 0) return scclInternalError; // Round robin on GPUs and channels int gIndex = 0, cId = 0, n = 0; while(1) { if(1ULL << n & localNetMask) { if(gIndex == netLocalGpu && cId == channelId) { *id = n; return scclSuccess; } gIndex++; if(gIndex == netLocalGpus) { gIndex = 0; cId++; } } n = (n + 1) % 64; } } scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex) { int ngpus = system->nodes[GPU].count; int* gpus; SCCLCHECK(scclCalloc(&gpus, ngpus)); // Find localNetMask which includes net with the most local GPUs. int netLocalGpus = 0, minType = PATH_DIS; uint64_t localNetMask = 0ULL; for(int g = 0; g < ngpus; g++) { int type = PATH_DIS; uint64_t mask; SCCLCHECK(topo_basic::getLocalNetMask(system, g, &mask, &type)); if((1ULL << net) & mask) { if(type < minType) { localNetMask = mask; netLocalGpus = 0; minType = type; } if(type == minType) { if(localNetMask && mask != localNetMask) { WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n", g, gpus[netLocalGpus - 1], minType, net, mask, localNetMask); free(gpus); return scclInternalError; } gpus[netLocalGpus] = g; netLocalGpus++; } } } if(localNetMask == 0ULL) { *gpuIndex = -1; free(gpus); return scclSuccess; } // Round robin on GPUs and channels int gIndex = 0, cId = 0, n = 0; while(1) { if(1ULL << n & localNetMask) { if(n == net) { *gpuIndex = gpus[gIndex]; free(gpus); return scclSuccess; } gIndex++; if(gIndex == netLocalGpus) { gIndex = 0; cId++; } } n = (n + 1) % 64; } } scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model) { *arch = system->nodes[CPU].nodes[0].cpu.arch; *vendor = system->nodes[CPU].nodes[0].cpu.vendor; *model = system->nodes[CPU].nodes[0].cpu.model; return scclSuccess; } scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity) { struct scclTopoNode *cpu = NULL, *gpu = NULL; for(int g = 0; g < system->nodes[GPU].count; g++) { if(system->nodes[GPU].nodes[g].gpu.rank == rank) { gpu = system->nodes[GPU].nodes + g; // Find closer CPU int cpuIndex = -1, minHops = 0; for(int c = 0; c < system->nodes[CPU].count; c++) { int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count; if(cpuIndex == -1 || nHops < minHops) { cpuIndex = c; minHops = nHops; } } cpu = system->nodes[CPU].nodes + cpuIndex; } } if(cpu == NULL) { WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank); return scclInternalError; } // Query the CPU affinity set we were provided cpu_set_t mask; SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); // Get the affinity of the CPU close to our GPU. cpu_set_t cpuMask = cpu->cpu.affinity; cpu_set_t finalMask; if(topo_basic::scclParamIgnoreCpuAffinity()) // Ignore the CPU affinity set and use the GPU one instead finalMask = cpuMask; else // Use a subset of the GPU affinity set CPU_AND(&finalMask, &mask, &cpuMask); memcpy(affinity, &finalMask, sizeof(cpu_set_t)); // If there is a non empty set, use it to set affinity if(CPU_COUNT(&finalMask)) { char affinityStr[sizeof(cpu_set_t) * 2]; SCCLCHECK(scclCpusetToStr(&finalMask, affinityStr)); INFO(SCCL_LOG_TOPO, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr); } return scclSuccess; } scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count) { *count = system->nodes[GPU].count; return scclSuccess; } scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count) { *count = system->nodes[NET].count; return scclSuccess; } scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count) { *count = system->nodes[NVS].count; return scclSuccess; } scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank) { for(int g = 0; g < system->nodes[GPU].count; g++) { if(system->nodes[GPU].nodes[g].gpu.rank == rank) { *localRank = g; return scclSuccess; } } WARN("Could not find local GPU with rank %d", rank); return scclInternalError; } scclResult_t scclTopoPrint(struct scclTopoSystem* s) { INFO(SCCL_LOG_TOPO, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw); char line[1024]; for(int n = 0; n < s->nodes[CPU].count; n++) SCCLCHECK(topo_basic::scclTopoPrintRec(s->nodes[CPU].nodes + n, NULL, line, 0)); INFO(SCCL_LOG_TOPO, "=========================================="); return scclSuccess; } scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) { for(int i = 0; i < system->nodes[type].count; i++) { if(system->nodes[type].nodes[i].id == id) { *node = system->nodes[type].nodes + i; return scclSuccess; } } return scclSuccess; } scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) { if(system->nodes[type].count == SCCL_TOPO_MAX_NODES) { WARN("Error : tried to create too many nodes of type %d", type); return scclInternalError; } struct scclTopoNode* n = system->nodes[type].nodes + system->nodes[type].count; system->nodes[type].count++; n->type = type; n->id = id; if(type == GPU) { // Create link to itself (used in some corner cases) n->nlinks = 1; n->links[0].type = LINK_LOC; n->links[0].remNode = n; n->links[0].bw = LOC_BW; n->gpu.dev = SCCL_TOPO_UNDEF; n->gpu.rank = SCCL_TOPO_UNDEF; n->gpu.cudaCompCap = SCCL_TOPO_UNDEF; } else if(type == CPU) { n->cpu.arch = SCCL_TOPO_UNDEF; n->cpu.vendor = SCCL_TOPO_UNDEF; n->cpu.model = SCCL_TOPO_UNDEF; } else if(type == NET) { n->net.asic = 0ULL; n->net.port = SCCL_TOPO_UNDEF; n->net.bw = 0.0; n->net.latency = 0.0; } *node = n; return scclSuccess; } /** * 从拓扑系统中移除指定类型的节点 * * @param system 拓扑系统指针 * @param type 要移除的节点类型 * @param index 要移除的节点索引 * @return scclResult_t 返回操作结果(scclSuccess表示成功) * * 该函数会: * 1. 释放被移除节点的所有路径内存 * 2. 更新其他节点到被移除节点的链接关系 * 3. 调整节点数组中剩余节点的位置 * 4. 减少该类型节点的计数 */ scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int index) { struct scclTopoNode* delNode = system->nodes[type].nodes + index; for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) { free(delNode->paths[t]); for(int n = 0; n < system->nodes[t].count; n++) { struct scclTopoNode* node = system->nodes[t].nodes + n; if(node == delNode) continue; for(int l = 0; l < node->nlinks; l++) { while(l < node->nlinks && node->links[l].remNode == delNode) { memmove(node->links + l, node->links + l + 1, (node->nlinks - l - 1) * sizeof(struct scclTopoLink)); node->nlinks--; } if(l < node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) { node->links[l].remNode--; } } } } memmove(delNode, delNode + 1, (system->nodes[type].count - index - 1) * sizeof(struct scclTopoNode)); system->nodes[type].count--; return scclSuccess; } scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw) { // Aggregate links into higher bw for NVLink struct scclTopoLink* link; for(link = node->links; link->remNode; link++) { if(link->remNode == remNode && link->type == type) break; } if(link->remNode == NULL) node->nlinks++; link->type = type; link->remNode = remNode; link->bw += bw; // Sort links in BW descending order struct scclTopoLink linkSave; memcpy(&linkSave, link, sizeof(struct scclTopoLink)); while(link != node->links) { if((link - 1)->bw >= linkSave.bw) break; memcpy(link, link - 1, sizeof(struct scclTopoLink)); link--; } memcpy(link, &linkSave, sizeof(struct scclTopoLink)); return scclSuccess; } scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem) { SCCLCHECK(scclCalloc(topoSystem, 1)); struct scclXmlNode* topNode; SCCLCHECK(xmlFindTag(xml, "system", &topNode)); printf("topNode->nSubs=%d\n", topNode->nSubs); for(int s = 0; s < topNode->nSubs; s++) { struct scclXmlNode* node = topNode->subs[s]; if(strcmp(node->name, "cpu") == 0) SCCLCHECK(topo_basic::scclTopoAddCpu(node, *topoSystem)); } SCCLCHECK(topo_basic::scclTopoAddXGMI(topNode, *topoSystem, NULL)); SCCLCHECK(topo_basic::scclTopoFlattenBcmSwitches(*topoSystem)); SCCLCHECK(topo_basic::scclTopoConnectCpus(*topoSystem)); SCCLCHECK(topo_basic::scclTopoSortSystem(*topoSystem)); return scclSuccess; } /** * 获取系统中所有GPU节点的计算能力范围 * * @param system 拓扑系统指针 * @param ccMin 输出参数,返回最小计算能力版本 * @param ccMax 输出参数,返回最大计算能力版本 * @return scclResult_t 成功返回scclSuccess,无GPU节点返回scclInternalError */ scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax) { if(system->nodes[GPU].count == 0) return scclInternalError; int min, max; min = max = system->nodes[GPU].nodes[0].gpu.cudaCompCap; for(int g = 1; g < system->nodes[GPU].count; g++) { min = ::std::min(min, system->nodes[GPU].nodes[g].gpu.cudaCompCap); max = ::std::max(max, system->nodes[GPU].nodes[g].gpu.cudaCompCap); } if(ccMin) *ccMin = min; if(ccMax) *ccMax = max; return scclSuccess; } scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index) { *index = -1; for(int i = 0; i < system->nodes[type].count; i++) { if(system->nodes[type].nodes[i].id == id) { *index = i; return scclSuccess; } } return scclInternalError; } scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index) { *index = -1; for(int i = 0; i < system->nodes[GPU].count; i++) { if(system->nodes[GPU].nodes[i].gpu.rank == rank) { *index = i; return scclSuccess; } } return scclInternalError; } scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank) { *rank = -1; for(int i = 0; i < system->nodes[GPU].count; i++) { if(system->nodes[GPU].nodes[i].gpu.dev == dev) { *rank = system->nodes[GPU].nodes[i].gpu.rank; return scclSuccess; } } return scclInternalError; } /** * @brief 获取系统拓扑结构 * * 该函数用于获取系统的拓扑结构信息,包括GPU和NIC设备。 * 首先尝试从环境变量SCCL_TOPO_FILE指定的XML文件加载拓扑, * 若未指定则尝试加载默认拓扑文件(根据IB设备数量选择不同文件)。 * 自动检测本地GPU和NIC设备信息并填充到拓扑结构中。 * * @param comm 通信上下文指针 * @param system 输出参数,返回创建的拓扑系统指针 * @return scclResult_t 返回操作结果,scclSuccess表示成功 */ // scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system) { // struct scclXml* xml; // SCCLCHECK(scclCalloc(&xml, 1)); // char* xmlTopoFile = getenv("SCCL_TOPO_FILE"); // if(xmlTopoFile) { // INFO(SCCL_LOG_TOPO, "SCCL_TOPO_FILE set by environment to %s", xmlTopoFile); // SCCLCHECK(scclTopoGetXmlFromFile(xmlTopoFile, xml, 1)); // } else { // bool useDefaultTopo = true; // bool HswExist = topo_basic::isHswDriverExist(); // if(HswExist == true) { // char* rocmPath = getenv("ROCM_PATH"); // if(rocmPath != NULL) { // ::std::string xmlPath; // int IBNum = topo_basic::getIBNum(); // if(IBNum == 8 || IBNum == 9 || IBNum == 10) { // xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-BW-topo-input.xml"; // if(access(xmlPath.c_str(), F_OK) == 0) { // SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1)); // useDefaultTopo = false; // } // } else if(IBNum == 4 || IBNum == 5 || IBNum == 6) { // xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-508-topo-input.xml"; // if(access(xmlPath.c_str(), F_OK) == 0) { // SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1)); // useDefaultTopo = false; // } // } // } // } // if(useDefaultTopo) { // INFO(SCCL_LOG_TOPO, "No default topo for now, please provide your own topo xml file"); // } // } // if(xml->maxIndex == 0) { // // Create top tag // struct scclXmlNode* top; // SCCLCHECK(xmlAddNode(xml, NULL, "system", &top)); // SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION)); // } // // Auto-detect GPUs if needed // for(int r = 0; r < comm->nRanks; r++) { // if(comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { // char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; // SCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); // struct scclXmlNode* node; // SCCLCHECK(scclTopoFillGpu(xml, busId, &node)); // if(node == NULL) // continue; // SCCLCHECK(xmlSetAttrInt(node, "keep", 1)); // SCCLCHECK(xmlSetAttrInt(node, "rank", r)); // SCCLCHECK(topo_basic::xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); // } // } // // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // // so we start with collnet so that it has precedence. // int netDevCount = 0; // if(netDevCount == 0) { // SCCLCHECK(comm->scclNet->devices(&netDevCount)); // } // for(int n = 0; n < netDevCount; n++) { // sccl::hardware::net::scclNetProperties_t props; // SCCLCHECK(comm->scclNet->getProperties(n, &props)); // struct scclXmlNode* netNode; // SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode)); // SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); // SCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); // SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "speed", props.speed)); // SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "port", props.port)); // SCCLCHECK(topo_basic::xmlInitAttrFloat(netNode, "latency", props.latency)); // SCCLCHECK(topo_basic::xmlInitAttrUint64(netNode, "guid", props.guid)); // SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "maxconn", props.maxComms)); // bool gdrSupport = // (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF)); // INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); // SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "gdr", gdrSupport)); // } // // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) // SCCLCHECK(scclTopoTrimXml(xml)); // xmlTopoFile = getenv("SCCL_TOPO_DUMP_FILE"); // if(xmlTopoFile && comm->rank == topo_basic::scclParamTopoDumpFileRank()) { // INFO(SCCL_LOG_TOPO, "SCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); // SCCLCHECK(scclTopoDumpXmlToFile(xmlTopoFile, xml)); // } // SCCLCHECK(scclTopoGetSystemFromXml(xml, system)); // free(xml); // return scclSuccess; // } scclResult_t scclTopoGetSystem(struct scclTopoSystem** system) { using namespace sccl; struct scclXml* xml; SCCLCHECK(scclCalloc(&xml, 1)); bool HswExist = isHswDriverExist(); if(HswExist == true) { ::std::string xmlPath; int IBNum = getIBNum(); if(IBNum == 8 || IBNum == 9 || IBNum == 10) { xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml"; SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1)); } } if(xml->maxIndex == 0) { // Create top tag struct scclXmlNode* top; SCCLCHECK(xmlAddNode(xml, NULL, "system", &top)); SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION)); } // Auto-detect GPUs if needed // for(int r = 0; r < comm->nRanks; r++) { // if(comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { // char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; // SCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); // struct scclXmlNode* node; // SCCLCHECK(scclTopoFillGpu(xml, busId, &node)); // if(node == NULL) // continue; // SCCLCHECK(xmlSetAttrInt(node, "keep", 1)); // SCCLCHECK(xmlSetAttrInt(node, "rank", r)); // SCCLCHECK(topo_basic::xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport)); // } // } // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. int netDevCount = 0; auto scclNet = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB); if(netDevCount == 0) { SCCLCHECK(scclNet->devices(&netDevCount)); } for(int n = 0; n < netDevCount; n++) { sccl::hardware::net::scclNetProperties_t props; SCCLCHECK(scclNet->getProperties(n, &props)); struct scclXmlNode* netNode; SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode)); SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); SCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); SCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); SCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); SCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency)); SCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); SCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); bool gdrSupport = (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF); INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); SCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) SCCLCHECK(scclTopoTrimXml(xml)); SCCLCHECK(scclTopoGetSystemFromXml(xml, system)); free(xml); return scclSuccess; } } // namespace topo } // namespace topology } // namespace hardware } // namespace sccl