#include #include "base.h" #include "graph.h" namespace sccl { namespace hardware { namespace topology { namespace graph { // 构造函数的实现 Graph::Graph(const Bootstrap* bootstrap) : sccl_bootstrap(bootstrap), rank(sccl_bootstrap->rank), nRanks(sccl_bootstrap->nRanks), localRank(sccl_bootstrap->localRank), nLocalRanks(sccl_bootstrap->nLocalRanks), interRank(sccl_bootstrap->interRank), nInterRanks(sccl_bootstrap->nInterRanks) { // 与scclNodeInfo_t中的定义一致,预留足够大小的node空间 this->node_info_total_bytes = sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks; node_info_vec.reserve(nRanks * node_info_total_bytes); // 预留空间 node_info_vec.clear(); } // 析构函数的实现 Graph::~Graph() {} scclResult_t Graph::establishGraph(const BootstrapComm_t* bootstrap_comm) { //////// 初始化topo node //////// scclNodeInfo_t local_topo_nodes(nLocalRanks); // 使用ByteSpan替代std::vector,并指定容量为pNodes_len ByteSpanVector nodes_span((void*)local_topo_nodes.nodes, local_topo_nodes.totalByteSize); // 遍历所有的GPU的pciPath,添加topo node for(int r = localRank; r < bootstrap_comm->deviceCnt; r += nLocalRanks) { auto gpu_path = physical_links::getGpuPciPath(r); physical_links::generate_topo_nodes(gpu_path, this->interRank, r, nodes_span); delete(gpu_path); } // 遍历所有的NIC的pciPath,添加topo node bootstrap::scclRankInfo_t local_rank_info = bootstrap_comm->rank_phys_set->rank_info_vec[this->rank]; for(int r = localRank; r < local_rank_info.net.count; r += nLocalRanks) { auto net_path = physical_links::getNetPciPath(bootstrap_comm->scclNet, r); physical_links::generate_topo_nodes(net_path, this->interRank, r, nodes_span); delete(net_path); } #if 0 if(interRank == 0) { char line[30]; sprintf(line, "print rank=%d: ", rank); bootstrap::printRankInfo(std::string(line), &local_rank_info); } #endif #if 0 if(interRank == 0) { ByteSpanArray nodes_span_array(nodes_span.data(), local_topo_nodes.totalByteSize); printf("print rank=%d, nodes_span size=%zu\n", rank, nodes_span.size()); char line[30]; sprintf(line, "print rank=%d: ", rank); for(int i = 0; i < nodes_span.size(); i++) { printf("============================**============================\n"); physical_links::printTopoNode(nodes_span_array, i, line); printf("============================**============================\n"); } } #endif #if 0 // 尝试采用软件识别GPU之间互联 for(int i = 0; i < bootstrap_comm->deviceCnt; i++) { // if(i != bootstrap_comm->hipDev) { RSMI_IO_LINK_TYPE rsmi_type; int hops, count; if(rocm_smi_getLinkInfo(bootstrap_comm->hipDev, i, &rsmi_type, &hops, &count) == scclSuccess) { printf("rank=%d, i=%d, dev=%d, rsmi_type=%d, hops=%d, count=%d\n", rank, i, bootstrap_comm->hipDev, rsmi_type, hops, count); // if(rsmi_type == RSMI_IOLINK_TYPE_XGMI && hops <= 2) { // if(1) { // char busIdStr[] = "00000000:00:00.0"; // SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr))); // char lowerId[16]; // for(int c = 0; c < 16; c++) { // lowerId[c] = tolower(busIdStr[c]); // if(busIdStr[c] == 0) // break; // } // } } else { printf("rsmi get type fail\n"); } // } } #endif // -------------------------- 4.Comm信息的allgather ----------------------------------- // SCCLCHECK(sccl_bootstrap->bootstrapAllGather(local_topo_nodes.nodes, this->node_info_vec.data(), this->node_info_total_bytes)); // TODO: 目前手动将节点内的GPU进行mesh连接,因为无法从/sys/device中获取NIC的拓扑信息,rsmi函数也无法获取NIC的拓扑信息。后续优化 SCCLCHECK(bootstrapNodesLink(this->node_info_vec.data(), this->node_info_total_bytes)); #if 0 if(rank == 1) { size_t dataLen = this->node_info_total_bytes; printf("nRanks * this->node_info_total_bytes=%zu, %lu\n", dataLen, nRanks * dataLen); auto node_info_data = reinterpret_cast(this->node_info_vec.data()); ByteSpanArray nodes_span_all(node_info_data, nRanks * dataLen); printf("print rank=%d, nodes_span_all size=%zu, scclTopoNode_t size=%zu\n", rank, nodes_span_all.size(), sizeof(scclTopoNode_t)); char line[30]; sprintf(line, "print rank=%d: ", rank); int node_cnt = 0; for(int i = 0; i < nodes_span_all.size(); i++) { if(nodes_span_all[i] && nodes_span_all[i]->type > 0) { if(i < 64) { printf("============================&&============================\n"); physical_links::printTopoNode(nodes_span_all, i, line); printf("============================&&============================\n"); } else if(i < 128) { printf("============================((============================\n"); physical_links::printTopoNode(nodes_span_all, i, line); printf("============================))============================\n"); } else { printf("============================@@============================\n"); physical_links::printTopoNode(nodes_span_all, i, line); printf("============================@@============================\n"); } node_cnt += 1; } } printf("print rank=%d, node_cnt=%d\n", rank, node_cnt); } #endif return scclSuccess; } scclResult_t Graph::calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph) { // 通信路径计算的实现 std::cout << "Calculating communication paths..." << std::endl; // 调用pathFinder类,实现硬件路径搜索 auto path_finder = PathFinder(bootstrap_comm, this->node_info_vec, this->node_info_total_bytes); // 将搜索结果写入topo_graph中,并记录有效node SCCLCHECK(path_finder.computeTopoGpuP2pMap(topo_graph)); // topo_graph->printGPUPaths(); // 调用bootstrap类,将transport_map进行allgather统计 uint8_t* local_transport_map = topo_graph->getTransportMapRowStart(rank); SCCLCHECK(sccl_bootstrap->bootstrapAllGather(local_transport_map, topo_graph->transport_map.data(), nRanks * sizeof(uint8_t))); #if 1 // 打印transport_map if(bootstrap_comm->rank == 0) { SCCLCHECK(topo_graph->printTransportMap()); } #endif return scclSuccess; } scclResult_t Graph::searchLogicalTopology() { // 逻辑拓扑构建的实现 std::cout << "Building logical topology..." << std::endl; // 具体的实现细节 return scclSuccess; } scclResult_t Graph::calculateTopoChannels() { // 根据无向图计算topo路径的实现 std::cout << "Calculating topo paths based on undirected graph..." << std::endl; // 具体的实现细节 return scclSuccess; } ////////////////////////////////////////////////// private ////////////////////////////////////////////////// // TODO: 当前实现使用了较多的for循环,在节点数量较大时速度较慢,可以考虑采用cuda kernel /** * @brief 初始化并连接节点之间的链接 * * 该函数接收一个指向节点信息的字节数组的指针和节点信息的总字节数,用于初始化并连接节点之间的链接。 * 1.创建一个`ByteSpanArray`对象来管理节点信息的内存,然后根据节点的类型(GPU、PCI或NIC)将它们分类存储。 * 2.它使相同`interRank`下的GPU节点两两互联 * 3.遍历所有的`interRank`来合并具有相同`id`、`type`和`busIdStr`的PCI节点。 * 4.使CPU node即numa node的neighbors两两互联。 * 5.它使相同`deviceId`下的NIC节点两两互联。 * * @param node_info_vec 指向节点信息的字节数组的指针 * @param node_info_total_bytes 节点信息的总字节数 * @return scclResult_t 返回操作结果状态码: * - scclSuccess: 操作成功 * - scclError: 操作失败 */ scclResult_t Graph::bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes) { // 创建一个ByteSpanArray对象,用于管理节点信息的内存 ByteSpanArray node_info_span(node_info_vec, nRanks * node_info_total_bytes); // 用于将nodes的deviceId对应的node std::unordered_map> nodes_map_by_deviceId; // 用于将interRank内nodes的deviceSig对应的NIC节点连接 std::unordered_map> nic_nodes_by_deviceId; // 用于识别并连接节点内的GPU node std::vector> gpu_nodes_by_interRank(nInterRanks); // -------------------------- 1.遍历所有的节点信息,记录node -------------------------- // for(size_t i = 0; i < node_info_span.size(); ++i) { scclTopoNode_t* node = node_info_span[i]; // 跳过空节点、跳过没有busId的节点(如空节点或CPU) if(node->type <= 0) { continue; } uint64_t id = node->id; int interRank; physical_links::getIdComponents(id, &interRank); uint64_t deviceSig = id & 0xFFFFFFFFFF; // 计算 interRank(24bit) + hipDev(8bit) + deviceId(16bit) + terminalType(8bit) + numaId(8bit) // 选择type为GPU的节点 if(node->type == GPU) { if(interRank >= gpu_nodes_by_interRank.size()) { gpu_nodes_by_interRank.resize(interRank + 1); } gpu_nodes_by_interRank[interRank].push_back(node); } else if(node->type == NIC) { nic_nodes_by_deviceId[deviceSig].push_back(node); } nodes_map_by_deviceId[id].push_back(node); } // 合并id相同和busId相同的node for(auto& pair : nodes_map_by_deviceId) { auto& nodes = pair.second; for(size_t i = 0; i < nodes.size(); ++i) { for(size_t j = i + 1; j < nodes.size(); ++j) { // if(nodes[i]->id == nodes[j]->id && nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) { // SCCLCHECK(nodes[i]->combineNode(nodes[j])); // } if(nodes[i]->id == nodes[j]->id) { if(nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) { SCCLCHECK(nodes[i]->combineNode(nodes[j])); } else { #if 0 int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId; physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId); int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId; physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId); char busIdStr_i[17], busIdStr_j[17]; int64ToBusId(nodes[i]->busId, busIdStr_i); int64ToBusId(nodes[j]->busId, busIdStr_j); printf("same Id but different type or busId: %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, " "H:%d, N:%d, busIdStr:%s)\n", nodes[i]->id, tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId, busIdStr_i, nodes[j]->id, tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId, busIdStr_j); #endif } } } } } // 遍历所有的节点信息,将CPU的所有neighbor node两两互联 for(size_t i = 0; i < node_info_span.size(); ++i) { scclTopoNode_t* node = node_info_span[i]; // 跳过空节点、跳过没有busId的节点(如空节点或CPU) if(node->type == CPU) { for(size_t i = 0; i < node->neighborCount; ++i) { for(size_t j = i + 1; j < node->neighborCount; ++j) { // 使用unordered_map来加速查找 auto it_i = nodes_map_by_deviceId.find(node->neighbors[i]); auto it_j = nodes_map_by_deviceId.find(node->neighbors[j]); if(it_i != nodes_map_by_deviceId.end() && it_j != nodes_map_by_deviceId.end()) { scclTopoNode_t* neighbor_i = nullptr; scclTopoNode_t* neighbor_j = nullptr; for(auto& n : it_i->second) { if(n->type > 0) { neighbor_i = n; break; } } for(auto& n : it_j->second) { if(n->type > 0) { neighbor_j = n; break; } } if(neighbor_i && neighbor_j) { neighbor_i->addNeighbor(neighbor_j->id); neighbor_j->addNeighbor(neighbor_i->id); #if 0 { int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId; physical_links::getIdComponents( neighbor_i->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId); int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId; physical_links::getIdComponents( neighbor_j->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId); char busIdStr_i[17], busIdStr_j[17]; int64ToBusId(neighbor_i->busId, busIdStr_i); int64ToBusId(neighbor_j->busId, busIdStr_j); printf("connect CPU neighbors %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, " "N:%d, busIdStr:%s)\n", neighbor_i->id, tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId, busIdStr_i, neighbor_j->id, tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId, busIdStr_j); } #endif } } } } } } // 使相同interRank下的GPU node两两互联 for(const auto& nodes : gpu_nodes_by_interRank) { for(size_t i = 0; i < nodes.size(); ++i) { for(size_t j = i + 1; j < nodes.size(); ++j) { nodes[i]->addNeighbor(nodes[j]->id); nodes[j]->addNeighbor(nodes[i]->id); #if 0 { int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId; physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId); int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId; physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId); char busIdStr_i[17], busIdStr_j[17]; int64ToBusId(nodes[i]->busId, busIdStr_i); int64ToBusId(nodes[j]->busId, busIdStr_j); printf("connect GPU %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s)\n", nodes[i]->id, tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId, busIdStr_i, nodes[j]->id, tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId, busIdStr_j); } #endif } } } // 使相同deviceId下的NIC节点两两互联 for(const auto& pair : nic_nodes_by_deviceId) { const auto& nodes = pair.second; for(size_t i = 0; i < nodes.size(); ++i) { for(size_t j = i + 1; j < nodes.size(); ++j) { // 在deviceId相同的情况下,比较busIdStr if(nodes[i]->busId == nodes[j]->busId) { nodes[i]->addNeighbor(nodes[j]->id); nodes[j]->addNeighbor(nodes[i]->id); #if 0 { int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId; physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId); int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId; physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId); char busIdStr_i[17], busIdStr_j[17]; int64ToBusId(nodes[i]->busId, busIdStr_i); int64ToBusId(nodes[j]->busId, busIdStr_j); printf("connect NIC interRank %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, " "busIdStr:%s)\n", nodes[i]->id, tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId, busIdStr_i, nodes[j]->id, tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId, busIdStr_j); } #endif } } } } return scclSuccess; } } // namespace graph } // namespace topology } // namespace hardware } // namespace sccl