Commit 58d57301 authored by lishen's avatar lishen
Browse files

将建图过程中原本在bootstrap中的一部分完全移动到graph中

parent 708aae12
......@@ -13,13 +13,14 @@ hipcc ./3_mpi_init_mpi_init_step2_graph.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/physical_links.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/thread_pool.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/physical_links.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/graph_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/graph.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/paths.cpp \
-o 3_mpi_init_mpi_init_step2_graph \
......
......@@ -39,15 +39,19 @@ scclResult_t sccl_init(const scclUniqueId* unique_id, int rank, int nRanks) {
SCCLCHECK(sccl_bootstrap->init(bootstrap_comm));
printf("init pos 1\n");
// -------------------------- 3.MPI 建图 ----------------------------------- //
// -------------------------- 3.拓扑建图 ----------------------------------- //
topo_graph = new scclTopoGraph_t(nRanks);
auto sccl_graph = std::make_unique<topology::graph::Graph>(rank, nRanks);
auto sccl_graph = std::make_unique<topology::graph::Graph>(sccl_bootstrap.get());
printf("init pos 2\n");
// 计算通信路径
SCCLCHECK(sccl_graph->calculateCommunicationPaths(bootstrap_comm, topo_graph, sccl_bootstrap.get()));
// 计算拓扑图
SCCLCHECK(sccl_graph->establishGraph(bootstrap_comm));
printf("init pos 3\n");
// 计算通信路径
SCCLCHECK(sccl_graph->calculateCommunicationPaths(bootstrap_comm, topo_graph));
printf("init pos 4\n");
// -------------------------- 3.MPI allgather设置unique_id的整合 ----------------------------------- //
// -------------------------- 5.根据各个节点的基础信息计算topo结果 ----------------------------------- //
......
......@@ -278,17 +278,9 @@ scclResult_t bootstrapCreateRoot(BootstrapHandle_t* handle) {
////////////////////////////// 结构体定义 //////////////////////////////
// Definition of the scclRankPhysSet constructor.
// It only pre-allocates storage for the containers that later receive the all-gathered data.
// NOTE(review): reserve() followed by clear() leaves size() == 0; elsewhere in this file the
// gathered data is written through rank_info_vec.data(), so the vector never reports the
// elements it holds - confirm whether resize() was intended.
scclRankPhysSet::scclRankPhysSet(int nRanks, int nLocalRanks)
: nRanks(nRanks), nLocalRanks(nLocalRanks), node_info_total_bytes(sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks) {
printf("scclRankPhysSet 构造函数\n");
scclRankPhysSet::scclRankPhysSet(int nRanks) {
rank_info_vec.reserve(nRanks); // reserve capacity for one scclRankInfo_t per rank
rank_info_vec.clear();
// The per-rank byte size below uses the same formula as scclNodeInfo_t
node_info_vec.reserve(nRanks * node_info_total_bytes); // reserve capacity for every rank's node bytes
node_info_vec.clear();
printf("scclRankPhysSet 预留空间并初始化node_info_vec, nRanks * node_info_total_bytes=%zu\n", nRanks * node_info_total_bytes);
}
void BootstrapComm::init(int rank, int nRanks, int localRank, int nLocalRanks) {
......@@ -299,7 +291,7 @@ void BootstrapComm::init(int rank, int nRanks, int localRank, int nLocalRanks) {
this->nLocalRanks = nLocalRanks;
this->interRank = rank / nLocalRanks;
this->nInterRanks = nRanks / nLocalRanks;
rank_phys_set = new scclRankPhysSet(nRanks, nLocalRanks); // 假设需要动态分配
rank_phys_set = new scclRankPhysSet(nRanks); // 假设需要动态分配
};
void BootstrapComm::destroy() {
......@@ -347,7 +339,6 @@ scclResult_t Bootstrap::init(BootstrapComm_t* bootstrap_comm) {
// -------------------------- 3.设置本地localRank的BootstrapComm信息 ----------------------------------- //
// 初始化BootstrapComm类
bootstrap_comm->init(rank, nRanks, localRank, nLocalRanks);
if(CPU_COUNT(&bootstrap_comm->cpuAffinity)) {
sched_setaffinity(0, sizeof(cpu_set_t), &bootstrap_comm->cpuAffinity);
}
......@@ -379,105 +370,12 @@ scclResult_t Bootstrap::init(BootstrapComm_t* bootstrap_comm) {
local_rank_info.hostHash = node_basic.hostHash;
SCCLCHECK(bootstrapCommInitNodeInfo(bootstrap_comm->scclNet, &local_rank_info));
memcpy(&(local_rank_info.cpu.listen_sock), &(node_basic.sock), sizeof(scclSocket_t));
//////// 初始化topo node ////////
scclNodeInfo_t local_topo_nodes(nLocalRanks);
// 使用ByteSpan替代std::vector,并指定容量为pNodes_len
ByteSpanVector<scclTopoNode_t> nodes_span((void*)local_topo_nodes.nodes, local_topo_nodes.totalByteSize);
#if 1
printf("devices_num=%d, local_rank_info.net.count=%d\n", bootstrap_comm->deviceCnt, local_rank_info.net.count);
#endif
// 遍历所有的GPU的pciPath,添加topo node
for(int r = localRank; r < devices_num; r += nLocalRanks) {
auto gpu_path = physical_links::getGpuPciPath(r);
physical_links::generate_topo_nodes(gpu_path, interRank, r, nodes_span);
delete(gpu_path);
}
// 遍历所有的NIC的pciPath,添加topo node
for(int r = localRank; r < local_rank_info.net.count; r += nLocalRanks) {
auto net_path = physical_links::getNetPciPath(bootstrap_comm->scclNet, r);
physical_links::generate_topo_nodes(net_path, interRank, r, nodes_span);
delete(net_path);
}
#if 0
if(interRank == 0) {
ByteSpanArray<scclTopoNode_t> nodes_span_array(nodes_span.data(), local_topo_nodes.totalByteSize);
printf("print rank=%d, nodes_span size=%zu\n", rank, nodes_span.size());
char line[30];
sprintf(line, "print rank=%d: ", rank);
for(int i = 0; i < nodes_span.size(); i++) {
printf("============================**============================\n");
physical_links::printTopoNode(nodes_span_array, i, line);
printf("============================**============================\n");
}
}
#endif
#if 0
// 尝试采用软件识别GPU之间互联
for(int i = 0; i < bootstrap_comm->deviceCnt; i++) {
// if(i != bootstrap_comm->hipDev) {
RSMI_IO_LINK_TYPE rsmi_type;
int hops, count;
if(rocm_smi_getLinkInfo(bootstrap_comm->hipDev, i, &rsmi_type, &hops, &count) == scclSuccess) {
printf("rank=%d, i=%d, dev=%d, rsmi_type=%d, hops=%d, count=%d\n", rank, i, bootstrap_comm->hipDev, rsmi_type, hops, count);
// if(rsmi_type == RSMI_IOLINK_TYPE_XGMI && hops <= 2) {
// if(1) {
// char busIdStr[] = "00000000:00:00.0";
// SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
// char lowerId[16];
// for(int c = 0; c < 16; c++) {
// lowerId[c] = tolower(busIdStr[c]);
// if(busIdStr[c] == 0)
// break;
// }
// }
} else {
printf("rsmi get type fail\n");
}
// }
}
#endif
// -------------------------- 4.BootstrapComm信息的allgather ----------------------------------- //
bootstrapCommAllGather(&local_rank_info, &local_topo_nodes, bootstrap_comm->rank_phys_set);
// TODO: 目前手动将节点内的GPU进行mesh连接,因为无法从/sys/device中获取NIC的拓扑信息,rsmi函数也无法获取NIC的拓扑信息。后续优化
bootstrapNodesLink(bootstrap_comm->rank_phys_set->node_info_vec.data(), bootstrap_comm->rank_phys_set->node_info_total_bytes);
#if 0
if(rank == 1) {
size_t dataLen = bootstrap_comm->rank_phys_set->node_info_total_bytes;
printf("nRanks * bootstrap_comm->rank_phys_set->node_info_total_bytes=%zu, %lu\n", dataLen, nRanks * dataLen);
auto node_info_data = reinterpret_cast<char*>(bootstrap_comm->rank_phys_set->node_info_vec.data());
ByteSpanArray<scclTopoNode_t> nodes_span_all(node_info_data, nRanks * dataLen);
printf("print rank=%d, nodes_span_all size=%zu, scclTopoNode_t size=%zu\n", rank, nodes_span_all.size(), sizeof(scclTopoNode_t));
char line[30];
sprintf(line, "print rank=%d: ", rank);
int node_cnt = 0;
for(int i = 0; i < nodes_span_all.size(); i++) {
if(nodes_span_all[i] && nodes_span_all[i]->type > 0) {
if(i < 64) {
printf("============================&&============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================&&============================\n");
} else if(i < 128) {
printf("============================((============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================))============================\n");
} else {
printf("============================@@============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================@@============================\n");
}
node_cnt += 1;
}
}
printf("print rank=%d, node_cnt=%d\n", rank, node_cnt);
}
#endif
// 将每个节点的`rank_info`信息收集到`rank_phys_set`中,以便后续使用
SCCLCHECK(bootstrapAllGather(&local_rank_info, bootstrap_comm->rank_phys_set->rank_info_vec.data(), sizeof(scclRankInfo_t)));
// 设置初始化标志
asm_ops::st_release_sys_global(&socketInitDone, true);
......@@ -545,6 +443,7 @@ scclResult_t Bootstrap::bootstrapRootGatherAndBroadcast(BootstrapNodeBasic_t* se
SCCLCHECK(bootstrapNet::bootstrapNetRecv(accept_manager.getSocket(), all_node_basic, all_node_basic_size));
}
printf("all_node_basic_size=%d\n", all_node_basic_size);
// ------------- 5.nLocalRanks==0时,将所有rank的ip数据广播给节点内其他rank ------------- //
ipcsocket = new scclIpcSocket_t(localRank, nLocalRanks, /*hash*/ root_handle->magic);
ipcsocket->scclIpcSocketBroadcast(all_node_basic, all_node_basic_size, /*localRank root*/ 0);
......@@ -618,28 +517,6 @@ scclResult_t Bootstrap::bootstrapCommInitNodeInfo(scclNet_t* scclNet, scclRankIn
return scclSuccess;
}
/**
 * @brief Gathers the rank descriptor and the topology-node payload of every rank.
 *
 * Thin convenience wrapper around two consecutive `bootstrapAllGather` calls:
 * the first collects each rank's `scclRankInfo_t` into `rank_phys_set->rank_info_vec`,
 * the second collects each rank's node array into `rank_phys_set->node_info_vec`.
 *
 * @param rank_info      Descriptor of the calling rank (source of the first gather).
 * @param node_info      Node container of the calling rank; its `nodes` array is the
 *                       source of the second gather.
 * @param rank_phys_set  Destination set receiving the data of all ranks.
 * @return scclResult_t  `scclSuccess` when both gathers succeed; a failing gather is
 *                       handled by `SCCLCHECK`.
 *
 * @note `rank_info`, `node_info` and `rank_phys_set` must already be initialised, and the
 *       destination buffers must be large enough for the data of all ranks.
 */
scclResult_t Bootstrap::bootstrapCommAllGather(scclRankInfo_t* rank_info, scclNodeInfo_t* node_info, scclRankPhysSet_t* rank_phys_set) {
    // Step 1: fixed-size rank descriptors.
    auto* const rank_dst = rank_phys_set->rank_info_vec.data();
    SCCLCHECK(bootstrapAllGather(rank_info, rank_dst, sizeof(scclRankInfo_t)));

    // Step 2: per-rank topology-node byte blocks.
    auto* const node_dst = rank_phys_set->node_info_vec.data();
    SCCLCHECK(bootstrapAllGather(node_info->nodes, node_dst, rank_phys_set->node_info_total_bytes));

    return scclSuccess;
}
// TODO: 后续可以采用优化,先节点内allgather,再节点间的allgather,最后节点内的Broadcast。优化的算法并保证正确性
/**
* @brief 实现跨节点的AllGather通信操作
......@@ -659,7 +536,7 @@ scclResult_t Bootstrap::bootstrapCommAllGather(scclRankInfo_t* rank_info, scclNo
* 此外,该函数还假设所有节点的基本信息(如套接字地址)已经通过其他途径正确获取并存储在all_node_basic向量中。
* 在节点间通信中,使用了Ring AllGather算法,该算法在nRanks特别大的时候可能不是最优的选择,可以考虑进一步优化算法以减少通信次数。
*/
scclResult_t Bootstrap::bootstrapAllGather(const void* src_data, void* dst_data, int data_size) {
scclResult_t Bootstrap::bootstrapAllGather(const void* src_data, void* dst_data, int data_size) const {
// 数据准备
size_t inter_data_len = nLocalRanks * data_size; // 节点间传输时每个子块的大小
auto all_recv_data = reinterpret_cast<char*>(dst_data);
......@@ -703,249 +580,6 @@ scclResult_t Bootstrap::bootstrapAllGather(const void* src_data, void* dst_data,
return scclSuccess;
}
// TODO: this implementation is built from nested for-loops and gets slow when the number of
//       nodes is large; offloading the work to a GPU kernel could be considered.

// Set to 1 to print every pair of nodes that is linked (or that shares an id but cannot be merged).
#ifndef SCCL_BOOTSTRAP_NODES_LINK_DEBUG
#define SCCL_BOOTSTRAP_NODES_LINK_DEBUG 0
#endif

#if SCCL_BOOTSTRAP_NODES_LINK_DEBUG
// Debug helper: prints the decoded id components and bus id of a node pair, prefixed by `what`.
static void debugPrintNodePair(const char* what, const scclTopoNode_t* a, const scclTopoNode_t* b) {
    int a_interRank, a_deviceValue, a_terminalType, a_hipDev, a_numaId;
    physical_links::getIdComponents(a->id, &a_interRank, &a_deviceValue, &a_terminalType, &a_hipDev, &a_numaId);
    int b_interRank, b_deviceValue, b_terminalType, b_hipDev, b_numaId;
    physical_links::getIdComponents(b->id, &b_interRank, &b_deviceValue, &b_terminalType, &b_hipDev, &b_numaId);
    char busIdStr_a[17], busIdStr_b[17];
    int64ToBusId(a->busId, busIdStr_a);
    int64ToBusId(b->busId, busIdStr_b);
    printf("%s %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s)\n",
           what,
           a->id,
           a_interRank,
           a_deviceValue,
           a_terminalType,
           a_hipDev,
           a_numaId,
           busIdStr_a,
           b->id,
           b_interRank,
           b_deviceValue,
           b_terminalType,
           b_hipDev,
           b_numaId,
           busIdStr_b);
}
#endif

/**
 * @brief Adds the links between gathered topology nodes that per-rank discovery did not produce.
 *
 * The buffer is viewed as an array of scclTopoNode_t covering nRanks slices of
 * node_info_total_bytes bytes each. The function then:
 * 1. Indexes every node with type > 0 by id, groups GPU nodes by the interRank decoded
 *    from the id, and groups NIC nodes by the low 40 bits of the id.
 * 2. Calls combineNode() on nodes that share id, type and busId.
 * 3. For every CPU node, connects each pair of its neighbors to one another.
 * 4. Connects every pair of GPU nodes that share an interRank.
 * 5. Connects NIC nodes that share the 40-bit signature and have the same busId.
 *
 * @param node_info_vec          Pointer to the gathered node buffer (all ranks).
 * @param node_info_total_bytes  Size in bytes of ONE rank's slice of that buffer.
 * @return scclSuccess on success; a failing combineNode() is handled by SCCLCHECK.
 */
scclResult_t Bootstrap::bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes) {
    // View over the whole gathered buffer. The byte count is computed in size_t so that
    // nRanks * node_info_total_bytes cannot overflow int.
    ByteSpanArray<scclTopoNode_t> node_info_span(node_info_vec, static_cast<size_t>(nRanks) * node_info_total_bytes);

    // id -> all nodes carrying that id (the same device may be reported by several ranks)
    std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nodes_map_by_deviceId;
    // low 40 bits of the id -> NIC nodes carrying that signature
    std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nic_nodes_by_deviceId;
    // interRank -> GPU nodes of that machine
    std::vector<std::vector<scclTopoNode_t*>> gpu_nodes_by_interRank(nInterRanks > 0 ? nInterRanks : 0);

    // -------------------------- 1. Index all valid nodes -------------------------- //
    for(size_t idx = 0; idx < node_info_span.size(); ++idx) {
        scclTopoNode_t* node = node_info_span[idx];
        // Skip missing slots and empty nodes (type <= 0).
        if(node == nullptr || node->type <= 0) {
            continue;
        }

        const uint64_t id = node->id;
        if(node->type == GPU) {
            // Named nodeInterRank so it does not shadow the member this->interRank.
            int nodeInterRank = 0;
            physical_links::getIdComponents(id, &nodeInterRank);
            if(nodeInterRank < 0) {
                // A negative value would wrap in the size comparison and shrink/overrun the table.
                printf("bootstrapNodesLink: GPU node %lu has invalid interRank %d, not meshed\n", (unsigned long)id, nodeInterRank);
            } else {
                const size_t slot = static_cast<size_t>(nodeInterRank);
                if(slot >= gpu_nodes_by_interRank.size()) {
                    gpu_nodes_by_interRank.resize(slot + 1);
                }
                gpu_nodes_by_interRank[slot].push_back(node);
            }
        } else if(node->type == NIC) {
            const uint64_t deviceSig = id & 0xFFFFFFFFFF; // low 40 bits of the id
            nic_nodes_by_deviceId[deviceSig].push_back(node);
        }
        nodes_map_by_deviceId[id].push_back(node);
    }

    // -------------------------- 2. Merge duplicates (same id, type and busId) -------------------------- //
    for(auto& entry : nodes_map_by_deviceId) {
        auto& same_id_nodes = entry.second;
        for(size_t a = 0; a < same_id_nodes.size(); ++a) {
            for(size_t b = a + 1; b < same_id_nodes.size(); ++b) {
                scclTopoNode_t* lhs = same_id_nodes[a];
                scclTopoNode_t* rhs = same_id_nodes[b];
                // The id is re-checked because combineNode() may have altered an earlier entry.
                if(lhs->id != rhs->id) {
                    continue;
                }
                if(lhs->type == rhs->type && lhs->busId == rhs->busId) {
                    SCCLCHECK(lhs->combineNode(rhs));
                } else {
#if SCCL_BOOTSTRAP_NODES_LINK_DEBUG
                    debugPrintNodePair("same Id but different type or busId:", lhs, rhs);
#endif
                }
            }
        }
    }

    // Returns the first node registered under `id` that is still valid (type > 0), or nullptr.
    auto firstLiveNode = [&nodes_map_by_deviceId](uint64_t id) -> scclTopoNode_t* {
        auto it = nodes_map_by_deviceId.find(id);
        if(it == nodes_map_by_deviceId.end()) {
            return nullptr;
        }
        for(scclTopoNode_t* candidate : it->second) {
            if(candidate->type > 0) {
                return candidate;
            }
        }
        return nullptr;
    };

    // -------------------------- 3. Fully connect the neighbors of every CPU node -------------------------- //
    for(size_t idx = 0; idx < node_info_span.size(); ++idx) {
        scclTopoNode_t* node = node_info_span[idx];
        // Only CPU nodes are processed here.
        if(node == nullptr || node->type != CPU) {
            continue;
        }
        for(size_t a = 0; a < node->neighborCount; ++a) {
            for(size_t b = a + 1; b < node->neighborCount; ++b) {
                scclTopoNode_t* neighbor_a = firstLiveNode(node->neighbors[a]);
                scclTopoNode_t* neighbor_b = firstLiveNode(node->neighbors[b]);
                if(neighbor_a == nullptr || neighbor_b == nullptr) {
                    continue;
                }
                neighbor_a->addNeighbor(neighbor_b->id);
                neighbor_b->addNeighbor(neighbor_a->id);
#if SCCL_BOOTSTRAP_NODES_LINK_DEBUG
                debugPrintNodePair("connect CPU neighbors", neighbor_a, neighbor_b);
#endif
            }
        }
    }

    // -------------------------- 4. Mesh the GPU nodes of each interRank -------------------------- //
    for(const auto& gpu_nodes : gpu_nodes_by_interRank) {
        for(size_t a = 0; a < gpu_nodes.size(); ++a) {
            for(size_t b = a + 1; b < gpu_nodes.size(); ++b) {
                gpu_nodes[a]->addNeighbor(gpu_nodes[b]->id);
                gpu_nodes[b]->addNeighbor(gpu_nodes[a]->id);
#if SCCL_BOOTSTRAP_NODES_LINK_DEBUG
                debugPrintNodePair("connect GPU", gpu_nodes[a], gpu_nodes[b]);
#endif
            }
        }
    }

    // -------------------------- 5. Connect NIC nodes with equal signature and busId -------------------------- //
    for(const auto& entry : nic_nodes_by_deviceId) {
        const auto& nic_nodes = entry.second;
        for(size_t a = 0; a < nic_nodes.size(); ++a) {
            for(size_t b = a + 1; b < nic_nodes.size(); ++b) {
                // The signature already matches; additionally require the same bus id.
                if(nic_nodes[a]->busId != nic_nodes[b]->busId) {
                    continue;
                }
                nic_nodes[a]->addNeighbor(nic_nodes[b]->id);
                nic_nodes[b]->addNeighbor(nic_nodes[a]->id);
#if SCCL_BOOTSTRAP_NODES_LINK_DEBUG
                debugPrintNodePair("connect NIC interRank", nic_nodes[a], nic_nodes[b]);
#endif
            }
        }
    }
    return scclSuccess;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
// 函数:打印 scclRankInfo 结构体的信息
scclResult_t printRankInfo(const std::string& prefix, scclRankInfo_t* info) {
......
......@@ -7,7 +7,6 @@
#include "bootstrap_net.h"
#include "thread_pool.h"
#include "ipc_socket.h"
#include "physical_links.h"
namespace sccl {
namespace hardware {
......@@ -15,7 +14,6 @@ namespace topology {
namespace bootstrap {
typedef sccl::hardware::net::ipc_socket::scclIpcSocket_t scclIpcSocket_t;
typedef physical_links::scclTopoNode_t scclTopoNode_t;
///////////////////////////////////// 用于初始化时的功能函数 //////////////////////////////////////////
scclResult_t bootstrapGetUniqueId(BootstrapHandle_t* handle);
......@@ -56,41 +54,13 @@ typedef struct scclRankInfo {
uint64_t pidHash = 0; // 进程 ID 哈希值
} scclRankInfo_t;
// scclNodeInfo stores the topology-graph nodes contributed by one rank.
// TODO: every rank currently needs 4k+ bytes of node_info, which adds up when the number of
//       cards is large; this could be optimised, or at least not kept as a global.
typedef struct scclNodeInfo {
    scclTopoNode_t* nodes; // owning pointer to a zero-initialised array of scclTopoNode_t (nullptr when empty)
    int nLocalRanks;       // number of ranks on the local machine, as passed to the constructor
    int totalByteSize;     // size in bytes of the `nodes` allocation; 0 when nothing is allocated

    // Allocates a zeroed node array sized sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks.
    // A non-positive nLocalRanks, or a failed allocation, yields an empty object
    // (nodes == nullptr, totalByteSize == 0) instead of dividing by zero or advertising
    // a size that has no backing memory.
    scclNodeInfo(int nLocalRanks)
        : nodes(nullptr),
          nLocalRanks(nLocalRanks),
          totalByteSize(nLocalRanks > 0 ? static_cast<int>(sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks) : 0) {
        if(totalByteSize > 0) {
            nodes = reinterpret_cast<scclTopoNode_t*>(calloc(1, totalByteSize));
            if(nodes == nullptr) {
                totalByteSize = 0;
            }
        }
    }

    // The object owns `nodes`; a member-wise copy would free the same array twice.
    scclNodeInfo(const scclNodeInfo&)            = delete;
    scclNodeInfo& operator=(const scclNodeInfo&) = delete;

    // Releases the node array (free(nullptr) is a no-op).
    virtual ~scclNodeInfo() { free(nodes); }
} scclNodeInfo_t;
// Aggregated information about all ranks, filled by the bootstrap all-gather.
typedef struct scclRankPhysSet {
// Constructor declaration
scclRankPhysSet(int nRanks, int nLocalRanks);
std::vector<scclRankInfo_t> rank_info_vec;
std::vector<char> node_info_vec; // conceptually a sequence of scclNodeInfo_t payloads; kept as raw bytes because std::vector cannot hold the variable-length scclNodeInfo_t
scclRankPhysSet(int nRanks);
public:
int nRanks = 0; // total number of nodes (ranks) overall
int nLocalRanks = 0; // number of ranks on the local compute node
size_t node_info_total_bytes = 0; // actual byte size of one variable-length scclNodeInfo_t payload
std::vector<scclRankInfo_t> rank_info_vec;
} scclRankPhysSet_t;
// BootstrapComm 结构体定义,用于存储引导通信信息
......@@ -126,7 +96,7 @@ public:
scclResult_t init(BootstrapComm_t* bootstrap_comm);
// 实现跨节点的AllGather通信操作
scclResult_t bootstrapAllGather(const void* src_data, void* dst_data, int data_size);
scclResult_t bootstrapAllGather(const void* src_data, void* dst_data, int data_size) const;
private:
// 执行根节点的聚集和广播操作
......@@ -135,17 +105,12 @@ private:
// 初始化节点通信信息
scclResult_t bootstrapCommInitNodeInfo(scclNet_t* scclNet, scclRankInfo_t* rank_info);
// 实现rank_info信息的节点间通信的AllGather操作
scclResult_t bootstrapCommAllGather(scclRankInfo_t* rank_info, scclNodeInfo_t* node_info, scclRankPhysSet_t* rank_phys_set);
// 额外处理nRanks个nodes的连接关系
scclResult_t bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes);
private:
public:
int rank, nRanks; // 初始化阶段获取MPI的值
int localRank, nLocalRanks; // 通过bootstrapRootGatherAndBroadcast函数确定值
int interRank, nInterRanks; // 整个节点在全部节点中的位置
private:
// TODO: 用于控制套接字终端的变量,目前不知道在哪里使用
volatile uint32_t* abortFlag; // 中止标志,非阻塞套接字设置
......
#include <iostream>
#include "base.h"
#include "graph.h"
#include "paths.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
Graph::Graph(int rank, int nRanks) : rank(rank), nRanks(nRanks) {
// 构造函数的实现
// Constructor.
// Copies the rank layout (rank/nRanks, localRank/nLocalRanks, interRank/nInterRanks) out of the
// given Bootstrap object, keeps the pointer for later all-gather calls, and reserves the buffer
// that will receive the topology nodes of every rank.
//
// @param bootstrap  Initialised Bootstrap instance; must be non-null and outlive this Graph.
Graph::Graph(const Bootstrap* bootstrap)
    // Every initializer reads through the parameter, not through the member sccl_bootstrap:
    // members are initialised in declaration order, so reading the member here would only be
    // safe if it happened to be declared first in the class.
    : sccl_bootstrap(bootstrap),
      rank(bootstrap->rank),
      nRanks(bootstrap->nRanks),
      localRank(bootstrap->localRank),
      nLocalRanks(bootstrap->nLocalRanks),
      interRank(bootstrap->interRank),
      nInterRanks(bootstrap->nInterRanks) {
    // Same per-rank byte budget as scclNodeInfo_t; guarded so nLocalRanks == 0 cannot divide by zero.
    this->node_info_total_bytes = nLocalRanks > 0 ? sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks : 0;
    // Reserve room for the nodes of all ranks (capacity only; size() stays 0).
    node_info_vec.reserve(static_cast<size_t>(nRanks) * node_info_total_bytes);
}
Graph::~Graph() {
// 析构函数的实现
// Destructor: nothing to release explicitly, the members clean up after themselves.
Graph::~Graph() = default;
/**
 * @brief Builds the physical topology node set shared by all ranks.
 *
 * 1. Generates topo nodes from the PCI paths of the GPUs and NICs handled by this rank
 *    (devices localRank, localRank + nLocalRanks, ...).
 * 2. All-gathers every rank's node array into this->node_info_vec.
 * 3. Calls bootstrapNodesLink() to add the links that discovery alone does not provide.
 *
 * @param bootstrap_comm  Bootstrap communicator; supplies deviceCnt, scclNet and the gathered rank infos.
 * @return scclSuccess on success; failures of the gather or the linking step are handled by SCCLCHECK.
 */
scclResult_t Graph::establishGraph(const BootstrapComm_t* bootstrap_comm) {
    //////// Initialise the local topo nodes ////////
    scclNodeInfo_t local_topo_nodes(nLocalRanks);
    // Append-style view over the locally owned node array.
    ByteSpanVector<scclTopoNode_t> nodes_span(static_cast<void*>(local_topo_nodes.nodes), local_topo_nodes.totalByteSize);

    // Walk the PCI path of every GPU assigned to this rank and add its topo nodes.
    for(int r = localRank; r < bootstrap_comm->deviceCnt; r += nLocalRanks) {
        auto gpu_path = physical_links::getGpuPciPath(r);
        physical_links::generate_topo_nodes(gpu_path, this->interRank, r, nodes_span);
        delete(gpu_path);
    }

    // Walk the PCI path of every NIC assigned to this rank and add its topo nodes.
    // The entry is read through data(): in this file rank_info_vec is only reserve()d and then
    // filled through data(), so its size() may not cover `rank` and operator[] could trip
    // bounds-checked standard-library builds.
    bootstrap::scclRankInfo_t local_rank_info = bootstrap_comm->rank_phys_set->rank_info_vec.data()[this->rank];
    for(int r = localRank; r < local_rank_info.net.count; r += nLocalRanks) {
        auto net_path = physical_links::getNetPciPath(bootstrap_comm->scclNet, r);
        physical_links::generate_topo_nodes(net_path, this->interRank, r, nodes_span);
        delete(net_path);
    }

#if 0
    if(interRank == 0) {
        char line[30];
        sprintf(line, "print rank=%d: ", rank);
        bootstrap::printRankInfo(std::string(line), &local_rank_info);
    }
#endif

#if 0
    if(interRank == 0) {
        ByteSpanArray<scclTopoNode_t> nodes_span_array(nodes_span.data(), local_topo_nodes.totalByteSize);
        printf("print rank=%d, nodes_span size=%zu\n", rank, nodes_span.size());
        char line[30];
        sprintf(line, "print rank=%d: ", rank);
        for(int i = 0; i < nodes_span.size(); i++) {
            printf("============================**============================\n");
            physical_links::printTopoNode(nodes_span_array, i, line);
            printf("============================**============================\n");
        }
    }
#endif

#if 0
    // Experiment: detect GPU-to-GPU interconnects in software via rocm-smi
    for(int i = 0; i < bootstrap_comm->deviceCnt; i++) {
        // if(i != bootstrap_comm->hipDev) {
        RSMI_IO_LINK_TYPE rsmi_type;
        int hops, count;
        if(rocm_smi_getLinkInfo(bootstrap_comm->hipDev, i, &rsmi_type, &hops, &count) == scclSuccess) {
            printf("rank=%d, i=%d, dev=%d, rsmi_type=%d, hops=%d, count=%d\n", rank, i, bootstrap_comm->hipDev, rsmi_type, hops, count);
            // if(rsmi_type == RSMI_IOLINK_TYPE_XGMI && hops <= 2) {
            // if(1) {
            // char busIdStr[] = "00000000:00:00.0";
            // SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
            // char lowerId[16];
            // for(int c = 0; c < 16; c++) {
            // lowerId[c] = tolower(busIdStr[c]);
            // if(busIdStr[c] == 0)
            // break;
            // }
            // }
        } else {
            printf("rsmi get type fail\n");
        }
        // }
    }
#endif

    // -------------------------- 4. All-gather of the node information ----------------------------------- //
    // The constructor only reserve()s node_info_vec. Give it its real size before the gather
    // writes through data(), so the stored bytes are inside the vector's valid range and
    // size() reflects them for later consumers.
    const size_t gathered_bytes = static_cast<size_t>(nRanks) * this->node_info_total_bytes;
    if(this->node_info_vec.size() < gathered_bytes) {
        this->node_info_vec.resize(gathered_bytes);
    }
    SCCLCHECK(sccl_bootstrap->bootstrapAllGather(local_topo_nodes.nodes, this->node_info_vec.data(), this->node_info_total_bytes));

    // TODO: GPUs inside a machine are currently mesh-connected by hand, because the NIC topology
    //       can be obtained neither from /sys/device nor through the rsmi functions. Optimise later.
    SCCLCHECK(bootstrapNodesLink(this->node_info_vec.data(), this->node_info_total_bytes));

#if 0
    if(rank == 1) {
        size_t dataLen = this->node_info_total_bytes;
        printf("nRanks * this->node_info_total_bytes=%zu, %lu\n", dataLen, nRanks * dataLen);
        auto node_info_data = reinterpret_cast<char*>(this->node_info_vec.data());
        ByteSpanArray<scclTopoNode_t> nodes_span_all(node_info_data, nRanks * dataLen);
        printf("print rank=%d, nodes_span_all size=%zu, scclTopoNode_t size=%zu\n", rank, nodes_span_all.size(), sizeof(scclTopoNode_t));
        char line[30];
        sprintf(line, "print rank=%d: ", rank);
        int node_cnt = 0;
        for(int i = 0; i < nodes_span_all.size(); i++) {
            if(nodes_span_all[i] && nodes_span_all[i]->type > 0) {
                if(i < 64) {
                    printf("============================&&============================\n");
                    physical_links::printTopoNode(nodes_span_all, i, line);
                    printf("============================&&============================\n");
                } else if(i < 128) {
                    printf("============================((============================\n");
                    physical_links::printTopoNode(nodes_span_all, i, line);
                    printf("============================))============================\n");
                } else {
                    printf("============================@@============================\n");
                    physical_links::printTopoNode(nodes_span_all, i, line);
                    printf("============================@@============================\n");
                }
                node_cnt += 1;
            }
        }
        printf("print rank=%d, node_cnt=%d\n", rank, node_cnt);
    }
#endif

    return scclSuccess;
}
// Computes the GPU peer-to-peer communication paths and shares the result between ranks:
// a PathFinder fills topo_graph, then this rank's transport_map row is all-gathered so that
// topo_graph->transport_map holds the rows of all ranks. Rank 0 prints the resulting map.
scclResult_t Graph::calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph, Bootstrap* sccl_bootstrap) {
scclResult_t Graph::calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph) {
// Implementation of the communication-path calculation
std::cout << "Calculating communication paths..." << std::endl;
// Use the PathFinder class to search the hardware paths
auto path_finder = PathFinder(bootstrap_comm);
printf("calculateCommunicationPaths pos 1\n");
auto path_finder = PathFinder(bootstrap_comm, this->node_info_vec, this->node_info_total_bytes);
// Write the search result into topo_graph and record the valid nodes
SCCLCHECK(path_finder.computeTopoGpuP2pMap(topo_graph));
printf("calculateCommunicationPaths pos 2\n");
// topo_graph->printGPUPaths();
// Use the bootstrap object to all-gather the transport_map rows (nRanks bytes per rank)
uint8_t* local_transport_map = topo_graph->getTransportMapRowStart(rank);
SCCLCHECK(sccl_bootstrap->bootstrapAllGather(local_transport_map, topo_graph->transport_map.data(), nRanks * sizeof(uint8_t)));
printf("calculateCommunicationPaths pos 3\n");
#if 1
// Print the transport_map (rank 0 only)
if(bootstrap_comm->rank == 0) {
SCCLCHECK(topo_graph->printTransportMap());
}
#endif
return scclSuccess;
}
scclResult_t Graph::buildLogicalTopology() {
scclResult_t Graph::searchLogicalTopology() {
// 逻辑拓扑构建的实现
std::cout << "Building logical topology..." << std::endl;
// 具体的实现细节
......@@ -54,6 +172,251 @@ scclResult_t Graph::calculateTopoChannels() {
return scclSuccess;
}
////////////////////////////////////////////////// private //////////////////////////////////////////////////
// TODO: 当前实现使用了较多的for循环,在节点数量较大时速度较慢,可以考虑采用cuda kernel
/**
* @brief 初始化并连接节点之间的链接
*
* 该函数接收一个指向节点信息的字节数组的指针和节点信息的总字节数,用于初始化并连接节点之间的链接。
* 1.创建一个`ByteSpanArray`对象来管理节点信息的内存,然后根据节点的类型(GPU、PCI或NIC)将它们分类存储。
* 2.它使相同`interRank`下的GPU节点两两互联
* 3.遍历所有的`interRank`来合并具有相同`id`、`type`和`busIdStr`的PCI节点。
* 4.使CPU node即numa node的neighbors两两互联。
* 5.它使相同`deviceId`下的NIC节点两两互联。
*
* @param node_info_vec 指向节点信息的字节数组的指针
* @param node_info_total_bytes 节点信息的总字节数
* @return scclResult_t 返回操作结果状态码:
* - scclSuccess: 操作成功
* - scclError: 操作失败
*/
scclResult_t Graph::bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes) {
// 创建一个ByteSpanArray对象,用于管理节点信息的内存
ByteSpanArray<scclTopoNode_t> node_info_span(node_info_vec, nRanks * node_info_total_bytes);
// 用于将nodes的deviceId对应的node
std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nodes_map_by_deviceId;
// 用于将interRank内nodes的deviceSig对应的NIC节点连接
std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nic_nodes_by_deviceId;
// 用于识别并连接节点内的GPU node
std::vector<std::vector<scclTopoNode_t*>> gpu_nodes_by_interRank(nInterRanks);
// -------------------------- 1.遍历所有的节点信息,记录node -------------------------- //
for(size_t i = 0; i < node_info_span.size(); ++i) {
scclTopoNode_t* node = node_info_span[i];
// 跳过空节点、跳过没有busId的节点(如空节点或CPU)
if(node->type <= 0) {
continue;
}
uint64_t id = node->id;
int interRank;
physical_links::getIdComponents(id, &interRank);
uint64_t deviceSig = id & 0xFFFFFFFFFF; // 计算 interRank(24bit) + hipDev(8bit) + deviceId(16bit) + terminalType(8bit) + numaId(8bit)
// 选择type为GPU的节点
if(node->type == GPU) {
if(interRank >= gpu_nodes_by_interRank.size()) {
gpu_nodes_by_interRank.resize(interRank + 1);
}
gpu_nodes_by_interRank[interRank].push_back(node);
} else if(node->type == NIC) {
nic_nodes_by_deviceId[deviceSig].push_back(node);
}
nodes_map_by_deviceId[id].push_back(node);
}
// 合并id相同和busId相同的node
for(auto& pair : nodes_map_by_deviceId) {
auto& nodes = pair.second;
for(size_t i = 0; i < nodes.size(); ++i) {
for(size_t j = i + 1; j < nodes.size(); ++j) {
// if(nodes[i]->id == nodes[j]->id && nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) {
// SCCLCHECK(nodes[i]->combineNode(nodes[j]));
// }
if(nodes[i]->id == nodes[j]->id) {
if(nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) {
SCCLCHECK(nodes[i]->combineNode(nodes[j]));
} else {
#if 0
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(nodes[i]->busId, busIdStr_i);
int64ToBusId(nodes[j]->busId, busIdStr_j);
printf("same Id but different type or busId: %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, "
"H:%d, N:%d, busIdStr:%s)\n",
nodes[i]->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
nodes[j]->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
#endif
}
}
}
}
}
// 遍历所有的节点信息,将CPU的所有neighbor node两两互联
for(size_t i = 0; i < node_info_span.size(); ++i) {
scclTopoNode_t* node = node_info_span[i];
// 跳过空节点、跳过没有busId的节点(如空节点或CPU)
if(node->type == CPU) {
for(size_t i = 0; i < node->neighborCount; ++i) {
for(size_t j = i + 1; j < node->neighborCount; ++j) {
// 使用unordered_map来加速查找
auto it_i = nodes_map_by_deviceId.find(node->neighbors[i]);
auto it_j = nodes_map_by_deviceId.find(node->neighbors[j]);
if(it_i != nodes_map_by_deviceId.end() && it_j != nodes_map_by_deviceId.end()) {
scclTopoNode_t* neighbor_i = nullptr;
scclTopoNode_t* neighbor_j = nullptr;
for(auto& n : it_i->second) {
if(n->type > 0) {
neighbor_i = n;
break;
}
}
for(auto& n : it_j->second) {
if(n->type > 0) {
neighbor_j = n;
break;
}
}
if(neighbor_i && neighbor_j) {
neighbor_i->addNeighbor(neighbor_j->id);
neighbor_j->addNeighbor(neighbor_i->id);
#if 0
{
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(
neighbor_i->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(
neighbor_j->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(neighbor_i->busId, busIdStr_i);
int64ToBusId(neighbor_j->busId, busIdStr_j);
printf("connect CPU neighbors %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, "
"N:%d, busIdStr:%s)\n",
neighbor_i->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
neighbor_j->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
}
#endif
}
}
}
}
}
}
// 使相同interRank下的GPU node两两互联
for(const auto& nodes : gpu_nodes_by_interRank) {
for(size_t i = 0; i < nodes.size(); ++i) {
for(size_t j = i + 1; j < nodes.size(); ++j) {
nodes[i]->addNeighbor(nodes[j]->id);
nodes[j]->addNeighbor(nodes[i]->id);
#if 0
{
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(nodes[i]->busId, busIdStr_i);
int64ToBusId(nodes[j]->busId, busIdStr_j);
printf("connect GPU %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s)\n",
nodes[i]->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
nodes[j]->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
}
#endif
}
}
}
// 使相同deviceId下的NIC节点两两互联
for(const auto& pair : nic_nodes_by_deviceId) {
const auto& nodes = pair.second;
for(size_t i = 0; i < nodes.size(); ++i) {
for(size_t j = i + 1; j < nodes.size(); ++j) {
// 在deviceId相同的情况下,比较busIdStr
if(nodes[i]->busId == nodes[j]->busId) {
nodes[i]->addNeighbor(nodes[j]->id);
nodes[j]->addNeighbor(nodes[i]->id);
#if 0
{
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(nodes[i]->busId, busIdStr_i);
int64ToBusId(nodes[j]->busId, busIdStr_j);
printf("connect NIC interRank %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, "
"busIdStr:%s)\n",
nodes[i]->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
nodes[j]->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
}
#endif
}
}
}
}
return scclSuccess;
}
} // namespace graph
} // namespace topology
} // namespace hardware
......
......@@ -3,6 +3,7 @@
#include <vector>
#include "base.h"
#include "graph_utils.h"
#include "paths.h"
namespace sccl {
namespace hardware {
......@@ -11,23 +12,37 @@ namespace graph {
// Topology graph builder: declares the entry points for establishing the graph and for
// computing communication paths from the node information of all ranks.
// NOTE(review): this declaration lists two constructors and two calculateCommunicationPaths
// overloads side by side; confirm which of each pair is actually implemented.
class Graph {
public:
Graph(int rank, int nRanks);
Graph(const Bootstrap* bootstrap);
virtual ~Graph();
scclResult_t establishGraph(const BootstrapComm_t* bootstrap_comm);
// Communication path computation
scclResult_t calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph, Bootstrap* sccl_bootstrap);
scclResult_t calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph);
// Logical topology construction
scclResult_t buildLogicalTopology();
scclResult_t searchLogicalTopology();
// Compute topo paths from the undirected graph
scclResult_t calculateTopoChannels();
private:
// Extra processing of the link relationships between the nodes of the nRanks ranks
scclResult_t bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes);
private:
const Bootstrap* sccl_bootstrap; // Kept so that functions of class Bootstrap can be called
// Node information recorded for all ranks
std::vector<char> node_info_vec; // Logically a std::vector<scclNodeInfo_t>; stored as bytes because vector cannot hold the variable-length scclNodeInfo_t
size_t node_info_total_bytes = 0; // Actual size of the variable-length scclNodeInfo_t data
std::vector<std::vector<int>> adjacencyMatrix; // Graph represented as an adjacency matrix
// More private member variables and functions can be added here as needed
// Rank information
int rank, nRanks;
int localRank, nLocalRanks;
int interRank, nInterRanks; // Position of this whole node among all nodes
};
} // namespace graph
......
#include <string.h>
#include "graph_utils.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
/**
 * @brief Construct a topology graph with a zero-initialised nRanks x nRanks transport map.
 *
 * @param nRanks Number of ranks; the transport map holds one byte per (row, col) rank pair.
 * @throws std::bad_alloc if the transport map buffer cannot be allocated.
 */
scclTopoGraph::scclTopoGraph(int nRanks) : nRanks(nRanks), transport_map(nullptr, 0) {
    // Multiply in size_t: int * int is signed overflow (undefined behaviour) once
    // nRanks exceeds ~46k, which would request a bogus buffer size.
    const size_t entry_count = static_cast<size_t>(nRanks) * static_cast<size_t>(nRanks);

    // calloc gives a zero-filled buffer, so every entry starts at 0
    uint8_t* raw_transport_map = static_cast<uint8_t*>(calloc(entry_count, sizeof(uint8_t)));
    if(raw_transport_map == nullptr) {
        // Allocation failure is reported to the caller as an exception
        throw std::bad_alloc();
    }

    // Wrap the raw buffer in a ByteSpanArray; the destructor releases it with free()
    transport_map = ByteSpanArray<uint8_t>(raw_transport_map, entry_count);
}
/// Destructor: hands the transport_map buffer back to the C allocator.
scclTopoGraph::~scclTopoGraph() {
    auto* raw_transport_map = transport_map.data();
    free(raw_transport_map);
}
// 打印transport_map
/**
 * @brief Print the transport map to stdout, one row of the map per output line.
 *
 * Entries whose lookup yields a null pointer are printed as the text "nullptr".
 *
 * @return Always scclSuccess.
 */
scclResult_t scclTopoGraph::printTransportMap() {
    for(int row = 0; row < this->nRanks; ++row) {
        for(int col = 0; col < this->nRanks; ++col) {
            const uint8_t* entry = this->getTransportMapData(row, col);
            if(entry == nullptr) {
                printf("nullptr ");
                continue;
            }
            printf("%d ", *entry);
        }
        // End of this row
        printf("\n");
    }
    return scclSuccess;
}
// 打印gpu_paths信息的函数
/**
 * @brief Dump gpu_paths to stdout.
 *
 * For every (start node, end node) pair stored in gpu_paths, prints both endpoints and then
 * each recorded path node by node. Node ids that are missing from graph_nodes are reported;
 * a missing start or end node skips that entry entirely.
 *
 * @return Always scclSuccess.
 */
scclResult_t scclTopoGraph::printGPUPaths() {
    for(const auto& from_entry : gpu_paths) {
        const uint64_t from_id = from_entry.first;
        auto from_it           = graph_nodes.find(from_id);
        if(from_it == graph_nodes.end()) {
            std::cout << "Start node ID " << from_id << " not found in graph nodes." << std::endl;
            continue;
        }
        std::cout << "Paths starting from node: ";
        from_it->second.printNodeInfo("Start Node");

        for(const auto& to_entry : from_entry.second) {
            const uint64_t to_id = to_entry.first;
            auto to_it           = graph_nodes.find(to_id);
            if(to_it == graph_nodes.end()) {
                std::cout << " End node ID " << to_id << " not found in graph nodes." << std::endl;
                continue;
            }
            std::cout << " to node: ";
            to_it->second.printNodeInfo("End Node");

            std::cout << " Paths:" << std::endl;
            for(const auto& hop_list : to_entry.second) {
                std::cout << " Path: ";
                for(const auto& hop_id : hop_list) {
                    auto hop_it = graph_nodes.find(hop_id);
                    if(hop_it == graph_nodes.end()) {
                        std::cout << " Node ID " << hop_id << " not found in graph nodes." << std::endl;
                    } else {
                        hop_it->second.printNodeInfo(" ");
                    }
                }
                std::cout << std::endl;
            }
        }
    }
    return scclSuccess;
}
} // namespace graph
} // namespace topology
} // namespace hardware
} // namespace sccl
......@@ -3,16 +3,41 @@
#include <string.h>
#include "base.h"
#include "bootstrap.h"
#include "physical_links.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
typedef bootstrap::physical_links::scclTopoNode_t scclTopoNode_t;
typedef bootstrap::scclNodeInfo_t scclNodeInfo_t;
typedef physical_links::scclTopoNode_t scclTopoNode_t;
typedef bootstrap::BootstrapComm_t BootstrapComm_t;
typedef topology::bootstrap::Bootstrap Bootstrap;
// 定义结构体 scclNodeInfo,用于存储每个rank的图连接信息
// TODO: 目前每个rank需要的node_info大小为4k+,当卡数较大时占用内存较大,可以优化。或者不作为全局变量
typedef struct scclNodeInfo {
scclTopoNode_t* nodes; // 指向scclTopoNode_t对象数组的指针
int nLocalRanks;
int totalByteSize; // 表示占用的总字节数
// 带参数的构造函数,用于初始化nodes的大小
scclNodeInfo(int nLocalRanks) : nodes(nullptr), nLocalRanks(nLocalRanks), totalByteSize(sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks) {
nodes = reinterpret_cast<scclTopoNode_t*>(malloc(totalByteSize));
if(nodes) {
memset(nodes, 0, totalByteSize);
}
}
// 析构函数,用于释放申请的数组空间
virtual ~scclNodeInfo() {
if(nodes) {
free(nodes);
}
}
} scclNodeInfo_t;
//////////////////////////////////////////////////////////////////////////////////////////////////
// 定义 topoPathType_t 枚举类型,用于表示不同的路径类型。
typedef enum topoPathType {
PATH_LOC = 0, // 本地路径
......@@ -20,7 +45,7 @@ typedef enum topoPathType {
PATH_NVB = 2, // 通过中间 GPU 使用 NVLink 连接
PATH_PIX = 3, // 通过最多一个 PCIe 桥连接
PATH_PXB = 4, // 通过多个 PCIe 桥连接(不经过 PCIe 主桥)
PATH_PXN = 5, // GPU 和 NIC 之间通过中间 GPU 连接
PATH_PXN = 5, // GPU 和 NIC 之间通过中间 GPU 连接, PXN = PCI + NVLink
PATH_PHB = 6, // 通过 PCIe 以及 PCIe 主桥连接
PATH_SYS = 7, // 通过 PCIe 以及 NUMA 节点之间的 SMP 互连连接
PATH_NET = 8, // 通过网络连接
......@@ -39,44 +64,22 @@ typedef enum LinkType : uint8_t {
typedef struct scclTopoGraph {
scclTopoGraph() = delete; // 删除默认构造函数
scclTopoGraph(int nRanks) : nRanks(nRanks), transport_map(nullptr, 0) {
// 分配transport_map的内存
uint8_t* raw_transport_map = static_cast<uint8_t*>(calloc(nRanks * nRanks, sizeof(uint8_t)));
if(raw_transport_map == nullptr) {
// 处理内存分配失败的情况
throw std::bad_alloc();
}
// 使用ByteSpanArray初始化transport_map
transport_map = ByteSpanArray<uint8_t>(raw_transport_map, nRanks * nRanks);
}
virtual ~scclTopoGraph() {
// 释放transport_map的内存
free(transport_map.data());
}
scclTopoGraph(int nRanks);
virtual ~scclTopoGraph();
uint8_t* getTransportMapRowStart(int row) { return transport_map[row * nRanks]; }
uint8_t* getTransportMapData(int row, int col) { return transport_map[row * nRanks + col]; }
// 打印transport_map
scclResult_t printTransportMap() {
for(int i = 0; i < this->nRanks; ++i) {
for(int j = 0; j < this->nRanks; ++j) {
uint8_t* value = this->getTransportMapData(i, j);
if(value != nullptr) {
printf("%d ", *value);
} else {
printf("nullptr ");
}
}
printf("\n");
}
return scclSuccess;
}
scclResult_t printTransportMap();
// 打印gpu_paths信息的函数
scclResult_t printGPUPaths();
public:
// 使用无序映射存储图的有效节点
std::unordered_map<uint64_t, scclTopoNode_t> graph_nodes;
// 使用无序映射存储从每个GPU节点到其他GPU节点的所有路径,[start_node_id][end_node_id] = {path1, path2}
// 使用无序映射存储从每个GPU节点到其他GPU节点的所有路径,[start_node_id][end_node_id] = {path1, path2, ...}
std::unordered_map<uint64_t, std::unordered_map<uint64_t, std::vector<std::vector<uint64_t>>>> gpu_paths;
// 传输位图
......
......@@ -6,15 +6,15 @@ namespace hardware {
namespace topology {
namespace graph {
PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm, std::vector<char>& node_info_vec, size_t node_info_total_bytes)
: rank(bootstrap_comm->rank),
nRanks(bootstrap_comm->nRanks),
localRank(bootstrap_comm->localRank),
nLocalRanks(bootstrap_comm->nLocalRanks),
interRank(bootstrap_comm->interRank),
nInterRanks(bootstrap_comm->nInterRanks),
node_container_(bootstrap_comm->rank_phys_set->node_info_vec.data(),
bootstrap_comm->nRanks * bootstrap_comm->rank_phys_set->node_info_total_bytes) { // 初始化NodeContainer对象
node_container_(node_info_vec.data(), bootstrap_comm->nRanks * node_info_total_bytes) { // 初始化NodeContainer对象
printf("get PathFinder, node_container_=%zu\n", node_container_.size());
for(size_t i = 0; i < node_container_.size(); ++i) {
scclTopoNode_t* node = node_container_[i];
......@@ -36,7 +36,7 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
const scclTopoNode_t* node = node_container_[index];
int interRank, deviceValue, terminalType, hipDev, numaId;
bootstrap::physical_links::getIdComponents(node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
char busIdStr[17];
int64ToBusId(node->busId, busIdStr);
printf("rank=%d, node=(InterRank:%d, V:%d, T:%d, H:%d, N:%d, type:%d, busIdStr:%s), neighbor_count=%zu",
......@@ -54,7 +54,7 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
uint64_t neighbor_id = node->neighbors[n];
const scclTopoNode_t* neighbor_node = findNodeById(neighbor_id);
if(neighbor_node) {
bootstrap::physical_links::getIdComponents(neighbor_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(neighbor_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
int64ToBusId(neighbor_node->busId, busIdStr);
printf(", neighbor[%d]=(InterRank:%d, V:%d, T:%d, H:%d, N:%d, type:%d, busIdStr:%s)",
......@@ -75,10 +75,36 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
}
#endif
// 查找当前rank对应的其他GPU节点的所有路径
printf("PathFinder pos 1\n");
findGpuPaths();
printf("PathFinder pos 2\n");
// 查找当前rank对应的GPU的node,并执行BFS搜索,查找到其他所有GPU node的路径
for(const auto& pair : id_to_index_) {
uint64_t id = pair.first;
size_t index = pair.second;
// 定位到node
scclTopoNode_t* node = node_container_[index];
int nodeInterRank, nodeHipDev;
physical_links::getIdComponents(node->id, &nodeInterRank, nullptr, nullptr, &nodeHipDev, nullptr);
if(node->type == GPU && nodeInterRank == this->interRank && nodeHipDev == this->localRank) {
// printf("bfsFindGpuPaths start_node_id=%lu, running\n", node->id);
bfsFindGpuPaths(node->id);
}
}
#if 1
if(rank == 1) {
printGpuPaths();
}
#endif
}
/**
 * @brief Derive a GPU's rank from its topology node id.
 *
 * Decomposes the id into interRank and hipDev and returns interRank * nLocalRanks + hipDev.
 * The decomposed values and the result are also printed to stdout.
 *
 * @param node_id     Topology node id to decompose.
 * @param nLocalRanks Multiplier applied to the interRank component.
 * @return The computed GPU rank.
 */
int getGpuRankFromNodeId(uint64_t node_id, int nLocalRanks) {
    int host_index, device_index;
    // Only the interRank and hipDev components are requested; the others are passed as nullptr
    physical_links::getIdComponents(node_id, &host_index, nullptr, nullptr, &device_index, nullptr);

    const int computed_rank = host_index * nLocalRanks + device_index;
    printf("node_id=%lu, interRank=%d, hipDev=%d, gpu_rank=%d\n", node_id, host_index, device_index, computed_rank);
    return computed_rank;
}
/**
......@@ -124,48 +150,61 @@ scclResult_t PathFinder::computeTopoGpuP2pMap(scclTopoGraph_t* topo_graph) {
// 记录bitmap
LinkType_t link_type;
int start_gpu_rank, end_gpu_rank;
{
// 根据路径中途径的节点点确定连接方式的类型
SCCLCHECK(determineLinkType(path, &link_type));
int start_interRank, start_hipDev;
int end_interRank, end_hipDev;
bootstrap::physical_links::getIdComponents(start_node_id, &start_interRank, nullptr, nullptr, &start_hipDev, nullptr);
bootstrap::physical_links::getIdComponents(end_node_id, &end_interRank, nullptr, nullptr, &end_hipDev, nullptr);
// 根据路径中途径的节点点确定连接方式的类型
SCCLCHECK(determineLinkType(path, &link_type));
// 获取gpu的rank
int start_gpu_rank = getGpuRankFromNodeId(start_node_id, nLocalRanks);
int end_gpu_rank = getGpuRankFromNodeId(end_node_id, nLocalRanks);
start_gpu_rank = start_interRank * nLocalRanks + start_hipDev;
end_gpu_rank = end_interRank * nLocalRanks + end_hipDev;
#if 0
printf("rank=%d, interRank=%d, localRank=%d: start_interRank=%d, start_hipDev=%d, end_interRank=%d, end_hipDev=%d, link_type=%d\n",
rank,
interRank,
localRank,
start_interRank,
start_hipDev,
end_interRank,
end_hipDev,
static_cast<int>(link_type));
#endif
}
// 查找transport_map中的起始和结束节点
uint8_t* transport_map_pt = topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank);
// 将连接方式的类型存储在transport_map中
if(*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)) > 0 && link_type > 0) {
if(link_type < static_cast<LinkType_t>(*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)))) {
*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)) = link_type;
if(*transport_map_pt > 0 && link_type > 0) {
if(link_type < static_cast<LinkType_t>(*transport_map_pt)) {
*transport_map_pt = link_type;
// 清空之前的路径
topo_graph->gpu_paths[start_node_id][end_node_id].clear();
// 添加新的路径
topo_graph->gpu_paths[start_node_id][end_node_id].push_back(path);
} else if(link_type == static_cast<LinkType_t>(*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)))) {
} else if(link_type == static_cast<LinkType_t>(*transport_map_pt)) {
// 添加新的路径
topo_graph->gpu_paths[start_node_id][end_node_id].push_back(path);
}
} else {
*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)) = static_cast<uint8_t>(link_type);
*transport_map_pt = static_cast<uint8_t>(link_type);
// 添加新的路径
topo_graph->gpu_paths[start_node_id][end_node_id].push_back(path);
}
#if 0
{
char start_busIdStr[17] = ""; // 用于存储总线ID字符串
// 根据起始节点的ID查找对应的节点对象
const scclTopoNode_t* start_node = findNodeById(start_node_id);
// 如果找到了对应的节点对象,则将其总线ID转换为字符串
if(start_node) {
int64ToBusId(start_node->busId, start_busIdStr);
}
char end_busIdStr[17] = ""; // 用于存储总线ID字符串
// 根据起始节点的ID查找对应的节点对象
const scclTopoNode_t* end_node = findNodeById(end_node_id);
// 如果找到了对应的节点对象,则将其总线ID转换为字符串
if(end_node) {
int64ToBusId(end_node->busId, end_busIdStr);
}
printf("nLocalRanks=%d, start_node_id=%lu, busIdStr=%s, end_node_id=%lu, busIdStr=%s\n"
"start_gpu_rank: %d, end_gpu_rank: %d, link_type: %d, paths count: %zu\n",
nLocalRanks,
start_node_id,
start_busIdStr,
end_node_id,
end_busIdStr,
start_gpu_rank,
end_gpu_rank,
*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)),
topo_graph->gpu_paths[start_node_id][end_node_id].size());
}
#endif
}
}
......@@ -173,35 +212,6 @@ scclResult_t PathFinder::computeTopoGpuP2pMap(scclTopoGraph_t* topo_graph) {
}
/////////////////////////////////////////////////////////////////////////////////////////////
/**
* @brief 查找当前rank对应的其他GPU节点的所有路径
*
* 该函数用于查找当前rank对应的GPU节点的所有路径。它遍历`id_to_index_`中的所有节点ID和索引对,
* 对于每一个节点,如果该节点是GPU类型,并且属于当前rank的进程,则调用`bfsFindGpuPaths`函数执行广度优先搜索(BFS),
* 查找到其他所有GPU节点的路径。最后,如果当前rank为1,则调用`printGpuPaths`函数打印所有GPU路径。
*/
void PathFinder::findGpuPaths() {
// 查找当前rank对应的GPU的node,并执行BFS搜索,查找到其他所有GPU node的路径
for(const auto& pair : id_to_index_) {
uint64_t id = pair.first;
size_t index = pair.second;
// 定位到node
scclTopoNode_t* node = node_container_[index];
int nodeInterRank, nodeHipDev;
bootstrap::physical_links::getIdComponents(node->id, &nodeInterRank, nullptr, nullptr, &nodeHipDev, nullptr);
if(node->type == GPU && nodeInterRank == this->interRank && nodeHipDev == this->localRank) {
// printf("bfsFindGpuPaths start_node_id=%lu, running\n", node->id);
bfsFindGpuPaths(node->id);
}
}
#if 1
if(rank == 1) {
printGpuPaths();
}
#endif
}
/**
* @brief 根据节点ID查找节点
*
......@@ -231,7 +241,6 @@ const scclTopoNode_t* PathFinder::findNodeById(uint64_t id) const {
*
* @param start_node_id 起始GPU节点的ID
*/
#if 1
void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
// 使用一个队列来存储当前路径
std::queue<std::vector<uint64_t>> queue;
......@@ -259,14 +268,14 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
// 如果当前节点是GPU节点且不是起始节点,则将当前路径加入结果
if(current_node->type == GPU && nodeId != start_node_id) {
int hipDev;
bootstrap::physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
// 仅当节点内的device id小于等于nLocalRanks时,才是有效GPU,才将路径加入结果
if(hipDev < nLocalRanks) {
gpu_paths_[start_node_id].push_back(path);
}
} else {
int nodeInterRank;
bootstrap::physical_links::getIdComponents(nodeId, &nodeInterRank);
physical_links::getIdComponents(nodeId, &nodeInterRank);
// 遍历当前节点的所有邻居节点
for(uint64_t neighbor_id : graph_node_neighbors_.at(nodeId)) {
if(findNodeById(neighbor_id) == nullptr) {
......@@ -274,7 +283,7 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
}
// 获取邻居节点的interRank
int neighbor_inter_rank;
bootstrap::physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);
physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);
// 检查邻居节点是否已在当前路径中访问过
bool visited = std::find(path.begin(), path.end(), neighbor_id) != path.end();
......@@ -302,141 +311,6 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
}
}
#else
// Alternative BFS variant (sits in the #else branch): searches from start_node_id and records
// in gpu_paths_[start_node_id] every path that ends at another GPU node whose hipDev is below
// nLocalRanks. A path is only extended to a neighbor when the new path is shorter than the
// shortest one recorded for that neighbor so far.
void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
// Queue of partial paths still to be extended
std::queue<std::vector<uint64_t>> queue;
// Shortest path found so far for each node
std::unordered_map<uint64_t, std::vector<uint64_t>> shortest_paths;
// Seed the search with the start node
queue.push({start_node_id});
shortest_paths[start_node_id] = {start_node_id};
// Keep searching while the queue is not empty
while(!queue.empty()) {
// Take one path off the queue
auto path = queue.front();
queue.pop();
// Id of the last node on the current path
uint64_t nodeId = path.back();
// Look the node up by id
const scclTopoNode_t* current_node = findNodeById(nodeId);
if(current_node == nullptr) {
continue;
}
// A GPU node other than the start node terminates the path: record it as a result
if(current_node->type == GPU && nodeId != start_node_id) {
int hipDev;
bootstrap::physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
if(hipDev < nLocalRanks) {
gpu_paths_[start_node_id].push_back(path);
}
} else {
int nodeInterRank;
bootstrap::physical_links::getIdComponents(nodeId, &nodeInterRank);
// Visit every neighbor of the current node
for(uint64_t neighbor_id : graph_node_neighbors_.at(nodeId)) {
if(findNodeById(neighbor_id) == nullptr) {
continue;
}
// interRank of the neighbor
int neighbor_inter_rank;
bootstrap::physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);
// Has the neighbor already been visited on the current path?
bool visited = std::find(path.begin(), path.end(), neighbor_id) != path.end();
// Check whether the interRank already exists (only when the interRank changes)
// NOTE(review): this loop compares node ids, so it can only be true when `visited` is
// already true; presumably it was meant to compare interRank values — confirm.
bool inter_rank_exists = false;
if(neighbor_inter_rank != nodeInterRank) {
for(uint64_t node_id : path) {
if(node_id == neighbor_id) {
inter_rank_exists = true;
break;
}
}
}
// Extend the path if the neighbor is unvisited and its interRank is not yet present
if(!visited && !inter_rank_exists) {
std::vector<uint64_t> new_path = path;
new_path.push_back(neighbor_id);
// Only keep the new path if it is shorter than the shortest one recorded so far
if(shortest_paths.find(neighbor_id) == shortest_paths.end() || shortest_paths[neighbor_id].size() > new_path.size()) {
shortest_paths[neighbor_id] = new_path;
queue.push(new_path);
}
}
}
}
}
}
/**
 * @brief Breadth-first search from a GPU node to the other GPU nodes (exhaustive variant).
 *
 * Every path that ends at a GPU node other than the start node, and whose hipDev is below
 * nLocalRanks, is appended to gpu_paths_[start_node_id]. A path is extended to a neighbor only
 * if that neighbor is not already on the path and, when the hop changes interRank, no node
 * already on the path has the neighbor's interRank.
 *
 * @param start_node_id Id of the GPU node the search starts from.
 */
void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
    // Queue of partial paths still to be extended
    std::queue<std::vector<uint64_t>> queue;
    queue.push({start_node_id});

    while(!queue.empty()) {
        auto path = queue.front();
        queue.pop();

        // The last node on the path is the one being expanded
        uint64_t nodeId                    = path.back();
        const scclTopoNode_t* current_node = findNodeById(nodeId);
        if(current_node == nullptr) {
            continue;
        }

        // A GPU node other than the start node terminates the path
        if(current_node->type == GPU && nodeId != start_node_id) {
            int hipDev;
            bootstrap::physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
            if(hipDev < nLocalRanks) {
                gpu_paths_[start_node_id].push_back(path);
            }
            continue;
        }

        int nodeInterRank;
        bootstrap::physical_links::getIdComponents(nodeId, &nodeInterRank);

        for(uint64_t neighbor_id : graph_node_neighbors_.at(nodeId)) {
            // Skip neighbors that are not known nodes. This must test neighbor_id:
            // testing nodeId here is always false because current_node was checked above.
            if(findNodeById(neighbor_id) == nullptr) {
                continue;
            }

            int neighbor_inter_rank;
            bootstrap::physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);

            // Has the neighbor already been visited on the current path?
            bool visited = std::find(path.begin(), path.end(), neighbor_id) != path.end();

            // When the hop changes interRank, reject it if the path already contains a node
            // with the neighbor's interRank. Each path node's own interRank has to be decoded
            // for this; comparing the current node's interRank can never match inside this branch.
            bool inter_rank_exists = false;
            if(neighbor_inter_rank != nodeInterRank) {
                for(uint64_t node_id : path) {
                    int path_inter_rank;
                    bootstrap::physical_links::getIdComponents(node_id, &path_inter_rank);
                    if(path_inter_rank == neighbor_inter_rank) {
                        inter_rank_exists = true;
                        break;
                    }
                }
            }

            if(!visited && !inter_rank_exists) {
                std::vector<uint64_t> new_path = path;
                new_path.push_back(neighbor_id);
                queue.push(new_path);
            }
        }
    }
}
#endif
/**
* @brief 打印GPU路径信息
*
......@@ -463,7 +337,7 @@ void PathFinder::printGpuPaths() {
int interRank, deviceValue, terminalType, hipDev, numaId;
// 根据起始节点的ID获取其interRank、deviceValue、terminalType和numaId
bootstrap::physical_links::getIdComponents(start_node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(start_node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
printf("GPU node ID:%lu (InterRank:%d, V:%d, T:%d, H:%d, N:%d) (Path count: %zu)\n",
start_node_id,
interRank,
......@@ -486,7 +360,7 @@ void PathFinder::printGpuPaths() {
const scclTopoNode_t* node = findNodeById(node_id);
if(node) {
// 根据节点的ID获取其interRank、deviceValue、terminalType和numaId
bootstrap::physical_links::getIdComponents(node->id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(node->id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
// 将节点的总线ID转换为字符串
int64ToBusId(node->busId, busIdStr);
// 打印节点的信息,包括其interRank、deviceValue、terminalType、numaId、类型和总线ID字符串
......
......@@ -13,21 +13,21 @@ namespace hardware {
namespace topology {
namespace graph {
// 设置Path路径直接link的 bandwidth 和 speed
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class PathFinder {
public:
// 构造函数
PathFinder(const BootstrapComm_t* bootstrap_comm);
PathFinder(const BootstrapComm_t* bootstrap_comm, std::vector<char>& node_info_vec, size_t node_info_total_bytes);
// 计算拓扑图中GPU节点之间的点对点映射
scclResult_t computeTopoGpuP2pMap(scclTopoGraph_t* graph);
// 计算拓扑图中GPU节点之间的点对点映射,结果保存在graph中
scclResult_t computeTopoGpuP2pMap(scclTopoGraph_t* topo_graph);
// 打印函数
void printGpuPaths();
private:
// 获取所有GPU到GPU的路径函数
void findGpuPaths();
// 使用广度优先搜索(BFS)查找从起始GPU节点到其他GPU节点的最短路径
void bfsFindGpuPaths(uint64_t start_node_id);
......@@ -53,6 +53,9 @@ private:
int nInterRanks = 0; // 全局拥有节点的个数
};
// 根据 node_id 获取 gpu_rank
int getGpuRankFromNodeId(uint64_t node_id, int nLocalRanks);
} // namespace graph
} // namespace topology
} // namespace hardware
......
......@@ -4,7 +4,7 @@
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
namespace graph {
namespace physical_links {
constexpr int numaIdStrLen = 10;
......@@ -726,7 +726,7 @@ void printTopoNode(ByteSpanArray<scclTopoNode_t>& nodes, int nodeIndex, const ch
}
} // namespace physical_links
} // namespace bootstrap
} // namespace graph
} // namespace topology
} // namespace hardware
} // namespace sccl
\ No newline at end of file
......@@ -13,12 +13,14 @@
#include <filesystem> // 需要C++17支持
#include "container.h"
#include "bootstrap_utils.h"
#include "bootstrap.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
namespace graph {
typedef sccl::hardware::net::scclNet_t scclNet_t;
constexpr size_t topoNodeMaxLocalNodes = 128; // 每个节点最多的node数量
constexpr size_t topoNodeMaxNeighbors = 16; // 每个node最多neighbor数量
......@@ -70,7 +72,7 @@ scclResult_t generate_topo_nodes(const char* pciPath, int interRank, int hipDev,
// 根据numaId获取pci路径
std::string generate_topo_node_numa_info(int numaId);
// 输出id分解后的所有数据
// 输出node id分解后的所有数据
void getIdComponents(
uint64_t idToDecompose, int* interRank = nullptr, int* deviceValue = nullptr, int* terminalType = nullptr, int* hipDev = nullptr, int* numaId = nullptr);
......@@ -82,7 +84,7 @@ char* getNetPciPath(scclNet_t* scclNet, int hipDev);
void printTopoNode(ByteSpanArray<scclTopoNode_t>& nodes, int nodeIndex, const char* prefix);
} // namespace physical_links
} // namespace bootstrap
} // namespace graph
} // namespace topology
} // namespace hardware
} // namespace sccl
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment