Commit 58d57301 authored by lishen's avatar lishen
Browse files

将建图过程中原本在bootstrap中的一部分完全移动到graph中

parent 708aae12
......@@ -13,13 +13,14 @@ hipcc ./3_mpi_init_mpi_init_step2_graph.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/physical_links.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/thread_pool.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/physical_links.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/graph_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/graph.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/paths.cpp \
-o 3_mpi_init_mpi_init_step2_graph \
......
......@@ -39,15 +39,19 @@ scclResult_t sccl_init(const scclUniqueId* unique_id, int rank, int nRanks) {
SCCLCHECK(sccl_bootstrap->init(bootstrap_comm));
printf("init pos 1\n");
// -------------------------- 3.MPI 建图 ----------------------------------- //
// -------------------------- 3.拓扑建图 ----------------------------------- //
topo_graph = new scclTopoGraph_t(nRanks);
auto sccl_graph = std::make_unique<topology::graph::Graph>(rank, nRanks);
auto sccl_graph = std::make_unique<topology::graph::Graph>(sccl_bootstrap.get());
printf("init pos 2\n");
// 计算通信路径
SCCLCHECK(sccl_graph->calculateCommunicationPaths(bootstrap_comm, topo_graph, sccl_bootstrap.get()));
// 计算拓扑图
SCCLCHECK(sccl_graph->establishGraph(bootstrap_comm));
printf("init pos 3\n");
// 计算通信路径
SCCLCHECK(sccl_graph->calculateCommunicationPaths(bootstrap_comm, topo_graph));
printf("init pos 4\n");
// -------------------------- 3.MPI allgather设置unique_id的整合 ----------------------------------- //
// -------------------------- 5.根据各个节点的基础信息计算topo结果 ----------------------------------- //
......
......@@ -7,7 +7,6 @@
#include "bootstrap_net.h"
#include "thread_pool.h"
#include "ipc_socket.h"
#include "physical_links.h"
namespace sccl {
namespace hardware {
......@@ -15,7 +14,6 @@ namespace topology {
namespace bootstrap {
typedef sccl::hardware::net::ipc_socket::scclIpcSocket_t scclIpcSocket_t;
typedef physical_links::scclTopoNode_t scclTopoNode_t;
///////////////////////////////////// 用于初始化时的功能函数 //////////////////////////////////////////
scclResult_t bootstrapGetUniqueId(BootstrapHandle_t* handle);
......@@ -56,41 +54,13 @@ typedef struct scclRankInfo {
uint64_t pidHash = 0; // 进程 ID 哈希值
} scclRankInfo_t;
// Per-rank graph connectivity record: owns a zero-initialised array of topology nodes.
// TODO: each rank currently needs 4 KiB+ of node_info, which adds up when the number of
// devices is large; this could be optimised, or the data could stop being a global.
typedef struct scclNodeInfo {
    scclTopoNode_t* nodes; // Owning pointer to the scclTopoNode_t array (nullptr when nothing was allocated)
    int nLocalRanks;       // Value passed to the constructor
    int totalByteSize;     // Number of bytes allocated for `nodes`

    /**
     * @brief Allocate and zero the node array for one rank.
     *
     * The buffer holds sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks bytes.
     * A non-positive nLocalRanks yields an empty record (nodes == nullptr, totalByteSize == 0)
     * instead of dividing by zero. If the allocation itself fails, nodes stays nullptr.
     *
     * @param nLocalRanks divisor applied to the per-node budget
     */
    scclNodeInfo(int nLocalRanks)
        : nodes(nullptr),
          nLocalRanks(nLocalRanks),
          totalByteSize(nLocalRanks > 0 ? static_cast<int>(sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks) : 0) {
        if(totalByteSize > 0) {
            nodes = reinterpret_cast<scclTopoNode_t*>(malloc(totalByteSize));
            if(nodes) {
                memset(nodes, 0, totalByteSize);
            }
        }
    }

    // The struct owns `nodes`; a member-wise copy would free the same buffer twice.
    scclNodeInfo(const scclNodeInfo&)            = delete;
    scclNodeInfo& operator=(const scclNodeInfo&) = delete;

    // Release the node array (free(nullptr) is a no-op).
    virtual ~scclNodeInfo() { free(nodes); }
} scclNodeInfo_t;
// Information about all ranks.
// NOTE(review): this listing declares two constructors and `rank_info_vec` twice; it appears to
// interleave the pre- and post-change versions of the struct — confirm against the real header.
typedef struct scclRankPhysSet {
// Constructor declaration
scclRankPhysSet(int nRanks, int nLocalRanks);
std::vector<scclRankInfo_t> rank_info_vec;
std::vector<char> node_info_vec; // Logically a std::vector<scclNodeInfo_t>; stored as raw bytes because std::vector cannot hold the variable-length scclNodeInfo_t
scclRankPhysSet(int nRanks);
public:
int nRanks = 0; // Total number of ranks
int nLocalRanks = 0; // Total number of ranks on the local compute node
size_t node_info_total_bytes = 0; // Actual byte size of the variable-length scclNodeInfo_t data
std::vector<scclRankInfo_t> rank_info_vec;
} scclRankPhysSet_t;
// BootstrapComm 结构体定义,用于存储引导通信信息
......@@ -126,7 +96,7 @@ public:
scclResult_t init(BootstrapComm_t* bootstrap_comm);
// 实现跨节点的AllGather通信操作
scclResult_t bootstrapAllGather(const void* src_data, void* dst_data, int data_size);
scclResult_t bootstrapAllGather(const void* src_data, void* dst_data, int data_size) const;
private:
// 执行根节点的聚集和广播操作
......@@ -135,17 +105,12 @@ private:
// 初始化节点通信信息
scclResult_t bootstrapCommInitNodeInfo(scclNet_t* scclNet, scclRankInfo_t* rank_info);
// 实现rank_info信息的节点间通信的AllGather操作
scclResult_t bootstrapCommAllGather(scclRankInfo_t* rank_info, scclNodeInfo_t* node_info, scclRankPhysSet_t* rank_phys_set);
// 额外处理nRanks个nodes的连接关系
scclResult_t bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes);
private:
public:
int rank, nRanks; // 初始化阶段获取MPI的值
int localRank, nLocalRanks; // 通过bootstrapRootGatherAndBroadcast函数确定值
int interRank, nInterRanks; // 整个节点在全部节点中的位置
private:
// TODO: 用于控制套接字终端的变量,目前不知道在哪里使用
volatile uint32_t* abortFlag; // 中止标志,非阻塞套接字设置
......
This diff is collapsed.
......@@ -3,6 +3,7 @@
#include <vector>
#include "base.h"
#include "graph_utils.h"
#include "paths.h"
namespace sccl {
namespace hardware {
......@@ -11,23 +12,37 @@ namespace graph {
// Topology graph builder for one rank.
// NOTE(review): this listing shows two constructors and two calculateCommunicationPaths overloads;
// it appears to interleave the pre- and post-change declarations — confirm against the real header.
class Graph {
public:
Graph(int rank, int nRanks);
Graph(const Bootstrap* bootstrap);
virtual ~Graph();
// Build the topology graph
scclResult_t establishGraph(const BootstrapComm_t* bootstrap_comm);
// Compute the communication paths
scclResult_t calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph, Bootstrap* sccl_bootstrap);
scclResult_t calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph);
// Build the logical topology
scclResult_t buildLogicalTopology();
scclResult_t searchLogicalTopology();
// Compute topo paths from the undirected graph
scclResult_t calculateTopoChannels();
private:
// Extra processing of the link relationships between the nodes of all nRanks ranks
scclResult_t bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes);
private:
const Bootstrap* sccl_bootstrap; // Kept so that member functions of class Bootstrap can be called
// Node information of every rank
std::vector<char> node_info_vec; // Logically a std::vector<scclNodeInfo_t>; stored as raw bytes because std::vector cannot hold the variable-length scclNodeInfo_t
size_t node_info_total_bytes = 0; // Actual byte size of the variable-length scclNodeInfo_t data
std::vector<std::vector<int>> adjacencyMatrix; // Graph represented as an adjacency matrix
// More private members and functions can be added here as needed
// Rank information
int rank, nRanks;
int localRank, nLocalRanks;
int interRank, nInterRanks; // Position of this whole node among all nodes
};
} // namespace graph
......
#include <string.h>
#include "graph_utils.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
/**
 * @brief Construct a topology graph with a zero-filled nRanks x nRanks transport map.
 *
 * The map is one byte per (row, col) pair, allocated with calloc and released with
 * free() in the destructor.
 *
 * @param nRanks number of ranks; the transport map has nRanks * nRanks entries
 * @throws std::bad_alloc if the transport map cannot be allocated
 */
scclTopoGraph::scclTopoGraph(int nRanks) : nRanks(nRanks), transport_map(nullptr, 0) {
    // Widen before multiplying: nRanks * nRanks evaluated in int overflows (UB) for large rank counts.
    const size_t cell_count = static_cast<size_t>(nRanks) * static_cast<size_t>(nRanks);

    // Allocate the zero-initialised backing storage of the transport map.
    uint8_t* raw_transport_map = static_cast<uint8_t*>(calloc(cell_count, sizeof(uint8_t)));
    if(raw_transport_map == nullptr) {
        // Allocation failed: report it the same way operator new would.
        throw std::bad_alloc();
    }

    // Wrap the raw buffer in a ByteSpanArray view.
    transport_map = ByteSpanArray<uint8_t>(raw_transport_map, cell_count);
}
scclTopoGraph::~scclTopoGraph() {
    // Hand the buffer backing transport_map back to the C allocator.
    auto* raw_transport_map = transport_map.data();
    free(raw_transport_map);
}
/**
 * @brief Print the transport map to stdout, one row per line.
 *
 * Each entry is printed as a decimal number followed by a space; an entry whose
 * lookup yields a null pointer is printed as "nullptr ".
 *
 * @return scclSuccess
 */
scclResult_t scclTopoGraph::printTransportMap() {
    for(int row = 0; row < nRanks; ++row) {
        for(int col = 0; col < nRanks; ++col) {
            const uint8_t* cell = getTransportMapData(row, col);
            if(cell == nullptr) {
                printf("nullptr ");
                continue;
            }
            printf("%d ", *cell);
        }
        printf("\n");
    }
    return scclSuccess;
}
/**
 * @brief Print every entry of gpu_paths to std::cout.
 *
 * For each start node, then each end node, all recorded paths are listed node by node.
 * Node ids that are missing from graph_nodes are reported; a missing start or end node
 * skips the corresponding entry.
 *
 * @return scclSuccess
 */
scclResult_t scclTopoGraph::printGPUPaths() {
    for(const auto& [src_id, paths_by_dst] : gpu_paths) {
        auto src_it = graph_nodes.find(src_id);
        if(src_it == graph_nodes.end()) {
            std::cout << "Start node ID " << src_id << " not found in graph nodes." << std::endl;
            continue;
        }
        std::cout << "Paths starting from node: ";
        src_it->second.printNodeInfo("Start Node");

        for(const auto& [dst_id, path_list] : paths_by_dst) {
            auto dst_it = graph_nodes.find(dst_id);
            if(dst_it == graph_nodes.end()) {
                std::cout << " End node ID " << dst_id << " not found in graph nodes." << std::endl;
                continue;
            }
            std::cout << " to node: ";
            dst_it->second.printNodeInfo("End Node");

            std::cout << " Paths:" << std::endl;
            for(const auto& hops : path_list) {
                std::cout << " Path: ";
                for(const auto& hop_id : hops) {
                    auto hop_it = graph_nodes.find(hop_id);
                    if(hop_it == graph_nodes.end()) {
                        std::cout << " Node ID " << hop_id << " not found in graph nodes." << std::endl;
                    } else {
                        hop_it->second.printNodeInfo(" ");
                    }
                }
                std::cout << std::endl;
            }
        }
    }
    return scclSuccess;
}
} // namespace graph
} // namespace topology
} // namespace hardware
} // namespace sccl
......@@ -3,16 +3,41 @@
#include <string.h>
#include "base.h"
#include "bootstrap.h"
#include "physical_links.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
typedef bootstrap::physical_links::scclTopoNode_t scclTopoNode_t;
typedef bootstrap::scclNodeInfo_t scclNodeInfo_t;
typedef physical_links::scclTopoNode_t scclTopoNode_t;
typedef bootstrap::BootstrapComm_t BootstrapComm_t;
typedef topology::bootstrap::Bootstrap Bootstrap;
// Per-rank graph connectivity record: owns a zero-initialised array of topology nodes.
// TODO: each rank currently needs 4 KiB+ of node_info, which adds up when the number of
// devices is large; this could be optimised, or the data could stop being a global.
typedef struct scclNodeInfo {
    scclTopoNode_t* nodes; // Owning pointer to the scclTopoNode_t array (nullptr when nothing was allocated)
    int nLocalRanks;       // Value passed to the constructor
    int totalByteSize;     // Number of bytes allocated for `nodes`

    /**
     * @brief Allocate and zero the node array for one rank.
     *
     * The buffer holds sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks bytes.
     * A non-positive nLocalRanks yields an empty record (nodes == nullptr, totalByteSize == 0)
     * instead of dividing by zero. If the allocation itself fails, nodes stays nullptr.
     *
     * @param nLocalRanks divisor applied to the per-node budget
     */
    scclNodeInfo(int nLocalRanks)
        : nodes(nullptr),
          nLocalRanks(nLocalRanks),
          totalByteSize(nLocalRanks > 0 ? static_cast<int>(sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks) : 0) {
        if(totalByteSize > 0) {
            nodes = reinterpret_cast<scclTopoNode_t*>(malloc(totalByteSize));
            if(nodes) {
                memset(nodes, 0, totalByteSize);
            }
        }
    }

    // The struct owns `nodes`; a member-wise copy would free the same buffer twice.
    scclNodeInfo(const scclNodeInfo&)            = delete;
    scclNodeInfo& operator=(const scclNodeInfo&) = delete;

    // Release the node array (free(nullptr) is a no-op).
    virtual ~scclNodeInfo() { free(nodes); }
} scclNodeInfo_t;
//////////////////////////////////////////////////////////////////////////////////////////////////
// 定义 topoPathType_t 枚举类型,用于表示不同的路径类型。
typedef enum topoPathType {
PATH_LOC = 0, // 本地路径
......@@ -20,7 +45,7 @@ typedef enum topoPathType {
PATH_NVB = 2, // 通过中间 GPU 使用 NVLink 连接
PATH_PIX = 3, // 通过最多一个 PCIe 桥连接
PATH_PXB = 4, // 通过多个 PCIe 桥连接(不经过 PCIe 主桥)
PATH_PXN = 5, // GPU 和 NIC 之间通过中间 GPU 连接
PATH_PXN = 5, // GPU 和 NIC 之间通过中间 GPU 连接, PXN = PCI + NVLink
PATH_PHB = 6, // 通过 PCIe 以及 PCIe 主桥连接
PATH_SYS = 7, // 通过 PCIe 以及 NUMA 节点之间的 SMP 互连连接
PATH_NET = 8, // 通过网络连接
......@@ -39,44 +64,22 @@ typedef enum LinkType : uint8_t {
typedef struct scclTopoGraph {
scclTopoGraph() = delete; // 删除默认构造函数
scclTopoGraph(int nRanks) : nRanks(nRanks), transport_map(nullptr, 0) {
// 分配transport_map的内存
uint8_t* raw_transport_map = static_cast<uint8_t*>(calloc(nRanks * nRanks, sizeof(uint8_t)));
if(raw_transport_map == nullptr) {
// 处理内存分配失败的情况
throw std::bad_alloc();
}
// 使用ByteSpanArray初始化transport_map
transport_map = ByteSpanArray<uint8_t>(raw_transport_map, nRanks * nRanks);
}
virtual ~scclTopoGraph() {
// 释放transport_map的内存
free(transport_map.data());
}
scclTopoGraph(int nRanks);
virtual ~scclTopoGraph();
uint8_t* getTransportMapRowStart(int row) { return transport_map[row * nRanks]; }
uint8_t* getTransportMapData(int row, int col) { return transport_map[row * nRanks + col]; }
// 打印transport_map
scclResult_t printTransportMap() {
for(int i = 0; i < this->nRanks; ++i) {
for(int j = 0; j < this->nRanks; ++j) {
uint8_t* value = this->getTransportMapData(i, j);
if(value != nullptr) {
printf("%d ", *value);
} else {
printf("nullptr ");
}
}
printf("\n");
}
return scclSuccess;
}
scclResult_t printTransportMap();
// 打印gpu_paths信息的函数
scclResult_t printGPUPaths();
public:
// 使用无序映射存储图的有效节点
std::unordered_map<uint64_t, scclTopoNode_t> graph_nodes;
// 使用无序映射存储从每个GPU节点到其他GPU节点的所有路径,[start_node_id][end_node_id] = {path1, path2}
// 使用无序映射存储从每个GPU节点到其他GPU节点的所有路径,[start_node_id][end_node_id] = {path1, path2, ...}
std::unordered_map<uint64_t, std::unordered_map<uint64_t, std::vector<std::vector<uint64_t>>>> gpu_paths;
// 传输位图
......
......@@ -6,15 +6,15 @@ namespace hardware {
namespace topology {
namespace graph {
PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm, std::vector<char>& node_info_vec, size_t node_info_total_bytes)
: rank(bootstrap_comm->rank),
nRanks(bootstrap_comm->nRanks),
localRank(bootstrap_comm->localRank),
nLocalRanks(bootstrap_comm->nLocalRanks),
interRank(bootstrap_comm->interRank),
nInterRanks(bootstrap_comm->nInterRanks),
node_container_(bootstrap_comm->rank_phys_set->node_info_vec.data(),
bootstrap_comm->nRanks * bootstrap_comm->rank_phys_set->node_info_total_bytes) { // 初始化NodeContainer对象
node_container_(node_info_vec.data(), bootstrap_comm->nRanks * node_info_total_bytes) { // 初始化NodeContainer对象
printf("get PathFinder, node_container_=%zu\n", node_container_.size());
for(size_t i = 0; i < node_container_.size(); ++i) {
scclTopoNode_t* node = node_container_[i];
......@@ -36,7 +36,7 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
const scclTopoNode_t* node = node_container_[index];
int interRank, deviceValue, terminalType, hipDev, numaId;
bootstrap::physical_links::getIdComponents(node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
char busIdStr[17];
int64ToBusId(node->busId, busIdStr);
printf("rank=%d, node=(InterRank:%d, V:%d, T:%d, H:%d, N:%d, type:%d, busIdStr:%s), neighbor_count=%zu",
......@@ -54,7 +54,7 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
uint64_t neighbor_id = node->neighbors[n];
const scclTopoNode_t* neighbor_node = findNodeById(neighbor_id);
if(neighbor_node) {
bootstrap::physical_links::getIdComponents(neighbor_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(neighbor_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
int64ToBusId(neighbor_node->busId, busIdStr);
printf(", neighbor[%d]=(InterRank:%d, V:%d, T:%d, H:%d, N:%d, type:%d, busIdStr:%s)",
......@@ -75,10 +75,36 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
}
#endif
// 查找当前rank对应的其他GPU节点的所有路径
printf("PathFinder pos 1\n");
findGpuPaths();
printf("PathFinder pos 2\n");
// 查找当前rank对应的GPU的node,并执行BFS搜索,查找到其他所有GPU node的路径
for(const auto& pair : id_to_index_) {
uint64_t id = pair.first;
size_t index = pair.second;
// 定位到node
scclTopoNode_t* node = node_container_[index];
int nodeInterRank, nodeHipDev;
physical_links::getIdComponents(node->id, &nodeInterRank, nullptr, nullptr, &nodeHipDev, nullptr);
if(node->type == GPU && nodeInterRank == this->interRank && nodeHipDev == this->localRank) {
// printf("bfsFindGpuPaths start_node_id=%lu, running\n", node->id);
bfsFindGpuPaths(node->id);
}
}
#if 1
if(rank == 1) {
printGpuPaths();
}
#endif
}
/**
 * @brief Derive a GPU rank from a topology node id.
 *
 * Decomposes the id with physical_links::getIdComponents and combines the two
 * components as interRank * nLocalRanks + hipDev. The result is also logged to stdout.
 *
 * @param node_id     topology node id to decompose
 * @param nLocalRanks multiplier applied to the interRank component
 * @return interRank * nLocalRanks + hipDev
 */
int getGpuRankFromNodeId(uint64_t node_id, int nLocalRanks) {
    // Zero-initialise so the values are defined even if the decomposition leaves an output untouched.
    int interRank = 0;
    int hipDev    = 0;
    physical_links::getIdComponents(node_id, &interRank, nullptr, nullptr, &hipDev, nullptr);

    const int gpu_rank = interRank * nLocalRanks + hipDev;

    // uint64_t is not `unsigned long` on every platform; cast explicitly so format and argument always agree.
    printf("node_id=%llu, interRank=%d, hipDev=%d, gpu_rank=%d\n", static_cast<unsigned long long>(node_id), interRank, hipDev, gpu_rank);
    return gpu_rank;
}
/**
......@@ -124,48 +150,61 @@ scclResult_t PathFinder::computeTopoGpuP2pMap(scclTopoGraph_t* topo_graph) {
// 记录bitmap
LinkType_t link_type;
int start_gpu_rank, end_gpu_rank;
{
// 根据路径中途径的节点点确定连接方式的类型
SCCLCHECK(determineLinkType(path, &link_type));
int start_interRank, start_hipDev;
int end_interRank, end_hipDev;
bootstrap::physical_links::getIdComponents(start_node_id, &start_interRank, nullptr, nullptr, &start_hipDev, nullptr);
bootstrap::physical_links::getIdComponents(end_node_id, &end_interRank, nullptr, nullptr, &end_hipDev, nullptr);
// 根据路径中途径的节点点确定连接方式的类型
SCCLCHECK(determineLinkType(path, &link_type));
// 获取gpu的rank
int start_gpu_rank = getGpuRankFromNodeId(start_node_id, nLocalRanks);
int end_gpu_rank = getGpuRankFromNodeId(end_node_id, nLocalRanks);
start_gpu_rank = start_interRank * nLocalRanks + start_hipDev;
end_gpu_rank = end_interRank * nLocalRanks + end_hipDev;
#if 0
printf("rank=%d, interRank=%d, localRank=%d: start_interRank=%d, start_hipDev=%d, end_interRank=%d, end_hipDev=%d, link_type=%d\n",
rank,
interRank,
localRank,
start_interRank,
start_hipDev,
end_interRank,
end_hipDev,
static_cast<int>(link_type));
#endif
}
// 查找transport_map中的起始和结束节点
uint8_t* transport_map_pt = topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank);
// 将连接方式的类型存储在transport_map中
if(*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)) > 0 && link_type > 0) {
if(link_type < static_cast<LinkType_t>(*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)))) {
*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)) = link_type;
if(*transport_map_pt > 0 && link_type > 0) {
if(link_type < static_cast<LinkType_t>(*transport_map_pt)) {
*transport_map_pt = link_type;
// 清空之前的路径
topo_graph->gpu_paths[start_node_id][end_node_id].clear();
// 添加新的路径
topo_graph->gpu_paths[start_node_id][end_node_id].push_back(path);
} else if(link_type == static_cast<LinkType_t>(*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)))) {
} else if(link_type == static_cast<LinkType_t>(*transport_map_pt)) {
// 添加新的路径
topo_graph->gpu_paths[start_node_id][end_node_id].push_back(path);
}
} else {
*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)) = static_cast<uint8_t>(link_type);
*transport_map_pt = static_cast<uint8_t>(link_type);
// 添加新的路径
topo_graph->gpu_paths[start_node_id][end_node_id].push_back(path);
}
#if 0
{
char start_busIdStr[17] = ""; // 用于存储总线ID字符串
// 根据起始节点的ID查找对应的节点对象
const scclTopoNode_t* start_node = findNodeById(start_node_id);
// 如果找到了对应的节点对象,则将其总线ID转换为字符串
if(start_node) {
int64ToBusId(start_node->busId, start_busIdStr);
}
char end_busIdStr[17] = ""; // 用于存储总线ID字符串
// 根据起始节点的ID查找对应的节点对象
const scclTopoNode_t* end_node = findNodeById(end_node_id);
// 如果找到了对应的节点对象,则将其总线ID转换为字符串
if(end_node) {
int64ToBusId(end_node->busId, end_busIdStr);
}
printf("nLocalRanks=%d, start_node_id=%lu, busIdStr=%s, end_node_id=%lu, busIdStr=%s\n"
"start_gpu_rank: %d, end_gpu_rank: %d, link_type: %d, paths count: %zu\n",
nLocalRanks,
start_node_id,
start_busIdStr,
end_node_id,
end_busIdStr,
start_gpu_rank,
end_gpu_rank,
*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)),
topo_graph->gpu_paths[start_node_id][end_node_id].size());
}
#endif
}
}
......@@ -173,35 +212,6 @@ scclResult_t PathFinder::computeTopoGpuP2pMap(scclTopoGraph_t* topo_graph) {
}
/////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Find all paths from this rank's GPU node to the other GPU nodes.
 *
 * Walks every (node id, index) pair in id_to_index_. For each node that is of type GPU
 * and whose decomposed interRank / hipDev match this object's interRank / localRank,
 * bfsFindGpuPaths is run with that node as the start. Afterwards, when rank == 1,
 * the collected paths are printed with printGpuPaths.
 */
void PathFinder::findGpuPaths() {
    for(const auto& pair : id_to_index_) {
        // Locate the node through its container index; skip entries the container cannot resolve.
        scclTopoNode_t* node = node_container_[pair.second];
        if(node == nullptr) {
            continue;
        }

        int nodeInterRank = 0;
        int nodeHipDev    = 0;
        bootstrap::physical_links::getIdComponents(node->id, &nodeInterRank, nullptr, nullptr, &nodeHipDev, nullptr);

        const bool is_own_gpu = node->type == GPU && nodeInterRank == this->interRank && nodeHipDev == this->localRank;
        if(is_own_gpu) {
            // printf("bfsFindGpuPaths start_node_id=%lu, running\n", node->id);
            bfsFindGpuPaths(node->id);
        }
    }
#if 1
    if(rank == 1) {
        printGpuPaths();
    }
#endif
}
/**
* @brief 根据节点ID查找节点
*
......@@ -231,7 +241,6 @@ const scclTopoNode_t* PathFinder::findNodeById(uint64_t id) const {
*
* @param start_node_id 起始GPU节点的ID
*/
#if 1
void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
// 使用一个队列来存储当前路径
std::queue<std::vector<uint64_t>> queue;
......@@ -259,14 +268,14 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
// 如果当前节点是GPU节点且不是起始节点,则将当前路径加入结果
if(current_node->type == GPU && nodeId != start_node_id) {
int hipDev;
bootstrap::physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
// 仅当节点内的device id小于等于nLocalRanks时,才是有效GPU,才将路径加入结果
if(hipDev < nLocalRanks) {
gpu_paths_[start_node_id].push_back(path);
}
} else {
int nodeInterRank;
bootstrap::physical_links::getIdComponents(nodeId, &nodeInterRank);
physical_links::getIdComponents(nodeId, &nodeInterRank);
// 遍历当前节点的所有邻居节点
for(uint64_t neighbor_id : graph_node_neighbors_.at(nodeId)) {
if(findNodeById(neighbor_id) == nullptr) {
......@@ -274,7 +283,7 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
}
// 获取邻居节点的interRank
int neighbor_inter_rank;
bootstrap::physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);
physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);
// 检查邻居节点是否已在当前路径中访问过
bool visited = std::find(path.begin(), path.end(), neighbor_id) != path.end();
......@@ -302,141 +311,6 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
}
}
#else
/**
 * @brief Breadth-first search from a start GPU node that keeps, per node, only the first
 *        shortest path found (variant located in an `#else` preprocessor branch).
 *
 * Paths ending at a GPU node other than the start are appended to gpu_paths_[start_node_id]
 * when that GPU's hipDev is below nLocalRanks.
 *
 * NOTE(review): a second definition of this same function follows in the same `#else`
 * branch; enabling the branch as-is would presumably fail with a redefinition error — confirm.
 *
 * @param start_node_id id of the GPU node the search starts from
 */
void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
// Queue of partial paths still to be expanded
std::queue<std::vector<uint64_t>> queue;
// Shortest path recorded so far for each node id
std::unordered_map<uint64_t, std::vector<uint64_t>> shortest_paths;
// Seed the queue with the start node
queue.push({start_node_id});
shortest_paths[start_node_id] = {start_node_id};
// Keep searching while the queue is not empty
while(!queue.empty()) {
// Take one path from the queue
auto path = queue.front();
queue.pop();
// Id of the last node on the current path
uint64_t nodeId = path.back();
// Look up the node for that id
const scclTopoNode_t* current_node = findNodeById(nodeId);
if(current_node == nullptr) {
continue;
}
// A GPU node other than the start terminates the path: record it as a result
if(current_node->type == GPU && nodeId != start_node_id) {
int hipDev;
bootstrap::physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
if(hipDev < nLocalRanks) {
gpu_paths_[start_node_id].push_back(path);
}
} else {
int nodeInterRank;
bootstrap::physical_links::getIdComponents(nodeId, &nodeInterRank);
// Visit every neighbor of the current node
for(uint64_t neighbor_id : graph_node_neighbors_.at(nodeId)) {
if(findNodeById(neighbor_id) == nullptr) {
continue;
}
// interRank of the neighbor node
int neighbor_inter_rank;
bootstrap::physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);
// Has the neighbor already been visited on the current path?
bool visited = std::find(path.begin(), path.end(), neighbor_id) != path.end();
// Check whether the interRank already exists (only when interRank changes)
// NOTE(review): the loop below compares node ids, not interRanks, so it can only
// become true when `visited` is already true — looks redundant; confirm intent.
bool inter_rank_exists = false;
if(neighbor_inter_rank != nodeInterRank) {
for(uint64_t node_id : path) {
if(node_id == neighbor_id) {
inter_rank_exists = true;
break;
}
}
}
// Extend the path only if the neighbor is unvisited and its interRank is not already present
if(!visited && !inter_rank_exists) {
std::vector<uint64_t> new_path = path;
new_path.push_back(neighbor_id);
// Record and enqueue only when the neighbor has no path yet or the new one is strictly shorter
if(shortest_paths.find(neighbor_id) == shortest_paths.end() || shortest_paths[neighbor_id].size() > new_path.size()) {
shortest_paths[neighbor_id] = new_path;
queue.push(new_path);
}
}
}
}
}
}
/**
 * @brief Breadth-first enumeration of paths from a start GPU node to other GPU nodes
 *        (variant located in an `#else` preprocessor branch).
 *
 * Every path that ends at a GPU node other than the start is appended to
 * gpu_paths_[start_node_id] when that GPU's hipDev is below nLocalRanks. A path is
 * never extended through a GPU it has reached, through a node it already contains,
 * or across to an interRank that already appears on the path.
 *
 * NOTE(review): another definition of this same function precedes this one in the same
 * `#else` branch; only one of them can be kept if the branch is ever enabled.
 *
 * @param start_node_id id of the GPU node the search starts from
 */
void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
    // Queue of partial paths still to be expanded, seeded with the start node.
    std::queue<std::vector<uint64_t>> queue;
    queue.push({start_node_id});

    while(!queue.empty()) {
        auto path = queue.front();
        queue.pop();

        // The last node on the path is the one being expanded.
        const uint64_t nodeId               = path.back();
        const scclTopoNode_t* current_node = findNodeById(nodeId);
        if(current_node == nullptr) {
            continue;
        }

        // A GPU node other than the start terminates the path: record it and stop expanding.
        if(current_node->type == GPU && nodeId != start_node_id) {
            int hipDev = 0;
            bootstrap::physical_links::getIdComponents(current_node->id, nullptr, nullptr, nullptr, &hipDev, nullptr);
            if(hipDev < nLocalRanks) {
                gpu_paths_[start_node_id].push_back(path);
            }
            continue;
        }

        int nodeInterRank = 0;
        bootstrap::physical_links::getIdComponents(nodeId, &nodeInterRank);

        for(uint64_t neighbor_id : graph_node_neighbors_.at(nodeId)) {
            // Skip neighbors that are not known nodes (the original tested nodeId here,
            // which is always valid at this point, so the guard never fired).
            if(findNodeById(neighbor_id) == nullptr) {
                continue;
            }

            int neighbor_inter_rank = 0;
            bootstrap::physical_links::getIdComponents(neighbor_id, &neighbor_inter_rank);

            // Never revisit a node already on this path.
            const bool visited = std::find(path.begin(), path.end(), neighbor_id) != path.end();

            // When the hop changes interRank, refuse it if the path has already been on the
            // neighbor's interRank. (The original compared nodeInterRank with
            // neighbor_inter_rank inside a branch where they are known to differ,
            // so this check could never trigger.)
            bool inter_rank_exists = false;
            if(neighbor_inter_rank != nodeInterRank) {
                inter_rank_exists = std::any_of(path.begin(), path.end(), [neighbor_inter_rank](uint64_t visited_id) {
                    int visited_inter_rank = 0;
                    bootstrap::physical_links::getIdComponents(visited_id, &visited_inter_rank);
                    return visited_inter_rank == neighbor_inter_rank;
                });
            }

            if(!visited && !inter_rank_exists) {
                std::vector<uint64_t> new_path = path;
                new_path.push_back(neighbor_id);
                queue.push(new_path);
            }
        }
    }
}
#endif
/**
* @brief 打印GPU路径信息
*
......@@ -463,7 +337,7 @@ void PathFinder::printGpuPaths() {
int interRank, deviceValue, terminalType, hipDev, numaId;
// 根据起始节点的ID获取其interRank、deviceValue、terminalType和numaId
bootstrap::physical_links::getIdComponents(start_node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(start_node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
printf("GPU node ID:%lu (InterRank:%d, V:%d, T:%d, H:%d, N:%d) (Path count: %zu)\n",
start_node_id,
interRank,
......@@ -486,7 +360,7 @@ void PathFinder::printGpuPaths() {
const scclTopoNode_t* node = findNodeById(node_id);
if(node) {
// 根据节点的ID获取其interRank、deviceValue、terminalType和numaId
bootstrap::physical_links::getIdComponents(node->id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(node->id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
// 将节点的总线ID转换为字符串
int64ToBusId(node->busId, busIdStr);
// 打印节点的信息,包括其interRank、deviceValue、terminalType、numaId、类型和总线ID字符串
......
......@@ -13,21 +13,21 @@ namespace hardware {
namespace topology {
namespace graph {
// 设置Path路径直接link的 bandwidth 和 speed
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class PathFinder {
public:
// 构造函数
PathFinder(const BootstrapComm_t* bootstrap_comm);
PathFinder(const BootstrapComm_t* bootstrap_comm, std::vector<char>& node_info_vec, size_t node_info_total_bytes);
// 计算拓扑图中GPU节点之间的点对点映射
scclResult_t computeTopoGpuP2pMap(scclTopoGraph_t* graph);
// 计算拓扑图中GPU节点之间的点对点映射,结果保存在graph中
scclResult_t computeTopoGpuP2pMap(scclTopoGraph_t* topo_graph);
// 打印函数
void printGpuPaths();
private:
// 获取所有GPU到GPU的路径函数
void findGpuPaths();
// 使用广度优先搜索(BFS)查找从起始GPU节点到其他GPU节点的最短路径
void bfsFindGpuPaths(uint64_t start_node_id);
......@@ -53,6 +53,9 @@ private:
int nInterRanks = 0; // 全局拥有节点的个数
};
// 根据 node_id 获取 gpu_rank
int getGpuRankFromNodeId(uint64_t node_id, int nLocalRanks);
} // namespace graph
} // namespace topology
} // namespace hardware
......
......@@ -4,7 +4,7 @@
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
namespace graph {
namespace physical_links {
constexpr int numaIdStrLen = 10;
......@@ -726,7 +726,7 @@ void printTopoNode(ByteSpanArray<scclTopoNode_t>& nodes, int nodeIndex, const ch
}
} // namespace physical_links
} // namespace bootstrap
} // namespace graph
} // namespace topology
} // namespace hardware
} // namespace sccl
\ No newline at end of file
......@@ -13,12 +13,14 @@
#include <filesystem> // 需要C++17支持
#include "container.h"
#include "bootstrap_utils.h"
#include "bootstrap.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
namespace graph {
typedef sccl::hardware::net::scclNet_t scclNet_t;
constexpr size_t topoNodeMaxLocalNodes = 128; // 每个节点最多的node数量
constexpr size_t topoNodeMaxNeighbors = 16; // 每个node最多neighbor数量
......@@ -70,7 +72,7 @@ scclResult_t generate_topo_nodes(const char* pciPath, int interRank, int hipDev,
// 根据numaId获取pci路径
std::string generate_topo_node_numa_info(int numaId);
// 输出id分解后的所有数据
// 输出node id分解后的所有数据
void getIdComponents(
uint64_t idToDecompose, int* interRank = nullptr, int* deviceValue = nullptr, int* terminalType = nullptr, int* hipDev = nullptr, int* numaId = nullptr);
......@@ -82,7 +84,7 @@ char* getNetPciPath(scclNet_t* scclNet, int hipDev);
void printTopoNode(ByteSpanArray<scclTopoNode_t>& nodes, int nodeIndex, const char* prefix);
} // namespace physical_links
} // namespace bootstrap
} // namespace graph
} // namespace topology
} // namespace hardware
} // namespace sccl
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment