Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishen01
Sccl
Commits
58d57301
Commit
58d57301
authored
Aug 13, 2025
by
lishen
Browse files
将建图过程中原本在bootstrap中的一部分完全移动到graph中
parent
708aae12
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
634 additions
and
684 deletions
+634
-684
examples/2_topo/2_bootstrap/compile_mpi3_init_step2.sh
examples/2_topo/2_bootstrap/compile_mpi3_init_step2.sh
+2
-1
src/hardware/hardware.cpp
src/hardware/hardware.cpp
+8
-4
src/hardware/topology/bootstrap/bootstrap.cpp
src/hardware/topology/bootstrap/bootstrap.cpp
+6
-372
src/hardware/topology/bootstrap/bootstrap.h
src/hardware/topology/bootstrap/bootstrap.h
+5
-40
src/hardware/topology/graph/graph.cpp
src/hardware/topology/graph/graph.cpp
+375
-12
src/hardware/topology/graph/graph.h
src/hardware/topology/graph/graph.h
+19
-4
src/hardware/topology/graph/graph_utils.cpp
src/hardware/topology/graph/graph_utils.cpp
+86
-0
src/hardware/topology/graph/graph_utils.h
src/hardware/topology/graph/graph_utils.h
+35
-32
src/hardware/topology/graph/paths.cpp
src/hardware/topology/graph/paths.cpp
+81
-207
src/hardware/topology/graph/paths.h
src/hardware/topology/graph/paths.h
+9
-6
src/hardware/topology/graph/physical_links.cpp
src/hardware/topology/graph/physical_links.cpp
+2
-2
src/hardware/topology/graph/physical_links.h
src/hardware/topology/graph/physical_links.h
+6
-4
No files found.
examples/2_topo/2_bootstrap/compile_mpi3_init_step2.sh
View file @
58d57301
...
...
@@ -13,13 +13,14 @@ hipcc ./3_mpi_init_mpi_init_step2_graph.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/physical_links.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/thread_pool.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/physical_links.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/graph_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/graph.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/paths.cpp
\
-o
3_mpi_init_mpi_init_step2_graph
\
...
...
src/hardware/hardware.cpp
View file @
58d57301
...
...
@@ -39,15 +39,19 @@ scclResult_t sccl_init(const scclUniqueId* unique_id, int rank, int nRanks) {
SCCLCHECK
(
sccl_bootstrap
->
init
(
bootstrap_comm
));
printf
(
"init pos 1
\n
"
);
// -------------------------- 3.
MPI
建图 ----------------------------------- //
// -------------------------- 3.
拓扑
建图 ----------------------------------- //
topo_graph
=
new
scclTopoGraph_t
(
nRanks
);
auto
sccl_graph
=
std
::
make_unique
<
topology
::
graph
::
Graph
>
(
rank
,
nRanks
);
auto
sccl_graph
=
std
::
make_unique
<
topology
::
graph
::
Graph
>
(
sccl_bootstrap
.
get
()
);
printf
(
"init pos 2
\n
"
);
// 计算
通信路径
SCCLCHECK
(
sccl_graph
->
calculateCommunicationPaths
(
bootstrap_comm
,
topo_graph
,
sccl_bootstrap
.
get
()
));
// 计算
拓扑图
SCCLCHECK
(
sccl_graph
->
establishGraph
(
bootstrap_comm
));
printf
(
"init pos 3
\n
"
);
// 计算通信路径
SCCLCHECK
(
sccl_graph
->
calculateCommunicationPaths
(
bootstrap_comm
,
topo_graph
));
printf
(
"init pos 4
\n
"
);
// -------------------------- 3.MPI allgather设置unique_id的整合 ----------------------------------- //
// -------------------------- 5.根据各个节点的基础信息计算topo结果 ----------------------------------- //
...
...
src/hardware/topology/bootstrap/bootstrap.cpp
View file @
58d57301
...
...
@@ -278,17 +278,9 @@ scclResult_t bootstrapCreateRoot(BootstrapHandle_t* handle) {
////////////////////////////// 结构体定义 //////////////////////////////
// scclRankPhysSet构造函数定义
scclRankPhysSet
::
scclRankPhysSet
(
int
nRanks
,
int
nLocalRanks
)
:
nRanks
(
nRanks
),
nLocalRanks
(
nLocalRanks
),
node_info_total_bytes
(
sizeof
(
scclTopoNode_t
)
*
topoNodeMaxLocalNodes
/
nLocalRanks
)
{
printf
(
"scclRankPhysSet 构造函数
\n
"
);
scclRankPhysSet
::
scclRankPhysSet
(
int
nRanks
)
{
rank_info_vec
.
reserve
(
nRanks
);
// 预留空间
rank_info_vec
.
clear
();
// 与scclNodeInfo_t中的定义一致
node_info_vec
.
reserve
(
nRanks
*
node_info_total_bytes
);
// 预留空间
node_info_vec
.
clear
();
printf
(
"scclRankPhysSet 预留空间并初始化node_info_vec, nRanks * node_info_total_bytes=%zu
\n
"
,
nRanks
*
node_info_total_bytes
);
}
void
BootstrapComm
::
init
(
int
rank
,
int
nRanks
,
int
localRank
,
int
nLocalRanks
)
{
...
...
@@ -299,7 +291,7 @@ void BootstrapComm::init(int rank, int nRanks, int localRank, int nLocalRanks) {
this
->
nLocalRanks
=
nLocalRanks
;
this
->
interRank
=
rank
/
nLocalRanks
;
this
->
nInterRanks
=
nRanks
/
nLocalRanks
;
rank_phys_set
=
new
scclRankPhysSet
(
nRanks
,
nLocalRanks
);
// 假设需要动态分配
rank_phys_set
=
new
scclRankPhysSet
(
nRanks
);
// 假设需要动态分配
};
void
BootstrapComm
::
destroy
()
{
...
...
@@ -347,7 +339,6 @@ scclResult_t Bootstrap::init(BootstrapComm_t* bootstrap_comm) {
// -------------------------- 3.设置本地localRank的BootstrapComm信息 ----------------------------------- //
// 初始化BootstrapComm类
bootstrap_comm
->
init
(
rank
,
nRanks
,
localRank
,
nLocalRanks
);
if
(
CPU_COUNT
(
&
bootstrap_comm
->
cpuAffinity
))
{
sched_setaffinity
(
0
,
sizeof
(
cpu_set_t
),
&
bootstrap_comm
->
cpuAffinity
);
}
...
...
@@ -379,105 +370,12 @@ scclResult_t Bootstrap::init(BootstrapComm_t* bootstrap_comm) {
local_rank_info
.
hostHash
=
node_basic
.
hostHash
;
SCCLCHECK
(
bootstrapCommInitNodeInfo
(
bootstrap_comm
->
scclNet
,
&
local_rank_info
));
memcpy
(
&
(
local_rank_info
.
cpu
.
listen_sock
),
&
(
node_basic
.
sock
),
sizeof
(
scclSocket_t
));
//////// 初始化topo node ////////
scclNodeInfo_t
local_topo_nodes
(
nLocalRanks
);
// 使用ByteSpan替代std::vector,并指定容量为pNodes_len
ByteSpanVector
<
scclTopoNode_t
>
nodes_span
((
void
*
)
local_topo_nodes
.
nodes
,
local_topo_nodes
.
totalByteSize
);
#if 1
printf
(
"devices_num=%d, local_rank_info.net.count=%d
\n
"
,
bootstrap_comm
->
deviceCnt
,
local_rank_info
.
net
.
count
);
#endif
// 遍历所有的GPU的pciPath,添加topo node
for
(
int
r
=
localRank
;
r
<
devices_num
;
r
+=
nLocalRanks
)
{
auto
gpu_path
=
physical_links
::
getGpuPciPath
(
r
);
physical_links
::
generate_topo_nodes
(
gpu_path
,
interRank
,
r
,
nodes_span
);
delete
(
gpu_path
);
}
// 遍历所有的NIC的pciPath,添加topo node
for
(
int
r
=
localRank
;
r
<
local_rank_info
.
net
.
count
;
r
+=
nLocalRanks
)
{
auto
net_path
=
physical_links
::
getNetPciPath
(
bootstrap_comm
->
scclNet
,
r
);
physical_links
::
generate_topo_nodes
(
net_path
,
interRank
,
r
,
nodes_span
);
delete
(
net_path
);
}
#if 0
if(interRank == 0) {
ByteSpanArray<scclTopoNode_t> nodes_span_array(nodes_span.data(), local_topo_nodes.totalByteSize);
printf("print rank=%d, nodes_span size=%zu\n", rank, nodes_span.size());
char line[30];
sprintf(line, "print rank=%d: ", rank);
for(int i = 0; i < nodes_span.size(); i++) {
printf("============================**============================\n");
physical_links::printTopoNode(nodes_span_array, i, line);
printf("============================**============================\n");
}
}
#endif
#if 0
// 尝试采用软件识别GPU之间互联
for(int i = 0; i < bootstrap_comm->deviceCnt; i++) {
// if(i != bootstrap_comm->hipDev) {
RSMI_IO_LINK_TYPE rsmi_type;
int hops, count;
if(rocm_smi_getLinkInfo(bootstrap_comm->hipDev, i, &rsmi_type, &hops, &count) == scclSuccess) {
printf("rank=%d, i=%d, dev=%d, rsmi_type=%d, hops=%d, count=%d\n", rank, i, bootstrap_comm->hipDev, rsmi_type, hops, count);
// if(rsmi_type == RSMI_IOLINK_TYPE_XGMI && hops <= 2) {
// if(1) {
// char busIdStr[] = "00000000:00:00.0";
// SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
// char lowerId[16];
// for(int c = 0; c < 16; c++) {
// lowerId[c] = tolower(busIdStr[c]);
// if(busIdStr[c] == 0)
// break;
// }
// }
} else {
printf("rsmi get type fail\n");
}
// }
}
#endif
// -------------------------- 4.BootstrapComm信息的allgather ----------------------------------- //
bootstrapCommAllGather
(
&
local_rank_info
,
&
local_topo_nodes
,
bootstrap_comm
->
rank_phys_set
);
// TODO: 目前手动将节点内的GPU进行mesh连接,因为无法从/sys/device中获取NIC的拓扑信息,rsmi函数也无法获取NIC的拓扑信息。后续优化
bootstrapNodesLink
(
bootstrap_comm
->
rank_phys_set
->
node_info_vec
.
data
(),
bootstrap_comm
->
rank_phys_set
->
node_info_total_bytes
);
#if 0
if(rank == 1) {
size_t dataLen = bootstrap_comm->rank_phys_set->node_info_total_bytes;
printf("nRanks * bootstrap_comm->rank_phys_set->node_info_total_bytes=%zu, %lu\n", dataLen, nRanks * dataLen);
auto node_info_data = reinterpret_cast<char*>(bootstrap_comm->rank_phys_set->node_info_vec.data());
ByteSpanArray<scclTopoNode_t> nodes_span_all(node_info_data, nRanks * dataLen);
printf("print rank=%d, nodes_span_all size=%zu, scclTopoNode_t size=%zu\n", rank, nodes_span_all.size(), sizeof(scclTopoNode_t));
char line[30];
sprintf(line, "print rank=%d: ", rank);
int node_cnt = 0;
for(int i = 0; i < nodes_span_all.size(); i++) {
if(nodes_span_all[i] && nodes_span_all[i]->type > 0) {
if(i < 64) {
printf("============================&&============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================&&============================\n");
} else if(i < 128) {
printf("============================((============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================))============================\n");
} else {
printf("============================@@============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================@@============================\n");
}
node_cnt += 1;
}
}
printf("print rank=%d, node_cnt=%d\n", rank, node_cnt);
}
#endif
// 将每个节点的`rank_info`信息收集到`rank_phys_set`中,以便后续使用
SCCLCHECK
(
bootstrapAllGather
(
&
local_rank_info
,
bootstrap_comm
->
rank_phys_set
->
rank_info_vec
.
data
(),
sizeof
(
scclRankInfo_t
)));
// 设置初始化标志
asm_ops
::
st_release_sys_global
(
&
socketInitDone
,
true
);
...
...
@@ -545,6 +443,7 @@ scclResult_t Bootstrap::bootstrapRootGatherAndBroadcast(BootstrapNodeBasic_t* se
SCCLCHECK
(
bootstrapNet
::
bootstrapNetRecv
(
accept_manager
.
getSocket
(),
all_node_basic
,
all_node_basic_size
));
}
printf
(
"all_node_basic_size=%d
\n
"
,
all_node_basic_size
);
// ------------- 5.nLocalRanks==0时,将所有rank的ip数据广播给节点内其他rank ------------- //
ipcsocket
=
new
scclIpcSocket_t
(
localRank
,
nLocalRanks
,
/*hash*/
root_handle
->
magic
);
ipcsocket
->
scclIpcSocketBroadcast
(
all_node_basic
,
all_node_basic_size
,
/*localRank root*/
0
);
...
...
@@ -618,28 +517,6 @@ scclResult_t Bootstrap::bootstrapCommInitNodeInfo(scclNet_t* scclNet, scclRankIn
return
scclSuccess
;
}
/**
* @brief 实现节点间通信的AllGather操作
*
* 该函数通过调用`bootstrapAllGather`函数,实现节点间通信的AllGather操作。
* 它将每个节点的`rank_info`信息和`node_info`信息收集到`rank_phys_set`中,以便后续使用。
*
* @param rank_info 指向当前节点的`rank_info`信息的指针
* @param node_info 指向当前节点的`node_info`信息的指针
* @param rank_phys_set 指向节点信息集合的指针,用于存储所有节点的`rank_info`和`node_info`信息
* @return scclResult_t 返回操作结果状态码:
* - scclSuccess: 操作成功
* - 其他错误码: 表示操作失败
*
* @note 该函数是一个简化的接口,用于调用`bootstrapAllGather`函数来实现节点间通信的AllGather操作。
* 在调用`bootstrapAllGather`函数之前,需要确保`rank_info`、`node_info`和`rank_phys_set`已经正确初始化。
*/
scclResult_t
Bootstrap
::
bootstrapCommAllGather
(
scclRankInfo_t
*
rank_info
,
scclNodeInfo_t
*
node_info
,
scclRankPhysSet_t
*
rank_phys_set
)
{
SCCLCHECK
(
bootstrapAllGather
(
rank_info
,
rank_phys_set
->
rank_info_vec
.
data
(),
sizeof
(
scclRankInfo_t
)));
SCCLCHECK
(
bootstrapAllGather
(
node_info
->
nodes
,
rank_phys_set
->
node_info_vec
.
data
(),
rank_phys_set
->
node_info_total_bytes
));
return
scclSuccess
;
}
// TODO: 后续可以采用优化,先节点内allgather,再节点间的allgather,最后节点内的Broadcast。优化的算法并保证正确性
/**
* @brief 实现跨节点的AllGather通信操作
...
...
@@ -659,7 +536,7 @@ scclResult_t Bootstrap::bootstrapCommAllGather(scclRankInfo_t* rank_info, scclNo
* 此外,该函数还假设所有节点的基本信息(如套接字地址)已经通过其他途径正确获取并存储在all_node_basic向量中。
* 在节点间通信中,使用了Ring AllGather算法,该算法在nRanks特别大的时候可能不是最优的选择,可以考虑进一步优化算法以减少通信次数。
*/
scclResult_t
Bootstrap
::
bootstrapAllGather
(
const
void
*
src_data
,
void
*
dst_data
,
int
data_size
)
{
scclResult_t
Bootstrap
::
bootstrapAllGather
(
const
void
*
src_data
,
void
*
dst_data
,
int
data_size
)
const
{
// 数据准备
size_t
inter_data_len
=
nLocalRanks
*
data_size
;
// 节点间传输时每个子块的大小
auto
all_recv_data
=
reinterpret_cast
<
char
*>
(
dst_data
);
...
...
@@ -703,249 +580,6 @@ scclResult_t Bootstrap::bootstrapAllGather(const void* src_data, void* dst_data,
return
scclSuccess
;
}
// TODO: 当前实现使用了较多的for循环,在节点数量较大时速度较慢,可以考虑采用cuda kernel
/**
* @brief 初始化并连接节点之间的链接
*
* 该函数接收一个指向节点信息的字节数组的指针和节点信息的总字节数,用于初始化并连接节点之间的链接。
* 1.创建一个`ByteSpanArray`对象来管理节点信息的内存,然后根据节点的类型(GPU、PCI或NIC)将它们分类存储。
* 2.它使相同`interRank`下的GPU节点两两互联
* 3.遍历所有的`interRank`来合并具有相同`id`、`type`和`busIdStr`的PCI节点。
* 4.使CPU node即numa node的neighbors两两互联。
* 5.它使相同`deviceId`下的NIC节点两两互联。
*
* @param node_info_vec 指向节点信息的字节数组的指针
* @param node_info_total_bytes 节点信息的总字节数
* @return scclResult_t 返回操作结果状态码:
* - scclSuccess: 操作成功
* - scclError: 操作失败
*/
scclResult_t
Bootstrap
::
bootstrapNodesLink
(
void
*
node_info_vec
,
int
node_info_total_bytes
)
{
// 创建一个ByteSpanArray对象,用于管理节点信息的内存
ByteSpanArray
<
scclTopoNode_t
>
node_info_span
(
node_info_vec
,
nRanks
*
node_info_total_bytes
);
// 用于将nodes的deviceId对应的node
std
::
unordered_map
<
uint64_t
,
std
::
vector
<
scclTopoNode_t
*>>
nodes_map_by_deviceId
;
// 用于将interRank内nodes的deviceSig对应的NIC节点连接
std
::
unordered_map
<
uint64_t
,
std
::
vector
<
scclTopoNode_t
*>>
nic_nodes_by_deviceId
;
// 用于识别并连接节点内的GPU node
std
::
vector
<
std
::
vector
<
scclTopoNode_t
*>>
gpu_nodes_by_interRank
(
nInterRanks
);
// -------------------------- 1.遍历所有的节点信息,记录node -------------------------- //
for
(
size_t
i
=
0
;
i
<
node_info_span
.
size
();
++
i
)
{
scclTopoNode_t
*
node
=
node_info_span
[
i
];
// 跳过空节点、跳过没有busId的节点(如空节点或CPU)
if
(
node
->
type
<=
0
)
{
continue
;
}
uint64_t
id
=
node
->
id
;
int
interRank
;
physical_links
::
getIdComponents
(
id
,
&
interRank
);
uint64_t
deviceSig
=
id
&
0xFFFFFFFFFF
;
// 计算 interRank(24bit) + hipDev(8bit) + deviceId(16bit) + terminalType(8bit) + numaId(8bit)
// 选择type为GPU的节点
if
(
node
->
type
==
GPU
)
{
if
(
interRank
>=
gpu_nodes_by_interRank
.
size
())
{
gpu_nodes_by_interRank
.
resize
(
interRank
+
1
);
}
gpu_nodes_by_interRank
[
interRank
].
push_back
(
node
);
}
else
if
(
node
->
type
==
NIC
)
{
nic_nodes_by_deviceId
[
deviceSig
].
push_back
(
node
);
}
nodes_map_by_deviceId
[
id
].
push_back
(
node
);
}
// 合并id相同和busId相同的node
for
(
auto
&
pair
:
nodes_map_by_deviceId
)
{
auto
&
nodes
=
pair
.
second
;
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
for
(
size_t
j
=
i
+
1
;
j
<
nodes
.
size
();
++
j
)
{
// if(nodes[i]->id == nodes[j]->id && nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) {
// SCCLCHECK(nodes[i]->combineNode(nodes[j]));
// }
if
(
nodes
[
i
]
->
id
==
nodes
[
j
]
->
id
)
{
if
(
nodes
[
i
]
->
type
==
nodes
[
j
]
->
type
&&
nodes
[
i
]
->
busId
==
nodes
[
j
]
->
busId
)
{
SCCLCHECK
(
nodes
[
i
]
->
combineNode
(
nodes
[
j
]));
}
else
{
#if 0
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(nodes[i]->busId, busIdStr_i);
int64ToBusId(nodes[j]->busId, busIdStr_j);
printf("same Id but different type or busId: %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, "
"H:%d, N:%d, busIdStr:%s)\n",
nodes[i]->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
nodes[j]->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
#endif
}
}
}
}
}
// 遍历所有的节点信息,将CPU的所有neighbor node两两互联
for
(
size_t
i
=
0
;
i
<
node_info_span
.
size
();
++
i
)
{
scclTopoNode_t
*
node
=
node_info_span
[
i
];
// 跳过空节点、跳过没有busId的节点(如空节点或CPU)
if
(
node
->
type
==
CPU
)
{
for
(
size_t
i
=
0
;
i
<
node
->
neighborCount
;
++
i
)
{
for
(
size_t
j
=
i
+
1
;
j
<
node
->
neighborCount
;
++
j
)
{
// 使用unordered_map来加速查找
auto
it_i
=
nodes_map_by_deviceId
.
find
(
node
->
neighbors
[
i
]);
auto
it_j
=
nodes_map_by_deviceId
.
find
(
node
->
neighbors
[
j
]);
if
(
it_i
!=
nodes_map_by_deviceId
.
end
()
&&
it_j
!=
nodes_map_by_deviceId
.
end
())
{
scclTopoNode_t
*
neighbor_i
=
nullptr
;
scclTopoNode_t
*
neighbor_j
=
nullptr
;
for
(
auto
&
n
:
it_i
->
second
)
{
if
(
n
->
type
>
0
)
{
neighbor_i
=
n
;
break
;
}
}
for
(
auto
&
n
:
it_j
->
second
)
{
if
(
n
->
type
>
0
)
{
neighbor_j
=
n
;
break
;
}
}
if
(
neighbor_i
&&
neighbor_j
)
{
neighbor_i
->
addNeighbor
(
neighbor_j
->
id
);
neighbor_j
->
addNeighbor
(
neighbor_i
->
id
);
#if 0
{
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(
neighbor_i->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(
neighbor_j->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(neighbor_i->busId, busIdStr_i);
int64ToBusId(neighbor_j->busId, busIdStr_j);
printf("connect CPU neighbors %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, "
"N:%d, busIdStr:%s)\n",
neighbor_i->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
neighbor_j->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
}
#endif
}
}
}
}
}
}
// 使相同interRank下的GPU node两两互联
for
(
const
auto
&
nodes
:
gpu_nodes_by_interRank
)
{
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
for
(
size_t
j
=
i
+
1
;
j
<
nodes
.
size
();
++
j
)
{
nodes
[
i
]
->
addNeighbor
(
nodes
[
j
]
->
id
);
nodes
[
j
]
->
addNeighbor
(
nodes
[
i
]
->
id
);
#if 0
{
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(nodes[i]->busId, busIdStr_i);
int64ToBusId(nodes[j]->busId, busIdStr_j);
printf("connect GPU %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s)\n",
nodes[i]->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
nodes[j]->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
}
#endif
}
}
}
// 使相同deviceId下的NIC节点两两互联
for
(
const
auto
&
pair
:
nic_nodes_by_deviceId
)
{
const
auto
&
nodes
=
pair
.
second
;
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
for
(
size_t
j
=
i
+
1
;
j
<
nodes
.
size
();
++
j
)
{
// 在deviceId相同的情况下,比较busIdStr
if
(
nodes
[
i
]
->
busId
==
nodes
[
j
]
->
busId
)
{
nodes
[
i
]
->
addNeighbor
(
nodes
[
j
]
->
id
);
nodes
[
j
]
->
addNeighbor
(
nodes
[
i
]
->
id
);
#if 0
{
int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
char busIdStr_i[17], busIdStr_j[17];
int64ToBusId(nodes[i]->busId, busIdStr_i);
int64ToBusId(nodes[j]->busId, busIdStr_j);
printf("connect NIC interRank %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, "
"busIdStr:%s)\n",
nodes[i]->id,
tmpi_interRank,
tmpi_deviceValue,
tmpi_terminalType,
tmpi_hipDev,
tmpi_numaId,
busIdStr_i,
nodes[j]->id,
tmpj_interRank,
tmpj_deviceValue,
tmpj_terminalType,
tmpj_hipDev,
tmpj_numaId,
busIdStr_j);
}
#endif
}
}
}
}
return
scclSuccess
;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
// 函数:打印 scclRankInfo 结构体的信息
scclResult_t
printRankInfo
(
const
std
::
string
&
prefix
,
scclRankInfo_t
*
info
)
{
...
...
src/hardware/topology/bootstrap/bootstrap.h
View file @
58d57301
...
...
@@ -7,7 +7,6 @@
#include "bootstrap_net.h"
#include "thread_pool.h"
#include "ipc_socket.h"
#include "physical_links.h"
namespace
sccl
{
namespace
hardware
{
...
...
@@ -15,7 +14,6 @@ namespace topology {
namespace
bootstrap
{
typedef
sccl
::
hardware
::
net
::
ipc_socket
::
scclIpcSocket_t
scclIpcSocket_t
;
typedef
physical_links
::
scclTopoNode_t
scclTopoNode_t
;
///////////////////////////////////// 用于初始化时的功能函数 //////////////////////////////////////////
scclResult_t
bootstrapGetUniqueId
(
BootstrapHandle_t
*
handle
);
...
...
@@ -56,41 +54,13 @@ typedef struct scclRankInfo {
uint64_t
pidHash
=
0
;
// 进程 ID 哈希值
}
scclRankInfo_t
;
// 定义结构体 scclNodeInfo,用于存储每个rank的图连接信息
// TODO: 目前每个rank需要的node_info大小为4k+,当卡数较大时占用内存较大,可以优化。或者不作为全局变量
typedef
struct
scclNodeInfo
{
scclTopoNode_t
*
nodes
;
// 指向scclTopoNode_t对象数组的指针
int
nLocalRanks
;
int
totalByteSize
;
// 表示占用的总字节数
// 带参数的构造函数,用于初始化nodes的大小
scclNodeInfo
(
int
nLocalRanks
)
:
nodes
(
nullptr
),
nLocalRanks
(
nLocalRanks
),
totalByteSize
(
sizeof
(
scclTopoNode_t
)
*
topoNodeMaxLocalNodes
/
nLocalRanks
)
{
nodes
=
reinterpret_cast
<
scclTopoNode_t
*>
(
malloc
(
totalByteSize
));
if
(
nodes
)
{
memset
(
nodes
,
0
,
totalByteSize
);
}
}
// 析构函数,用于释放申请的数组空间
virtual
~
scclNodeInfo
()
{
if
(
nodes
)
{
free
(
nodes
);
}
}
}
scclNodeInfo_t
;
// 所有节点的信息
typedef
struct
scclRankPhysSet
{
// 构造函数声明
scclRankPhysSet
(
int
nRanks
,
int
nLocalRanks
);
std
::
vector
<
scclRankInfo_t
>
rank_info_vec
;
std
::
vector
<
char
>
node_info_vec
;
// 实际为std::vector<scclNodeInfo_t>,vector不支持scclNodeInfo_t变长
scclRankPhysSet
(
int
nRanks
);
public:
int
nRanks
=
0
;
// 总的节点数量
int
nLocalRanks
=
0
;
// 本地计算节点中的节点总数
size_t
node_info_total_bytes
=
0
;
// 记录可变长度scclNodeInfo_t类型数据的实际大小
std
::
vector
<
scclRankInfo_t
>
rank_info_vec
;
}
scclRankPhysSet_t
;
// BootstrapComm 结构体定义,用于存储引导通信信息
...
...
@@ -126,7 +96,7 @@ public:
scclResult_t
init
(
BootstrapComm_t
*
bootstrap_comm
);
// 实现跨节点的AllGather通信操作
scclResult_t
bootstrapAllGather
(
const
void
*
src_data
,
void
*
dst_data
,
int
data_size
);
scclResult_t
bootstrapAllGather
(
const
void
*
src_data
,
void
*
dst_data
,
int
data_size
)
const
;
private:
// 执行根节点的聚集和广播操作
...
...
@@ -135,17 +105,12 @@ private:
// 初始化节点通信信息
scclResult_t
bootstrapCommInitNodeInfo
(
scclNet_t
*
scclNet
,
scclRankInfo_t
*
rank_info
);
// 实现rank_info信息的节点间通信的AllGather操作
scclResult_t
bootstrapCommAllGather
(
scclRankInfo_t
*
rank_info
,
scclNodeInfo_t
*
node_info
,
scclRankPhysSet_t
*
rank_phys_set
);
// 额外处理nRanks个nodes的连接关系
scclResult_t
bootstrapNodesLink
(
void
*
node_info_vec
,
int
node_info_total_bytes
);
private:
public:
int
rank
,
nRanks
;
// 初始化阶段获取MPI的值
int
localRank
,
nLocalRanks
;
// 通过bootstrapRootGatherAndBroadcast函数确定值
int
interRank
,
nInterRanks
;
// 整个节点在全部节点中的位置
private:
// TODO: 用于控制套接字终端的变量,目前不知道在哪里使用
volatile
uint32_t
*
abortFlag
;
// 中止标志,非阻塞套接字设置
...
...
src/hardware/topology/graph/graph.cpp
View file @
58d57301
#include <iostream>
#include "base.h"
#include "graph.h"
#include "paths.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
graph
{
Graph
::
Graph
(
int
rank
,
int
nRanks
)
:
rank
(
rank
),
nRanks
(
nRanks
)
{
// 构造函数的实现
// 构造函数的实现
Graph
::
Graph
(
const
Bootstrap
*
bootstrap
)
:
sccl_bootstrap
(
bootstrap
),
rank
(
sccl_bootstrap
->
rank
),
nRanks
(
sccl_bootstrap
->
nRanks
),
localRank
(
sccl_bootstrap
->
localRank
),
nLocalRanks
(
sccl_bootstrap
->
nLocalRanks
),
interRank
(
sccl_bootstrap
->
interRank
),
nInterRanks
(
sccl_bootstrap
->
nInterRanks
)
{
// 与scclNodeInfo_t中的定义一致,预留足够大小的node空间
this
->
node_info_total_bytes
=
sizeof
(
scclTopoNode_t
)
*
topoNodeMaxLocalNodes
/
nLocalRanks
;
node_info_vec
.
reserve
(
nRanks
*
node_info_total_bytes
);
// 预留空间
node_info_vec
.
clear
();
}
Graph
::~
Graph
()
{
// 析构函数的实现
// 析构函数的实现
Graph
::~
Graph
()
{}
scclResult_t
Graph
::
establishGraph
(
const
BootstrapComm_t
*
bootstrap_comm
)
{
//////// 初始化topo node ////////
scclNodeInfo_t
local_topo_nodes
(
nLocalRanks
);
// 使用ByteSpan替代std::vector,并指定容量为pNodes_len
ByteSpanVector
<
scclTopoNode_t
>
nodes_span
((
void
*
)
local_topo_nodes
.
nodes
,
local_topo_nodes
.
totalByteSize
);
// 遍历所有的GPU的pciPath,添加topo node
for
(
int
r
=
localRank
;
r
<
bootstrap_comm
->
deviceCnt
;
r
+=
nLocalRanks
)
{
auto
gpu_path
=
physical_links
::
getGpuPciPath
(
r
);
physical_links
::
generate_topo_nodes
(
gpu_path
,
this
->
interRank
,
r
,
nodes_span
);
delete
(
gpu_path
);
}
// 遍历所有的NIC的pciPath,添加topo node
bootstrap
::
scclRankInfo_t
local_rank_info
=
bootstrap_comm
->
rank_phys_set
->
rank_info_vec
[
this
->
rank
];
for
(
int
r
=
localRank
;
r
<
local_rank_info
.
net
.
count
;
r
+=
nLocalRanks
)
{
auto
net_path
=
physical_links
::
getNetPciPath
(
bootstrap_comm
->
scclNet
,
r
);
physical_links
::
generate_topo_nodes
(
net_path
,
this
->
interRank
,
r
,
nodes_span
);
delete
(
net_path
);
}
#if 0
if(interRank == 0) {
char line[30];
sprintf(line, "print rank=%d: ", rank);
bootstrap::printRankInfo(std::string(line), &local_rank_info);
}
#endif
#if 0
if(interRank == 0) {
ByteSpanArray<scclTopoNode_t> nodes_span_array(nodes_span.data(), local_topo_nodes.totalByteSize);
printf("print rank=%d, nodes_span size=%zu\n", rank, nodes_span.size());
char line[30];
sprintf(line, "print rank=%d: ", rank);
for(int i = 0; i < nodes_span.size(); i++) {
printf("============================**============================\n");
physical_links::printTopoNode(nodes_span_array, i, line);
printf("============================**============================\n");
}
}
#endif
#if 0
// 尝试采用软件识别GPU之间互联
for(int i = 0; i < bootstrap_comm->deviceCnt; i++) {
// if(i != bootstrap_comm->hipDev) {
RSMI_IO_LINK_TYPE rsmi_type;
int hops, count;
if(rocm_smi_getLinkInfo(bootstrap_comm->hipDev, i, &rsmi_type, &hops, &count) == scclSuccess) {
printf("rank=%d, i=%d, dev=%d, rsmi_type=%d, hops=%d, count=%d\n", rank, i, bootstrap_comm->hipDev, rsmi_type, hops, count);
// if(rsmi_type == RSMI_IOLINK_TYPE_XGMI && hops <= 2) {
// if(1) {
// char busIdStr[] = "00000000:00:00.0";
// SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
// char lowerId[16];
// for(int c = 0; c < 16; c++) {
// lowerId[c] = tolower(busIdStr[c]);
// if(busIdStr[c] == 0)
// break;
// }
// }
} else {
printf("rsmi get type fail\n");
}
// }
}
#endif
// -------------------------- 4.Comm信息的allgather ----------------------------------- //
SCCLCHECK
(
sccl_bootstrap
->
bootstrapAllGather
(
local_topo_nodes
.
nodes
,
this
->
node_info_vec
.
data
(),
this
->
node_info_total_bytes
));
// TODO: 目前手动将节点内的GPU进行mesh连接,因为无法从/sys/device中获取NIC的拓扑信息,rsmi函数也无法获取NIC的拓扑信息。后续优化
SCCLCHECK
(
bootstrapNodesLink
(
this
->
node_info_vec
.
data
(),
this
->
node_info_total_bytes
));
#if 0
if(rank == 1) {
size_t dataLen = this->node_info_total_bytes;
printf("nRanks * this->node_info_total_bytes=%zu, %lu\n", dataLen, nRanks * dataLen);
auto node_info_data = reinterpret_cast<char*>(this->node_info_vec.data());
ByteSpanArray<scclTopoNode_t> nodes_span_all(node_info_data, nRanks * dataLen);
printf("print rank=%d, nodes_span_all size=%zu, scclTopoNode_t size=%zu\n", rank, nodes_span_all.size(), sizeof(scclTopoNode_t));
char line[30];
sprintf(line, "print rank=%d: ", rank);
int node_cnt = 0;
for(int i = 0; i < nodes_span_all.size(); i++) {
if(nodes_span_all[i] && nodes_span_all[i]->type > 0) {
if(i < 64) {
printf("============================&&============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================&&============================\n");
} else if(i < 128) {
printf("============================((============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================))============================\n");
} else {
printf("============================@@============================\n");
physical_links::printTopoNode(nodes_span_all, i, line);
printf("============================@@============================\n");
}
node_cnt += 1;
}
}
printf("print rank=%d, node_cnt=%d\n", rank, node_cnt);
}
#endif
return
scclSuccess
;
}
scclResult_t
Graph
::
calculateCommunicationPaths
(
const
BootstrapComm_t
*
bootstrap_comm
,
scclTopoGraph_t
*
topo_graph
,
Bootstrap
*
sccl_bootstrap
)
{
scclResult_t
Graph
::
calculateCommunicationPaths
(
const
BootstrapComm_t
*
bootstrap_comm
,
scclTopoGraph_t
*
topo_graph
)
{
// 通信路径计算的实现
std
::
cout
<<
"Calculating communication paths..."
<<
std
::
endl
;
// 调用pathFinder类,实现硬件路径搜索
auto
path_finder
=
PathFinder
(
bootstrap_comm
);
printf
(
"calculateCommunicationPaths pos 1
\n
"
);
auto
path_finder
=
PathFinder
(
bootstrap_comm
,
this
->
node_info_vec
,
this
->
node_info_total_bytes
);
// 将搜索结果写入topo_graph中,并记录有效node
SCCLCHECK
(
path_finder
.
computeTopoGpuP2pMap
(
topo_graph
));
printf
(
"calculateCommunicationPaths pos 2
\n
"
);
// topo_graph->printGPUPaths(
);
// 调用bootstrap类,将transport_map进行allgather统计
uint8_t
*
local_transport_map
=
topo_graph
->
getTransportMapRowStart
(
rank
);
SCCLCHECK
(
sccl_bootstrap
->
bootstrapAllGather
(
local_transport_map
,
topo_graph
->
transport_map
.
data
(),
nRanks
*
sizeof
(
uint8_t
)));
printf
(
"calculateCommunicationPaths pos 3
\n
"
);
#if 1
// 打印transport_map
if
(
bootstrap_comm
->
rank
==
0
)
{
SCCLCHECK
(
topo_graph
->
printTransportMap
());
}
#endif
return
scclSuccess
;
}
scclResult_t
Graph
::
build
LogicalTopology
()
{
scclResult_t
Graph
::
search
LogicalTopology
()
{
// 逻辑拓扑构建的实现
std
::
cout
<<
"Building logical topology..."
<<
std
::
endl
;
// 具体的实现细节
...
...
@@ -54,6 +172,251 @@ scclResult_t Graph::calculateTopoChannels() {
return
scclSuccess
;
}
////////////////////////////////////////////////// private //////////////////////////////////////////////////
// TODO: the current implementation relies on many nested for loops and slows down when the
// node count grows large; a CUDA/HIP kernel could be considered instead.
/**
 * @brief Initialize and connect the links between topology nodes.
 *
 * Takes a pointer to the byte array holding every rank's node information plus the per-rank
 * byte size, and wires up the links between nodes:
 * 1. Wraps the node information in a `ByteSpanArray` and buckets the nodes by type
 *    (GPU, NIC, other devices).
 * 2. Merges, across all `interRank`s, node entries that share the same `id`, `type`
 *    and `busId`.
 * 3. Connects every pair of neighbors of each CPU (i.e. NUMA) node.
 * 4. Connects every pair of GPU nodes that share the same `interRank`.
 * 5. Connects every pair of NIC nodes that share the same device signature and `busId`.
 *
 * @param node_info_vec         pointer to the byte array holding the node information
 * @param node_info_total_bytes byte size of one rank's node information
 * @return scclResult_t operation status:
 *         - scclSuccess: operation succeeded
 *         - scclError:   operation failed
 */
scclResult_t Graph::bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes) {
    // Wrap the raw node-info memory; the span covers all nRanks entries.
    ByteSpanArray<scclTopoNode_t> node_info_span(node_info_vec, nRanks * node_info_total_bytes);
    // Maps a node id to every node instance reported with that id (one entry per rank).
    std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nodes_map_by_deviceId;
    // Groups NIC nodes by device signature so matching NICs can be linked pairwise.
    std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nic_nodes_by_deviceId;
    // Groups GPU nodes per interRank so intra-node GPUs can be linked pairwise.
    std::vector<std::vector<scclTopoNode_t*>> gpu_nodes_by_interRank(nInterRanks);
    // -------------------------- 1. Scan all node info and bucket the nodes -------------------------- //
    for(size_t i = 0; i < node_info_span.size(); ++i) {
        scclTopoNode_t* node = node_info_span[i];
        // Skip empty slots and entries without a valid type (e.g. unused or CPU placeholder slots).
        if(node->type <= 0) {
            continue;
        }
        uint64_t id = node->id;
        int interRank;
        physical_links::getIdComponents(id, &interRank);
        // Low 40 bits of the packed id form the device signature.
        // Packed layout per the original note: interRank(24bit) + hipDev(8bit) +
        // deviceId(16bit) + terminalType(8bit) + numaId(8bit) — TODO confirm against
        // physical_links id encoding.
        uint64_t deviceSig = id & 0xFFFFFFFFFF;
        // Bucket GPU nodes by interRank.
        if(node->type == GPU) {
            // NOTE(review): signed/unsigned comparison — interRank is int, size() is size_t;
            // resize also assumes interRank is non-negative.
            if(interRank >= gpu_nodes_by_interRank.size()) {
                gpu_nodes_by_interRank.resize(interRank + 1);
            }
            gpu_nodes_by_interRank[interRank].push_back(node);
        } else if(node->type == NIC) {
            nic_nodes_by_deviceId[deviceSig].push_back(node);
        }
        nodes_map_by_deviceId[id].push_back(node);
    }
    // 2. Merge node entries that share the same id, type and busId.
    for(auto& pair : nodes_map_by_deviceId) {
        auto& nodes = pair.second;
        for(size_t i = 0; i < nodes.size(); ++i) {
            for(size_t j = i + 1; j < nodes.size(); ++j) {
                // if(nodes[i]->id == nodes[j]->id && nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) {
                //     SCCLCHECK(nodes[i]->combineNode(nodes[j]));
                // }
                if(nodes[i]->id == nodes[j]->id) {
                    if(nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) {
                        SCCLCHECK(nodes[i]->combineNode(nodes[j]));
                    } else {
#if 0
                        int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                        physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                        int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                        physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                        char busIdStr_i[17], busIdStr_j[17];
                        int64ToBusId(nodes[i]->busId, busIdStr_i);
                        int64ToBusId(nodes[j]->busId, busIdStr_j);
                        printf("same Id but different type or busId: %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, "
                               "H:%d, N:%d, busIdStr:%s)\n",
                               nodes[i]->id,
                               tmpi_interRank,
                               tmpi_deviceValue,
                               tmpi_terminalType,
                               tmpi_hipDev,
                               tmpi_numaId,
                               busIdStr_i,
                               nodes[j]->id,
                               tmpj_interRank,
                               tmpj_deviceValue,
                               tmpj_terminalType,
                               tmpj_hipDev,
                               tmpj_numaId,
                               busIdStr_j);
#endif
                    }
                }
            }
        }
    }
    // 3. Scan all node info again and connect every pair of each CPU node's neighbors.
    for(size_t i = 0; i < node_info_span.size(); ++i) {
        scclTopoNode_t* node = node_info_span[i];
        if(node->type == CPU) {
            // NOTE(review): the inner loop variable `i` shadows the outer scan index `i`;
            // harmless here (the outer index is not used inside), but worth renaming.
            for(size_t i = 0; i < node->neighborCount; ++i) {
                for(size_t j = i + 1; j < node->neighborCount; ++j) {
                    // Use the id->nodes map to speed up the lookup.
                    auto it_i = nodes_map_by_deviceId.find(node->neighbors[i]);
                    auto it_j = nodes_map_by_deviceId.find(node->neighbors[j]);
                    if(it_i != nodes_map_by_deviceId.end() && it_j != nodes_map_by_deviceId.end()) {
                        scclTopoNode_t* neighbor_i = nullptr;
                        scclTopoNode_t* neighbor_j = nullptr;
                        // Pick the first entry with a valid type for each neighbor id.
                        for(auto& n : it_i->second) {
                            if(n->type > 0) {
                                neighbor_i = n;
                                break;
                            }
                        }
                        for(auto& n : it_j->second) {
                            if(n->type > 0) {
                                neighbor_j = n;
                                break;
                            }
                        }
                        if(neighbor_i && neighbor_j) {
                            neighbor_i->addNeighbor(neighbor_j->id);
                            neighbor_j->addNeighbor(neighbor_i->id);
#if 0
                            {
                                int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                                physical_links::getIdComponents(
                                    neighbor_i->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                                int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                                physical_links::getIdComponents(
                                    neighbor_j->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                                char busIdStr_i[17], busIdStr_j[17];
                                int64ToBusId(neighbor_i->busId, busIdStr_i);
                                int64ToBusId(neighbor_j->busId, busIdStr_j);
                                printf("connect CPU neighbors %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, "
                                       "N:%d, busIdStr:%s)\n",
                                       neighbor_i->id,
                                       tmpi_interRank,
                                       tmpi_deviceValue,
                                       tmpi_terminalType,
                                       tmpi_hipDev,
                                       tmpi_numaId,
                                       busIdStr_i,
                                       neighbor_j->id,
                                       tmpj_interRank,
                                       tmpj_deviceValue,
                                       tmpj_terminalType,
                                       tmpj_hipDev,
                                       tmpj_numaId,
                                       busIdStr_j);
                            }
#endif
                        }
                    }
                }
            }
        }
    }
    // 4. Connect every pair of GPU nodes that share the same interRank.
    for(const auto& nodes : gpu_nodes_by_interRank) {
        for(size_t i = 0; i < nodes.size(); ++i) {
            for(size_t j = i + 1; j < nodes.size(); ++j) {
                nodes[i]->addNeighbor(nodes[j]->id);
                nodes[j]->addNeighbor(nodes[i]->id);
#if 0
                {
                    int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                    physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                    int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                    physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                    char busIdStr_i[17], busIdStr_j[17];
                    int64ToBusId(nodes[i]->busId, busIdStr_i);
                    int64ToBusId(nodes[j]->busId, busIdStr_j);
                    printf("connect GPU %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s)\n",
                           nodes[i]->id,
                           tmpi_interRank,
                           tmpi_deviceValue,
                           tmpi_terminalType,
                           tmpi_hipDev,
                           tmpi_numaId,
                           busIdStr_i,
                           nodes[j]->id,
                           tmpj_interRank,
                           tmpj_deviceValue,
                           tmpj_terminalType,
                           tmpj_hipDev,
                           tmpj_numaId,
                           busIdStr_j);
                }
#endif
            }
        }
    }
    // 5. Connect every pair of NIC nodes that share the same device signature.
    for(const auto& pair : nic_nodes_by_deviceId) {
        const auto& nodes = pair.second;
        for(size_t i = 0; i < nodes.size(); ++i) {
            for(size_t j = i + 1; j < nodes.size(); ++j) {
                // Within the same device signature, also require matching busId.
                if(nodes[i]->busId == nodes[j]->busId) {
                    nodes[i]->addNeighbor(nodes[j]->id);
                    nodes[j]->addNeighbor(nodes[i]->id);
#if 0
                    {
                        int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                        physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                        int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                        physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                        char busIdStr_i[17], busIdStr_j[17];
                        int64ToBusId(nodes[i]->busId, busIdStr_i);
                        int64ToBusId(nodes[j]->busId, busIdStr_j);
                        printf("connect NIC interRank %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, "
                               "busIdStr:%s)\n",
                               nodes[i]->id,
                               tmpi_interRank,
                               tmpi_deviceValue,
                               tmpi_terminalType,
                               tmpi_hipDev,
                               tmpi_numaId,
                               busIdStr_i,
                               nodes[j]->id,
                               tmpj_interRank,
                               tmpj_deviceValue,
                               tmpj_terminalType,
                               tmpj_hipDev,
                               tmpj_numaId,
                               busIdStr_j);
                    }
#endif
                }
            }
        }
    }
    return scclSuccess;
}
}   // namespace graph
}   // namespace topology
}   // namespace hardware
}
// namespace graph
}
// namespace topology
}
// namespace hardware
...
...
src/hardware/topology/graph/graph.h
View file @
58d57301
...
...
@@ -3,6 +3,7 @@
#include <vector>
#include "base.h"
#include "graph_utils.h"
#include "paths.h"
namespace
sccl
{
namespace
hardware
{
...
...
@@ -11,23 +12,37 @@ namespace graph {
class
Graph
{
public:
Graph
(
int
rank
,
int
nRanks
);
Graph
(
const
Bootstrap
*
bootstrap
);
virtual
~
Graph
();
scclResult_t
establishGraph
(
const
BootstrapComm_t
*
bootstrap_comm
);
// 通信路径计算
scclResult_t
calculateCommunicationPaths
(
const
BootstrapComm_t
*
bootstrap_comm
,
scclTopoGraph_t
*
topo_graph
,
Bootstrap
*
sccl_bootstrap
);
scclResult_t
calculateCommunicationPaths
(
const
BootstrapComm_t
*
bootstrap_comm
,
scclTopoGraph_t
*
topo_graph
);
// 逻辑拓扑构建
scclResult_t
build
LogicalTopology
();
scclResult_t
search
LogicalTopology
();
// 根据无向图计算topo路径
scclResult_t
calculateTopoChannels
();
private:
// 额外处理nRanks个nodes的连接关系
scclResult_t
bootstrapNodesLink
(
void
*
node_info_vec
,
int
node_info_total_bytes
);
private:
const
Bootstrap
*
sccl_bootstrap
;
// 为了调用class Bootstrap中的函数
// 记录所有rank中node信息
std
::
vector
<
char
>
node_info_vec
;
// 实际为std::vector<scclNodeInfo_t>,vector不支持scclNodeInfo_t变长
size_t
node_info_total_bytes
=
0
;
// 记录可变长度scclNodeInfo_t类型数据的实际大小
std
::
vector
<
std
::
vector
<
int
>>
adjacencyMatrix
;
// 使用邻接矩阵表示图
// 你可以根据需要添加更多的私有成员变量和函数
// rank信息
int
rank
,
nRanks
;
int
localRank
,
nLocalRanks
;
int
interRank
,
nInterRanks
;
// 整个节点在全部节点中的位置
};
}
// namespace graph
...
...
src/hardware/topology/graph/graph_utils.cpp
0 → 100644
View file @
58d57301
#include <string.h>
#include "graph_utils.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
graph
{
// Build a transport matrix for nRanks ranks: one byte per (src, dst) rank pair,
// zero-initialized. Throws std::bad_alloc if the backing storage cannot be allocated.
scclTopoGraph::scclTopoGraph(int nRanks) : nRanks(nRanks), transport_map(nullptr, 0) {
    const int cells = nRanks * nRanks;
    // calloc zero-fills the matrix so every pair starts as "no transport recorded".
    auto* storage = static_cast<uint8_t*>(calloc(cells, sizeof(uint8_t)));
    if(storage == nullptr) {
        // Surface allocation failure to the caller rather than continuing with a null map.
        throw std::bad_alloc();
    }
    // Hand the raw buffer to the ByteSpanArray view exposed as transport_map.
    transport_map = ByteSpanArray<uint8_t>(storage, cells);
}
// Release the raw buffer allocated in the constructor with calloc; transport_map is
// only a non-owning view over that memory.
scclTopoGraph::~scclTopoGraph() {
    free(transport_map.data());
}
// Dump the full nRanks x nRanks transport matrix to stdout, one row per line;
// cells whose lookup returns a null pointer are printed as "nullptr".
scclResult_t scclTopoGraph::printTransportMap() {
    for(int row = 0; row < this->nRanks; ++row) {
        for(int col = 0; col < this->nRanks; ++col) {
            uint8_t* cell = this->getTransportMapData(row, col);
            if(cell == nullptr) {
                printf("nullptr ");
            } else {
                printf("%d ", *cell);
            }
        }
        printf("\n");
    }
    return scclSuccess;
}
// Print the contents of gpu_paths: for every recorded (start, end) GPU pair, print the
// two endpoint nodes and then each stored path node-by-node. Node details are resolved
// through graph_nodes; ids missing from graph_nodes are reported and skipped.
scclResult_t scclTopoGraph::printGPUPaths() {
    for(const auto& start_pair : gpu_paths) {
        uint64_t start_node_id = start_pair.first;
        // Resolve the start node so its details can be printed.
        auto start_node_it = graph_nodes.find(start_node_id);
        if(start_node_it != graph_nodes.end()) {
            std::cout << "Paths starting from node: ";
            start_node_it->second.printNodeInfo("Start Node");
        } else {
            std::cout << "Start node ID " << start_node_id << " not found in graph nodes." << std::endl;
            continue;
        }
        for(const auto& end_pair : start_pair.second) {
            uint64_t end_node_id = end_pair.first;
            // Resolve the end node of this path group.
            auto end_node_it = graph_nodes.find(end_node_id);
            if(end_node_it != graph_nodes.end()) {
                std::cout << " to node: ";
                end_node_it->second.printNodeInfo("End Node");
            } else {
                std::cout << " End node ID " << end_node_id << " not found in graph nodes." << std::endl;
                continue;
            }
            std::cout << " Paths:" << std::endl;
            // Print every stored path between this (start, end) pair.
            for(const auto& path : end_pair.second) {
                std::cout << " Path: ";
                for(const auto& node_id : path) {
                    auto node_it = graph_nodes.find(node_id);
                    if(node_it != graph_nodes.end()) {
                        node_it->second.printNodeInfo(" ");
                    } else {
                        std::cout << " Node ID " << node_id << " not found in graph nodes." << std::endl;
                    }
                }
                std::cout << std::endl;
            }
        }
    }
    return scclSuccess;
}
}
// namespace graph
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/topology/graph/graph_utils.h
View file @
58d57301
...
...
@@ -3,16 +3,41 @@
#include <string.h>
#include "base.h"
#include "bootstrap.h"
#include "physical_links.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
graph
{
typedef
bootstrap
::
physical_links
::
scclTopoNode_t
scclTopoNode_t
;
typedef
bootstrap
::
scclNodeInfo_t
scclNodeInfo
_t
;
typedef
physical_links
::
scclTopoNode_t
scclTopoNode
_t
;
typedef
bootstrap
::
BootstrapComm_t
BootstrapComm_t
;
typedef
topology
::
bootstrap
::
Bootstrap
Bootstrap
;
// scclNodeInfo stores one rank's graph-connection information.
// TODO: each rank currently needs 4K+ bytes of node_info; with many devices the footprint
// grows large — consider shrinking it or not keeping it as a global.
typedef struct scclNodeInfo {
    scclTopoNode_t* nodes;   // owning pointer to a malloc'd array of scclTopoNode_t
    int nLocalRanks;         // number of local ranks this info is sized for
    int totalByteSize;       // total number of bytes held by `nodes`

    // Sizing constructor: allocates room for topoNodeMaxLocalNodes / nLocalRanks entries
    // and zero-fills the buffer. Caller must pass nLocalRanks > 0 (divisor).
    scclNodeInfo(int nLocalRanks)
        : nodes(nullptr), nLocalRanks(nLocalRanks),
          totalByteSize(sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks) {
        nodes = reinterpret_cast<scclTopoNode_t*>(malloc(totalByteSize));
        if(nodes) {
            memset(nodes, 0, totalByteSize);
        } else {
            // Fixed: keep the reported size consistent with the (absent) allocation so
            // callers cannot copy totalByteSize bytes into a null buffer.
            totalByteSize = 0;
        }
    }

    // Fixed: the struct owns a raw malloc'd buffer, so an implicit copy would lead to a
    // double free in the destructor — forbid copying (moves are suppressed as well).
    scclNodeInfo(const scclNodeInfo&) = delete;
    scclNodeInfo& operator=(const scclNodeInfo&) = delete;

    // Destructor releases the owned array.
    virtual ~scclNodeInfo() {
        if(nodes) {
            free(nodes);
        }
    }
} scclNodeInfo_t;
//////////////////////////////////////////////////////////////////////////////////////////////////
// 定义 topoPathType_t 枚举类型,用于表示不同的路径类型。
typedef
enum
topoPathType
{
PATH_LOC
=
0
,
// 本地路径
...
...
@@ -20,7 +45,7 @@ typedef enum topoPathType {
PATH_NVB
=
2
,
// 通过中间 GPU 使用 NVLink 连接
PATH_PIX
=
3
,
// 通过最多一个 PCIe 桥连接
PATH_PXB
=
4
,
// 通过多个 PCIe 桥连接(不经过 PCIe 主桥)
PATH_PXN
=
5
,
// GPU 和 NIC 之间通过中间 GPU 连接
PATH_PXN
=
5
,
// GPU 和 NIC 之间通过中间 GPU 连接
, PXN = PCI + NVLink
PATH_PHB
=
6
,
// 通过 PCIe 以及 PCIe 主桥连接
PATH_SYS
=
7
,
// 通过 PCIe 以及 NUMA 节点之间的 SMP 互连连接
PATH_NET
=
8
,
// 通过网络连接
...
...
@@ -39,44 +64,22 @@ typedef enum LinkType : uint8_t {
typedef
struct
scclTopoGraph
{
scclTopoGraph
()
=
delete
;
// 删除默认构造函数
scclTopoGraph
(
int
nRanks
)
:
nRanks
(
nRanks
),
transport_map
(
nullptr
,
0
)
{
// 分配transport_map的内存
uint8_t
*
raw_transport_map
=
static_cast
<
uint8_t
*>
(
calloc
(
nRanks
*
nRanks
,
sizeof
(
uint8_t
)));
if
(
raw_transport_map
==
nullptr
)
{
// 处理内存分配失败的情况
throw
std
::
bad_alloc
();
}
// 使用ByteSpanArray初始化transport_map
transport_map
=
ByteSpanArray
<
uint8_t
>
(
raw_transport_map
,
nRanks
*
nRanks
);
}
virtual
~
scclTopoGraph
()
{
// 释放transport_map的内存
free
(
transport_map
.
data
());
}
scclTopoGraph
(
int
nRanks
);
virtual
~
scclTopoGraph
();
uint8_t
*
getTransportMapRowStart
(
int
row
)
{
return
transport_map
[
row
*
nRanks
];
}
uint8_t
*
getTransportMapData
(
int
row
,
int
col
)
{
return
transport_map
[
row
*
nRanks
+
col
];
}
// 打印transport_map
scclResult_t
printTransportMap
()
{
for
(
int
i
=
0
;
i
<
this
->
nRanks
;
++
i
)
{
for
(
int
j
=
0
;
j
<
this
->
nRanks
;
++
j
)
{
uint8_t
*
value
=
this
->
getTransportMapData
(
i
,
j
);
if
(
value
!=
nullptr
)
{
printf
(
"%d "
,
*
value
);
}
else
{
printf
(
"nullptr "
);
}
}
printf
(
"
\n
"
);
}
return
scclSuccess
;
}
scclResult_t
printTransportMap
();
// 打印gpu_paths信息的函数
scclResult_t
printGPUPaths
();
public:
// 使用无序映射存储图的有效节点
std
::
unordered_map
<
uint64_t
,
scclTopoNode_t
>
graph_nodes
;
// 使用无序映射存储从每个GPU节点到其他GPU节点的所有路径,[start_node_id][end_node_id] = {path1, path2}
// 使用无序映射存储从每个GPU节点到其他GPU节点的所有路径,[start_node_id][end_node_id] = {path1, path2
, ...
}
std
::
unordered_map
<
uint64_t
,
std
::
unordered_map
<
uint64_t
,
std
::
vector
<
std
::
vector
<
uint64_t
>>>>
gpu_paths
;
// 传输位图
...
...
src/hardware/topology/graph/paths.cpp
View file @
58d57301
...
...
@@ -6,15 +6,15 @@ namespace hardware {
namespace
topology
{
namespace
graph
{
PathFinder
::
PathFinder
(
const
BootstrapComm_t
*
bootstrap_comm
)
PathFinder
::
PathFinder
(
const
BootstrapComm_t
*
bootstrap_comm
,
std
::
vector
<
char
>&
node_info_vec
,
size_t
node_info_total_bytes
)
:
rank
(
bootstrap_comm
->
rank
),
nRanks
(
bootstrap_comm
->
nRanks
),
localRank
(
bootstrap_comm
->
localRank
),
nLocalRanks
(
bootstrap_comm
->
nLocalRanks
),
interRank
(
bootstrap_comm
->
interRank
),
nInterRanks
(
bootstrap_comm
->
nInterRanks
),
node_container_
(
bootstrap_comm
->
r
ank
_phys_set
->
node_info_vec
.
data
(),
bootstrap_comm
->
nRanks
*
bootstrap_comm
->
rank_phys_set
->
node_info_total_bytes
)
{
// 初始化NodeContainer对象
node_container_
(
node_info_vec
.
data
(),
bootstrap_comm
->
nR
ank
s
*
node_info_total_bytes
)
{
// 初始化NodeContainer对象
printf
(
"get PathFinder, node_container_=%zu
\n
"
,
node_container_
.
size
());
for
(
size_t
i
=
0
;
i
<
node_container_
.
size
();
++
i
)
{
scclTopoNode_t
*
node
=
node_container_
[
i
];
...
...
@@ -36,7 +36,7 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
const scclTopoNode_t* node = node_container_[index];
int interRank, deviceValue, terminalType, hipDev, numaId;
bootstrap::
physical_links::getIdComponents(node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(node_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
char busIdStr[17];
int64ToBusId(node->busId, busIdStr);
printf("rank=%d, node=(InterRank:%d, V:%d, T:%d, H:%d, N:%d, type:%d, busIdStr:%s), neighbor_count=%zu",
...
...
@@ -54,7 +54,7 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
uint64_t neighbor_id = node->neighbors[n];
const scclTopoNode_t* neighbor_node = findNodeById(neighbor_id);
if(neighbor_node) {
bootstrap::
physical_links::getIdComponents(neighbor_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
physical_links::getIdComponents(neighbor_id, &interRank, &deviceValue, &terminalType, &hipDev, &numaId);
int64ToBusId(neighbor_node->busId, busIdStr);
printf(", neighbor[%d]=(InterRank:%d, V:%d, T:%d, H:%d, N:%d, type:%d, busIdStr:%s)",
...
...
@@ -75,10 +75,36 @@ PathFinder::PathFinder(const BootstrapComm_t* bootstrap_comm)
}
#endif
// 查找当前rank对应的其他GPU节点的所有路径
printf
(
"PathFinder pos 1
\n
"
);
findGpuPaths
();
printf
(
"PathFinder pos 2
\n
"
);
// 查找当前rank对应的GPU的node,并执行BFS搜索,查找到其他所有GPU node的路径
for
(
const
auto
&
pair
:
id_to_index_
)
{
uint64_t
id
=
pair
.
first
;
size_t
index
=
pair
.
second
;
// 定位到node
scclTopoNode_t
*
node
=
node_container_
[
index
];
int
nodeInterRank
,
nodeHipDev
;
physical_links
::
getIdComponents
(
node
->
id
,
&
nodeInterRank
,
nullptr
,
nullptr
,
&
nodeHipDev
,
nullptr
);
if
(
node
->
type
==
GPU
&&
nodeInterRank
==
this
->
interRank
&&
nodeHipDev
==
this
->
localRank
)
{
// printf("bfsFindGpuPaths start_node_id=%lu, running\n", node->id);
bfsFindGpuPaths
(
node
->
id
);
}
}
#if 1
if
(
rank
==
1
)
{
printGpuPaths
();
}
#endif
}
/**
 * @brief Derive the global gpu_rank from a packed node id.
 *
 * Decodes interRank and hipDev from the node id via physical_links::getIdComponents and
 * combines them as interRank * nLocalRanks + hipDev.
 *
 * @param node_id     packed topology node id
 * @param nLocalRanks number of ranks per node
 * @return the global GPU rank for this node id
 */
int getGpuRankFromNodeId(uint64_t node_id, int nLocalRanks) {
    int interRank, hipDev;
    // Extract interRank and hipDev; the other id components are not needed here.
    physical_links::getIdComponents(node_id, &interRank, nullptr, nullptr, &hipDev, nullptr);
    // Fixed: removed the unconditional debug printf that fired on every call — this helper
    // runs once per path endpoint and was flooding stdout on large jobs.
    return interRank * nLocalRanks + hipDev;
}
/**
...
...
@@ -124,84 +150,68 @@ scclResult_t PathFinder::computeTopoGpuP2pMap(scclTopoGraph_t* topo_graph) {
// 记录bitmap
LinkType_t
link_type
;
int
start_gpu_rank
,
end_gpu_rank
;
{
// 根据路径中途径的节点点确定连接方式的类型
SCCLCHECK
(
determineLinkType
(
path
,
&
link_type
));
// 获取gpu的rank
int
start_gpu_rank
=
getGpuRankFromNodeId
(
start_node_id
,
nLocalRanks
);
int
end_gpu_rank
=
getGpuRankFromNodeId
(
end_node_id
,
nLocalRanks
);
int
start_interRank
,
start_hipDev
;
int
end_interRank
,
end_hipDev
;
bootstrap
::
physical_links
::
getIdComponents
(
start_node_id
,
&
start_interRank
,
nullptr
,
nullptr
,
&
start_hipDev
,
nullptr
);
bootstrap
::
physical_links
::
getIdComponents
(
end_node_id
,
&
end_interRank
,
nullptr
,
nullptr
,
&
end_hipDev
,
nullptr
);
start_gpu_rank
=
start_interRank
*
nLocalRanks
+
start_hipDev
;
end_gpu_rank
=
end_interRank
*
nLocalRanks
+
end_hipDev
;
#if 0
printf("rank=%d, interRank=%d, localRank=%d: start_interRank=%d, start_hipDev=%d, end_interRank=%d, end_hipDev=%d, link_type=%d\n",
rank,
interRank,
localRank,
start_interRank,
start_hipDev,
end_interRank,
end_hipDev,
static_cast<int>(link_type));
#endif
}
// 查找transport_map中的起始和结束节点
uint8_t
*
transport_map_pt
=
topo_graph
->
getTransportMapData
(
start_gpu_rank
,
end_gpu_rank
);
// 将连接方式的类型存储在transport_map中
if
(
*
(
topo_graph
->
getTransportMapData
(
start_gpu_rank
,
end_gpu_rank
))
>
0
&&
link_type
>
0
)
{
if
(
link_type
<
static_cast
<
LinkType_t
>
(
*
(
topo_graph
->
getTransportMapData
(
start_gpu_rank
,
end_gpu_rank
))
))
{
*
(
topo_graph
->
getTransportMapData
(
start_gpu_rank
,
end_gpu_rank
))
=
link_type
;
if
(
*
transport_map_pt
>
0
&&
link_type
>
0
)
{
if
(
link_type
<
static_cast
<
LinkType_t
>
(
*
transport_map_pt
))
{
*
transport_map_pt
=
link_type
;
// 清空之前的路径
topo_graph
->
gpu_paths
[
start_node_id
][
end_node_id
].
clear
();
// 添加新的路径
topo_graph
->
gpu_paths
[
start_node_id
][
end_node_id
].
push_back
(
path
);
}
else
if
(
link_type
==
static_cast
<
LinkType_t
>
(
*
(
topo_graph
->
getTransportMapData
(
start_gpu_rank
,
end_gpu_rank
))
))
{
}
else
if
(
link_type
==
static_cast
<
LinkType_t
>
(
*
transport_map_pt
))
{
// 添加新的路径
topo_graph
->
gpu_paths
[
start_node_id
][
end_node_id
].
push_back
(
path
);
}
}
else
{
*
(
topo_graph
->
getTransportMapData
(
start_gpu_rank
,
end_gpu_rank
))
=
static_cast
<
uint8_t
>
(
link_type
);
*
transport_map_pt
=
static_cast
<
uint8_t
>
(
link_type
);
// 添加新的路径
topo_graph
->
gpu_paths
[
start_node_id
][
end_node_id
].
push_back
(
path
);
}
#if 0
{
char start_busIdStr[17] = ""; // 用于存储总线ID字符串
// 根据起始节点的ID查找对应的节点对象
const scclTopoNode_t* start_node = findNodeById(start_node_id);
// 如果找到了对应的节点对象,则将其总线ID转换为字符串
if(start_node) {
int64ToBusId(start_node->busId, start_busIdStr);
}
char end_busIdStr[17] = ""; // 用于存储总线ID字符串
// 根据起始节点的ID查找对应的节点对象
const scclTopoNode_t* end_node = findNodeById(end_node_id);
// 如果找到了对应的节点对象,则将其总线ID转换为字符串
if(end_node) {
int64ToBusId(end_node->busId, end_busIdStr);
}
return
scclSuccess
;
}
/////////////////////////////////////////////////////////////////////////////////////////////
/**
* @brief 查找当前rank对应的其他GPU节点的所有路径
*
* 该函数用于查找当前rank对应的GPU节点的所有路径。它遍历`id_to_index_`中的所有节点ID和索引对,
* 对于每一个节点,如果该节点是GPU类型,并且属于当前rank的进程,则调用`bfsFindGpuPaths`函数执行广度优先搜索(BFS),
* 查找到其他所有GPU节点的路径。最后,如果当前rank为1,则调用`printGpuPaths`函数打印所有GPU路径。
*/
void
PathFinder
::
findGpuPaths
()
{
// 查找当前rank对应的GPU的node,并执行BFS搜索,查找到其他所有GPU node的路径
for
(
const
auto
&
pair
:
id_to_index_
)
{
uint64_t
id
=
pair
.
first
;
size_t
index
=
pair
.
second
;
// 定位到node
scclTopoNode_t
*
node
=
node_container_
[
index
];
int
nodeInterRank
,
nodeHipDev
;
bootstrap
::
physical_links
::
getIdComponents
(
node
->
id
,
&
nodeInterRank
,
nullptr
,
nullptr
,
&
nodeHipDev
,
nullptr
);
if
(
node
->
type
==
GPU
&&
nodeInterRank
==
this
->
interRank
&&
nodeHipDev
==
this
->
localRank
)
{
// printf("bfsFindGpuPaths start_node_id=%lu, running\n", node->id);
bfsFindGpuPaths
(
node
->
id
);
printf("nLocalRanks=%d, start_node_id=%lu, busIdStr=%s, end_node_id=%lu, busIdStr=%s\n"
"start_gpu_rank: %d, end_gpu_rank: %d, link_type: %d, paths count: %zu\n",
nLocalRanks,
start_node_id,
start_busIdStr,
end_node_id,
end_busIdStr,
start_gpu_rank,
end_gpu_rank,
*(topo_graph->getTransportMapData(start_gpu_rank, end_gpu_rank)),
topo_graph->gpu_paths[start_node_id][end_node_id].size());
}
#endif
}
#if 1
if
(
rank
==
1
)
{
printGpuPaths
();
}
#endif
return
scclSuccess
;
}
/////////////////////////////////////////////////////////////////////////////////////////////
/**
* @brief 根据节点ID查找节点
*
...
...
@@ -231,7 +241,6 @@ const scclTopoNode_t* PathFinder::findNodeById(uint64_t id) const {
*
* @param start_node_id 起始GPU节点的ID
*/
#if 1
void
PathFinder
::
bfsFindGpuPaths
(
uint64_t
start_node_id
)
{
// 使用一个队列来存储当前路径
std
::
queue
<
std
::
vector
<
uint64_t
>>
queue
;
...
...
@@ -259,14 +268,14 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
// 如果当前节点是GPU节点且不是起始节点,则将当前路径加入结果
if
(
current_node
->
type
==
GPU
&&
nodeId
!=
start_node_id
)
{
int
hipDev
;
bootstrap
::
physical_links
::
getIdComponents
(
current_node
->
id
,
nullptr
,
nullptr
,
nullptr
,
&
hipDev
,
nullptr
);
physical_links
::
getIdComponents
(
current_node
->
id
,
nullptr
,
nullptr
,
nullptr
,
&
hipDev
,
nullptr
);
// 仅当节点内的device id小于等于nLocalRanks时,才是有效GPU,才将路径加入结果
if
(
hipDev
<
nLocalRanks
)
{
gpu_paths_
[
start_node_id
].
push_back
(
path
);
}
}
else
{
int
nodeInterRank
;
bootstrap
::
physical_links
::
getIdComponents
(
nodeId
,
&
nodeInterRank
);
physical_links
::
getIdComponents
(
nodeId
,
&
nodeInterRank
);
// 遍历当前节点的所有邻居节点
for
(
uint64_t
neighbor_id
:
graph_node_neighbors_
.
at
(
nodeId
))
{
if
(
findNodeById
(
neighbor_id
)
==
nullptr
)
{
...
...
@@ -274,7 +283,7 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
}
// 获取邻居节点的interRank
int
neighbor_inter_rank
;
bootstrap
::
physical_links
::
getIdComponents
(
neighbor_id
,
&
neighbor_inter_rank
);
physical_links
::
getIdComponents
(
neighbor_id
,
&
neighbor_inter_rank
);
// 检查邻居节点是否已在当前路径中访问过
bool
visited
=
std
::
find
(
path
.
begin
(),
path
.
end
(),
neighbor_id
)
!=
path
.
end
();
...
...
@@ -302,141 +311,6 @@ void PathFinder::bfsFindGpuPaths(uint64_t start_node_id) {
}
}
#else
void
PathFinder
::
bfsFindGpuPaths
(
uint64_t
start_node_id
)
{
// 使用一个队列来存储当前路径
std
::
queue
<
std
::
vector
<
uint64_t
>>
queue
;
// 使用一个unordered_map来存储每个node的最短路径
std
::
unordered_map
<
uint64_t
,
std
::
vector
<
uint64_t
>>
shortest_paths
;
// 将起始节点加入队列
queue
.
push
({
start_node_id
});
shortest_paths
[
start_node_id
]
=
{
start_node_id
};
// 当队列不为空时,继续搜索
while
(
!
queue
.
empty
())
{
// 从队列中取出一个路径
auto
path
=
queue
.
front
();
queue
.
pop
();
// 获取当前路径的最后一个节点的ID
uint64_t
nodeId
=
path
.
back
();
// 根据节点ID查找对应的节点
const
scclTopoNode_t
*
current_node
=
findNodeById
(
nodeId
);
if
(
current_node
==
nullptr
)
{
continue
;
}
// 如果当前节点是GPU节点且不是起始节点,则将当前路径加入结果
if
(
current_node
->
type
==
GPU
&&
nodeId
!=
start_node_id
)
{
int
hipDev
;
bootstrap
::
physical_links
::
getIdComponents
(
current_node
->
id
,
nullptr
,
nullptr
,
nullptr
,
&
hipDev
,
nullptr
);
if
(
hipDev
<
nLocalRanks
)
{
gpu_paths_
[
start_node_id
].
push_back
(
path
);
}
}
else
{
int
nodeInterRank
;
bootstrap
::
physical_links
::
getIdComponents
(
nodeId
,
&
nodeInterRank
);
// 遍历当前节点的所有邻居节点
for
(
uint64_t
neighbor_id
:
graph_node_neighbors_
.
at
(
nodeId
))
{
if
(
findNodeById
(
neighbor_id
)
==
nullptr
)
{
continue
;
}
// 获取邻居节点的interRank
int
neighbor_inter_rank
;
bootstrap
::
physical_links
::
getIdComponents
(
neighbor_id
,
&
neighbor_inter_rank
);
// 检查邻居节点是否已在当前路径中访问过
bool
visited
=
std
::
find
(
path
.
begin
(),
path
.
end
(),
neighbor_id
)
!=
path
.
end
();
// 检查interRank是否已经存在(仅当interRank改变时)
bool
inter_rank_exists
=
false
;
if
(
neighbor_inter_rank
!=
nodeInterRank
)
{
for
(
uint64_t
node_id
:
path
)
{
if
(
node_id
==
neighbor_id
)
{
inter_rank_exists
=
true
;
break
;
}
}
}
// 如果邻居节点未访问过且interRank未存在,则扩展路径
if
(
!
visited
&&
!
inter_rank_exists
)
{
std
::
vector
<
uint64_t
>
new_path
=
path
;
new_path
.
push_back
(
neighbor_id
);
// 如果新路径比已有的最短路径更短,则更新最短路径
if
(
shortest_paths
.
find
(
neighbor_id
)
==
shortest_paths
.
end
()
||
shortest_paths
[
neighbor_id
].
size
()
>
new_path
.
size
())
{
shortest_paths
[
neighbor_id
]
=
new_path
;
queue
.
push
(
new_path
);
}
}
}
}
}
}
void
PathFinder
::
bfsFindGpuPaths
(
uint64_t
start_node_id
)
{
// 使用一个队列来存储当前路径
std
::
queue
<
std
::
vector
<
uint64_t
>>
queue
;
// 将起始节点加入队列
queue
.
push
({
start_node_id
});
// 当队列不为空时,继续搜索
while
(
!
queue
.
empty
())
{
// 从队列中取出一个路径
auto
path
=
queue
.
front
();
queue
.
pop
();
// 获取当前路径的最后一个节点的ID
uint64_t
nodeId
=
path
.
back
();
// 根据节点ID查找对应的节点
const
scclTopoNode_t
*
current_node
=
findNodeById
(
nodeId
);
if
(
current_node
==
nullptr
)
{
continue
;
}
// 如果当前节点是GPU节点且不是起始节点,则将当前路径加入结果
if
(
current_node
->
type
==
GPU
&&
nodeId
!=
start_node_id
)
{
int
hipDev
;
bootstrap
::
physical_links
::
getIdComponents
(
current_node
->
id
,
nullptr
,
nullptr
,
nullptr
,
&
hipDev
,
nullptr
);
if
(
hipDev
<
nLocalRanks
)
{
gpu_paths_
[
start_node_id
].
push_back
(
path
);
}
}
else
{
int
nodeInterRank
;
bootstrap
::
physical_links
::
getIdComponents
(
nodeId
,
&
nodeInterRank
);
// 遍历当前节点的所有邻居节点
for
(
uint64_t
neighbor_id
:
graph_node_neighbors_
.
at
(
nodeId
))
{
if
(
findNodeById
(
nodeId
)
==
nullptr
)
{
continue
;
}
// 获取邻居节点的interRank
int
neighbor_inter_rank
;
bootstrap
::
physical_links
::
getIdComponents
(
neighbor_id
,
&
neighbor_inter_rank
);
// 检查邻居节点是否已在当前路径中访问过
bool
visited
=
std
::
find
(
path
.
begin
(),
path
.
end
(),
neighbor_id
)
!=
path
.
end
();
// 检查interRank是否已经存在(仅当interRank改变时)
bool
inter_rank_exists
=
false
;
if
(
neighbor_inter_rank
!=
(
nodeInterRank
))
{
for
(
uint64_t
node_id
:
path
)
{
if
((
nodeInterRank
)
==
neighbor_inter_rank
)
{
inter_rank_exists
=
true
;
break
;
}
}
}
// 如果邻居节点未访问过且interRank未存在,则扩展路径
if
(
!
visited
&&
!
inter_rank_exists
)
{
std
::
vector
<
uint64_t
>
new_path
=
path
;
new_path
.
push_back
(
neighbor_id
);
queue
.
push
(
new_path
);
}
}
}
}
}
#endif
/**
* @brief 打印GPU路径信息
*
...
...
@@ -463,7 +337,7 @@ void PathFinder::printGpuPaths() {
int
interRank
,
deviceValue
,
terminalType
,
hipDev
,
numaId
;
// 根据起始节点的ID获取其interRank、deviceValue、terminalType和numaId
bootstrap
::
physical_links
::
getIdComponents
(
start_node_id
,
&
interRank
,
&
deviceValue
,
&
terminalType
,
&
hipDev
,
&
numaId
);
physical_links
::
getIdComponents
(
start_node_id
,
&
interRank
,
&
deviceValue
,
&
terminalType
,
&
hipDev
,
&
numaId
);
printf
(
"GPU node ID:%lu (InterRank:%d, V:%d, T:%d, H:%d, N:%d) (Path count: %zu)
\n
"
,
start_node_id
,
interRank
,
...
...
@@ -486,7 +360,7 @@ void PathFinder::printGpuPaths() {
const
scclTopoNode_t
*
node
=
findNodeById
(
node_id
);
if
(
node
)
{
// 根据节点的ID获取其interRank、deviceValue、terminalType和numaId
bootstrap
::
physical_links
::
getIdComponents
(
node
->
id
,
&
interRank
,
&
deviceValue
,
&
terminalType
,
&
hipDev
,
&
numaId
);
physical_links
::
getIdComponents
(
node
->
id
,
&
interRank
,
&
deviceValue
,
&
terminalType
,
&
hipDev
,
&
numaId
);
// 将节点的总线ID转换为字符串
int64ToBusId
(
node
->
busId
,
busIdStr
);
// 打印节点的信息,包括其interRank、deviceValue、terminalType、numaId、类型和总线ID字符串
...
...
src/hardware/topology/graph/paths.h
View file @
58d57301
...
...
@@ -13,21 +13,21 @@ namespace hardware {
namespace
topology
{
namespace
graph
{
// 设置Path路径直接link的 bandwidth 和 speed
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class
PathFinder
{
public:
// 构造函数
PathFinder
(
const
BootstrapComm_t
*
bootstrap_comm
);
PathFinder
(
const
BootstrapComm_t
*
bootstrap_comm
,
std
::
vector
<
char
>&
node_info_vec
,
size_t
node_info_total_bytes
);
// 计算拓扑图中GPU节点之间的点对点映射
scclResult_t
computeTopoGpuP2pMap
(
scclTopoGraph_t
*
graph
);
// 计算拓扑图中GPU节点之间的点对点映射
,结果保存在graph中
scclResult_t
computeTopoGpuP2pMap
(
scclTopoGraph_t
*
topo_
graph
);
// 打印函数
void
printGpuPaths
();
private:
// 获取所有GPU到GPU的路径函数
void
findGpuPaths
();
// 使用广度优先搜索(BFS)查找从起始GPU节点到其他GPU节点的最短路径
void
bfsFindGpuPaths
(
uint64_t
start_node_id
);
...
...
@@ -53,6 +53,9 @@ private:
int
nInterRanks
=
0
;
// 全局拥有节点的个数
};
// 根据 node_id 获取 gpu_rank
int
getGpuRankFromNodeId
(
uint64_t
node_id
,
int
nLocalRanks
);
}
// namespace graph
}
// namespace topology
}
// namespace hardware
...
...
src/hardware/topology/
bootst
rap/physical_links.cpp
→
src/hardware/topology/
g
rap
h
/physical_links.cpp
View file @
58d57301
...
...
@@ -4,7 +4,7 @@
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
bootst
rap
{
namespace
g
rap
h
{
namespace
physical_links
{
constexpr
int
numaIdStrLen
=
10
;
...
...
@@ -726,7 +726,7 @@ void printTopoNode(ByteSpanArray<scclTopoNode_t>& nodes, int nodeIndex, const ch
}
}
// namespace physical_links
}
// namespace
bootst
rap
}
// namespace
g
rap
h
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
\ No newline at end of file
src/hardware/topology/
bootst
rap/physical_links.h
→
src/hardware/topology/
g
rap
h
/physical_links.h
View file @
58d57301
...
...
@@ -13,12 +13,14 @@
#include <filesystem> // 需要C++17支持
#include "container.h"
#include "bootstrap
_utils
.h"
#include "bootstrap.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
bootstrap
{
namespace
graph
{
typedef
sccl
::
hardware
::
net
::
scclNet_t
scclNet_t
;
constexpr
size_t
topoNodeMaxLocalNodes
=
128
;
// 每个节点最多的node数量
constexpr
size_t
topoNodeMaxNeighbors
=
16
;
// 每个node最多neighbor数量
...
...
@@ -70,7 +72,7 @@ scclResult_t generate_topo_nodes(const char* pciPath, int interRank, int hipDev,
// 根据numaId获取pci路径
std
::
string
generate_topo_node_numa_info
(
int
numaId
);
// 输出id分解后的所有数据
// 输出
node
id分解后的所有数据
void
getIdComponents
(
uint64_t
idToDecompose
,
int
*
interRank
=
nullptr
,
int
*
deviceValue
=
nullptr
,
int
*
terminalType
=
nullptr
,
int
*
hipDev
=
nullptr
,
int
*
numaId
=
nullptr
);
...
...
@@ -82,7 +84,7 @@ char* getNetPciPath(scclNet_t* scclNet, int hipDev);
void
printTopoNode
(
ByteSpanArray
<
scclTopoNode_t
>&
nodes
,
int
nodeIndex
,
const
char
*
prefix
);
}
// namespace physical_links
}
// namespace
bootst
rap
}
// namespace
g
rap
h
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment