Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishen01
Sccl
Commits
571a75b5
Commit
571a75b5
authored
Aug 09, 2025
by
lishen
Browse files
完成全部网络的node建立,以及GPU到GPU的path物理路径搜索
parent
379c4128
Changes
44
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
965 additions
and
573 deletions
+965
-573
examples/1_connection/3_sccl_ipc_socket/1_socket_mpi_fd.cpp
examples/1_connection/3_sccl_ipc_socket/1_socket_mpi_fd.cpp
+25
-40
examples/1_connection/3_sccl_ipc_socket/3_socket_mpi_data.cpp
...ples/1_connection/3_sccl_ipc_socket/3_socket_mpi_data.cpp
+83
-82
examples/1_connection/3_sccl_ipc_socket/4_socket_mpi_data_sccl.cpp
...1_connection/3_sccl_ipc_socket/4_socket_mpi_data_sccl.cpp
+0
-76
examples/2_topo/1_demo_rocm/1_test_rocm_smi.cpp
examples/2_topo/1_demo_rocm/1_test_rocm_smi.cpp
+0
-0
examples/2_topo/1_demo_rocm/2_test_pci_info.cpp
examples/2_topo/1_demo_rocm/2_test_pci_info.cpp
+15
-0
examples/2_topo/1_demo_rocm/compile1.sh
examples/2_topo/1_demo_rocm/compile1.sh
+2
-2
examples/2_topo/1_demo_rocm/compile2.sh
examples/2_topo/1_demo_rocm/compile2.sh
+15
-0
examples/2_topo/2_bootstrap/1_mpi_init.cpp
examples/2_topo/2_bootstrap/1_mpi_init.cpp
+16
-19
examples/2_topo/2_bootstrap/2_mpi_init_mpi_init_step1_bootstrap.cpp
..._topo/2_bootstrap/2_mpi_init_mpi_init_step1_bootstrap.cpp
+92
-0
examples/2_topo/2_bootstrap/3_mpi_init_mpi_init_step2_graph.cpp
...es/2_topo/2_bootstrap/3_mpi_init_mpi_init_step2_graph.cpp
+60
-0
examples/2_topo/2_bootstrap/compile_mpi1.sh
examples/2_topo/2_bootstrap/compile_mpi1.sh
+4
-0
examples/2_topo/2_bootstrap/compile_mpi2_init_step1.sh
examples/2_topo/2_bootstrap/compile_mpi2_init_step1.sh
+59
-0
examples/2_topo/2_bootstrap/compile_mpi3_init_step2.sh
examples/2_topo/2_bootstrap/compile_mpi3_init_step2.sh
+61
-0
examples/2_topo/2_bootstrap/hostfile2
examples/2_topo/2_bootstrap/hostfile2
+1
-1
src/hardware/hardware.cpp
src/hardware/hardware.cpp
+17
-53
src/hardware/hardware.h
src/hardware/hardware.h
+3
-4
src/hardware/hardware_utils.h
src/hardware/hardware_utils.h
+2
-3
src/hardware/net/ipc_socket/ipc_socket.cpp
src/hardware/net/ipc_socket/ipc_socket.cpp
+493
-282
src/hardware/net/ipc_socket/ipc_socket.h
src/hardware/net/ipc_socket/ipc_socket.h
+13
-9
src/hardware/net/net_ib/net_ib.cpp
src/hardware/net/net_ib/net_ib.cpp
+4
-2
No files found.
examples/1_connection/3_sccl_ipc_socket/1_socket_mpi_fd.cpp
View file @
571a75b5
...
@@ -27,40 +27,6 @@ using namespace sccl;
...
@@ -27,40 +27,6 @@ using namespace sccl;
typedef
class
sccl
::
hardware
::
net
::
ipc_socket
::
scclIpcSocket
scclIpcSocket_t
;
typedef
class
sccl
::
hardware
::
net
::
ipc_socket
::
scclIpcSocket
scclIpcSocket_t
;
int
ipcSendRecvFd_nrank2
(
int
argc
,
char
*
argv
[])
{
MPI_Init
(
&
argc
,
&
argv
);
int
rank
;
MPI_Comm_rank
(
MPI_COMM_WORLD
,
&
rank
);
int
dst_hash
=
12345
;
scclIpcSocket_t
ipcsocket
(
rank
,
dst_hash
);
if
(
rank
==
0
)
{
// 进程 0: 打开文件并发送文件描述符
int
fd
=
open
(
"testfile.txt"
,
O_RDONLY
);
if
(
fd
<
0
)
{
perror
(
"Failed to open file"
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
}
ipcsocket
.
scclIpcSocketSendFd
(
fd
,
1
,
12345
);
// 假设 dst_hash 为 12345
close
(
fd
);
}
else
if
(
rank
==
1
)
{
// 进程 1: 接收文件描述符并读取文件内容
int
fd
;
ipcsocket
.
scclIpcSocketRecvFd
(
&
fd
);
char
buffer
[
256
];
ssize_t
n
=
read
(
fd
,
buffer
,
sizeof
(
buffer
)
-
1
);
if
(
n
>
0
)
{
buffer
[
n
]
=
'\0'
;
printf
(
"Process %d received: %s
\n
"
,
rank
,
buffer
);
}
close
(
fd
);
}
MPI_Finalize
();
return
0
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
int
main
(
int
argc
,
char
*
argv
[])
{
MPI_Init
(
&
argc
,
&
argv
);
MPI_Init
(
&
argc
,
&
argv
);
int
rank
,
size
;
int
rank
,
size
;
...
@@ -68,39 +34,58 @@ int main(int argc, char* argv[]) {
...
@@ -68,39 +34,58 @@ int main(int argc, char* argv[]) {
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
size
);
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
size
);
int
dst_hash
=
12345
;
int
dst_hash
=
12345
;
scclIpcSocket_t
ipcsocket
(
rank
,
dst_hash
);
scclIpcSocket_t
ipcsocket
(
rank
,
size
,
dst_hash
);
int
fd
;
if
(
rank
==
0
)
{
if
(
rank
==
0
)
{
// 进程 0: 打开文件并发送文件描述符给所有其他进程
// 进程 0: 打开文件并发送文件描述符给所有其他进程
int
fd
=
open
(
"testfile.txt"
,
O_RDONLY
);
fd
=
open
(
"testfile.txt"
,
O_RDONLY
);
if
(
fd
<
0
)
{
if
(
fd
<
0
)
{
perror
(
"Failed to open file"
);
perror
(
"Failed to open file"
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
}
}
for
(
int
i
=
1
;
i
<
size
;
++
i
)
{
for
(
int
i
=
1
;
i
<
size
;
++
i
)
{
if
(
ipcsocket
.
scclIpcSocketSendFd
(
fd
,
i
,
dst_hash
)
!=
scclSuccess
)
{
if
(
ipcsocket
.
scclIpcSocketSendFd
(
fd
,
i
)
!=
scclSuccess
)
{
perror
(
"Failed to send file descriptor"
);
perror
(
"Failed to send file descriptor"
);
close
(
fd
);
close
(
fd
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
}
}
lseek
(
fd
,
0
,
SEEK_SET
);
}
}
close
(
fd
);
}
else
{
}
else
{
// 其他进程: 接收文件描述符并读取文件内容
// 其他进程: 接收文件描述符并读取文件内容
int
fd
;
if
(
ipcsocket
.
scclIpcSocketRecvFd
(
&
fd
)
<
0
)
{
if
(
ipcsocket
.
scclIpcSocketRecvFd
(
&
fd
)
<
0
)
{
perror
(
"Failed to receive file descriptor"
);
perror
(
"Failed to receive file descriptor"
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
}
}
// lseek(fd, 0, SEEK_SET); // 重置文件偏移量到文件开头
printf
(
"11 rank %d received fd %d
\n
"
,
rank
,
fd
);
char
buffer
[
256
];
char
buffer
[
256
];
struct
pollfd
pfd
;
pfd
.
fd
=
fd
;
pfd
.
events
=
POLLIN
;
int
pollResult
=
poll
(
&
pfd
,
1
,
-
1
);
// 无限等待
printf
(
"pollResult=%d, rank=%d
\n
"
,
pollResult
,
rank
);
ssize_t
n
=
read
(
fd
,
buffer
,
sizeof
(
buffer
)
-
1
);
ssize_t
n
=
read
(
fd
,
buffer
,
sizeof
(
buffer
)
-
1
);
if
(
n
>
0
)
{
if
(
n
>
0
)
{
buffer
[
n
]
=
'\0'
;
buffer
[
n
]
=
'\0'
;
printf
(
"Process %d received: %s
\n
"
,
rank
,
buffer
);
printf
(
"Process %d received: %s
\n
"
,
rank
,
buffer
);
lseek
(
fd
,
0
,
SEEK_SET
);
// 重置文件偏移量到文件开头
}
}
close
(
fd
);
printf
(
"n=%zd, rank=%d
\n
"
,
n
,
rank
);
/////////////////////
// 注意,fd会有抢占,同一时间只能有一个进程读取
/////////////////////
}
}
// if(fd >= 0) {
// close(fd);
// }
MPI_Finalize
();
MPI_Finalize
();
return
0
;
return
0
;
}
}
...
...
examples/1_connection/3_sccl_ipc_socket/3_socket_mpi_data.cpp
View file @
571a75b5
...
@@ -13,63 +13,35 @@ using namespace sccl;
...
@@ -13,63 +13,35 @@ using namespace sccl;
typedef
class
sccl
::
hardware
::
net
::
ipc_socket
::
scclIpcSocket
scclIpcSocket_t
;
typedef
class
sccl
::
hardware
::
net
::
ipc_socket
::
scclIpcSocket
scclIpcSocket_t
;
template
<
typename
T
>
template
<
typename
T
>
void
send_data
(
T
*
ipcsocket
,
const
void
*
data
,
size_t
dataLen
,
int
dst_rank
)
{
int
test_allgather
(
T
*
ipcsocket
,
int
rank
,
int
size
,
int
dataLen
=
64
*
1024
,
int
num_iterations
=
1
)
{
if
(
ipcsocket
->
scclIpcSocketSendData
(
data
,
dataLen
,
dst_rank
)
!=
scclSuccess
)
{
std
::
vector
<
char
>
sendData
(
dataLen
);
perror
(
"Failed to send data"
);
std
::
vector
<
char
>
recvData
(
size
*
dataLen
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
}
}
template
<
typename
T
>
void
recv_data
(
T
*
ipcsocket
,
void
*
buffer
,
size_t
bufferLen
,
size_t
*
receivedLen
)
{
if
(
ipcsocket
->
scclIpcSocketRecvData
(
buffer
,
bufferLen
,
receivedLen
)
!=
scclSuccess
)
{
perror
(
"Failed to receive data"
);
MPI_Abort
(
MPI_COMM_WORLD
,
1
);
}
}
template
<
typename
T
>
int
test_allgather_ver1
(
T
*
ipcsocket
,
int
rank
,
int
size
)
{
int
sendDataLen
=
256
;
std
::
vector
<
char
>
sendData
(
sendDataLen
);
std
::
vector
<
char
>
recvData
(
size
*
sendDataLen
);
size_t
receivedLen
;
// 填充发送数据
// 填充发送数据
snprintf
(
sendData
.
data
(),
sendData
.
size
(),
"Data from process %d"
,
rank
);
snprintf
(
sendData
.
data
(),
sendData
.
size
(),
"Data from process %d"
,
rank
);
printf
(
"test_allgather dataLen=%d, sendData.size()=%zu
\n
"
,
dataLen
,
sendData
.
size
());
auto
pthpool
=
ThreadPool
(
size
*
2
);
std
::
vector
<
double
>
elapsed_times
;
// 用于存储每次执行的耗时
// 发送数据给所有其他进程
// 开始计时
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
auto
start
=
std
::
chrono
::
high_resolution_clock
::
now
();
if
(
i
!=
rank
)
{
auto
task_send
=
std
::
bind
(
send_data
<
scclIpcSocket_t
>
,
ipcsocket
,
sendData
.
data
(),
sendData
.
size
(),
i
);
pthpool
.
enqueue
(
task_send
);
auto
task_recv
=
std
::
bind
(
recv_data
<
scclIpcSocket_t
>
,
ipcsocket
,
recvData
.
data
()
+
i
*
sendDataLen
,
sendDataLen
,
&
receivedLen
);
// 调用 Allgather 函数
pthpool
.
enqueue
(
task_recv
);
for
(
int
i
=
0
;
i
<
num_iterations
;
++
i
)
{
}
SCCLCHECK
(
ipcsocket
->
scclIpcSocketAllgather
(
sendData
.
data
(),
recvData
.
data
(),
dataLen
));
}
}
printf
(
"sendData.size()=%d, receivedLen=%d
\n
"
,
sendDataLen
,
int
(
receivedLen
));
// 结束计时
auto
end
=
std
::
chrono
::
high_resolution_clock
::
now
();
// 打印接收到的数据
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
printf
(
"Process %d received from process %d: %s
\n
"
,
rank
,
i
,
recvData
.
data
()
+
i
*
256
);
}
return
0
;
// 所有进程在此处等待,直到所有进程都到达这一点
}
MPI_Barrier
(
MPI_COMM_WORLD
);
template
<
typename
T
>
// 计算并存储每个进程的计时结果
int
test_allgather_ver2
(
T
*
ipcsocket
,
int
rank
,
int
size
)
{
std
::
chrono
::
duration
<
double
>
elapsed
=
end
-
start
;
int
sendDataLen
=
256
;
std
::
vector
<
char
>
sendData
(
sendDataLen
);
std
::
vector
<
char
>
recvData
(
size
*
sendDataLen
);
// 填充发送数据
auto
average_time
=
elapsed
.
count
()
*
1e6
/
num_iterations
;
// 转换为微秒
snprintf
(
sendData
.
data
(),
sendData
.
size
(),
"Data from process %d"
,
rank
);
printf
(
"rank %d: Average time for Allgather over %d iterations: %f us.
\n
"
,
rank
,
num_iterations
,
average_time
);
SCCLCHECK
(
ipcsocket
->
scclIpcSocketAllgatherSync
(
sendData
.
data
(),
recvData
.
data
(),
sendData
.
size
(),
/*wait*/
true
));
// 打印接收到的数据
// 打印接收到的数据
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
...
@@ -80,67 +52,96 @@ int test_allgather_ver2(T* ipcsocket, int rank, int size) {
...
@@ -80,67 +52,96 @@ int test_allgather_ver2(T* ipcsocket, int rank, int size) {
}
}
template
<
typename
T
>
template
<
typename
T
>
int
test_allgather_ver3
(
T
*
ipcsocket
,
int
rank
,
int
size
)
{
int
test_broadcast
(
T
*
ipcsocket
,
int
rank
,
int
size
,
int
dataLen
=
64
*
1024
,
int
num_iterations
=
1
)
{
int
sendDataLen
=
256
;
std
::
vector
<
char
>
data
(
dataLen
);
std
::
vector
<
char
>
sendData
(
sendDataLen
);
int
root
=
0
;
// 假设 rank 0 是根进程
std
::
vector
<
char
>
recvData
(
size
*
sendDataLen
);
if
(
rank
==
root
)
{
// 仅根进程填充发送数据
// 填充发送数据
snprintf
(
data
.
data
(),
data
.
size
(),
"Data from root process %d"
,
rank
);
snprintf
(
sendData
.
data
(),
sendData
.
size
(),
"Data from process %d"
,
rank
);
SCCLCHECK
(
ipcsocket
->
scclIpcSocketAllgather
(
sendData
.
data
(),
recvData
.
data
(),
sendData
.
size
()));
// 打印接收到的数据
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
printf
(
"rank %d received from process %d: %s
\n
"
,
rank
,
i
,
recvData
.
data
()
+
i
*
sendData
.
size
());
}
}
printf
(
"rank=%d, data.size()=%zu
\n
"
,
rank
,
data
.
size
());
return
0
;
std
::
vector
<
double
>
elapsed_times
;
// 用于存储每次执行的耗时
}
template
<
typename
T
>
// 开始计时
int
test_broadcast_ver1
(
T
*
ipcsocket
,
int
rank
,
int
size
)
{
auto
start
=
std
::
chrono
::
high_resolution_clock
::
now
();
int
sendDataLen
=
256
;
std
::
vector
<
char
>
sendData
(
sendDataLen
);
std
::
vector
<
char
>
recvData
(
sendDataLen
);
int
root
=
0
;
// 假设 rank 0 是根进程
if
(
rank
==
root
)
{
for
(
int
i
=
0
;
i
<
num_iterations
;
++
i
)
{
// 仅根进程填充发送数据
SCCLCHECK
(
ipcsocket
->
scclIpcSocketBroadcast
(
data
.
data
(),
data
.
size
(),
root
));
snprintf
(
sendData
.
data
(),
sendData
.
size
(),
"Data from root process %d"
,
rank
);
}
}
SCCLCHECK
(
ipcsocket
->
scclIpcSocketBroadcast
(
sendData
.
data
(),
recvData
.
data
(),
sendData
.
size
(),
root
,
/*wait*/
true
));
// 结束计时
auto
end
=
std
::
chrono
::
high_resolution_clock
::
now
();
// 打印接收到的数据
// 所有进程在此处等待,直到所有进程都到达这一点
printf
(
"rank %d received: %s
\n
"
,
rank
,
recvData
.
data
());
MPI_Barrier
(
MPI_COMM_WORLD
);
// 计算并存储每个进程的计时结果
std
::
chrono
::
duration
<
double
>
elapsed
=
end
-
start
;
auto
average_time
=
elapsed
.
count
()
*
1e6
/
num_iterations
;
// 转换为微秒
printf
(
"rank %d: data=%s, Average time for scclIpcSocketBroadcast over %d iterations: %f us.
\n
"
,
rank
,
(
char
*
)(
data
.
data
()),
num_iterations
,
average_time
);
return
0
;
return
0
;
}
}
int
main
(
int
argc
,
char
*
argv
[])
{
int
main
(
int
argc
,
char
*
argv
[])
{
MPI_Init
(
&
argc
,
&
argv
);
MPI_Init
(
&
argc
,
&
argv
);
int
rank
,
size
;
int
rank
,
size
;
MPI_Comm_rank
(
MPI_COMM_WORLD
,
&
rank
);
MPI_Comm_rank
(
MPI_COMM_WORLD
,
&
rank
);
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
size
);
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
size
);
int
dst_hash
=
12345
;
int
dst_hash
=
654321
;
scclIpcSocket_t
*
ipcsocket
=
new
scclIpcSocket_t
(
rank
,
size
,
dst_hash
);
scclIpcSocket_t
*
ipcsocket
=
new
scclIpcSocket_t
(
rank
,
size
,
dst_hash
);
// test_allgather_ver1(ipcsocket, rank, size);
// 默认参数
// test_allgather_ver2(ipcsocket, rank, size);
std
::
string
test_type
=
"allgather"
;
// test_allgather_ver3(ipcsocket, rank, size);
int
dataLen
=
64
*
1024
;
test_broadcast_ver1
(
ipcsocket
,
rank
,
size
);
int
num_iterations
=
1
;
// 解析命令行参数
for
(
int
i
=
1
;
i
<
argc
;
++
i
)
{
std
::
istringstream
iss
(
argv
[
i
]);
std
::
string
arg
;
iss
>>
arg
;
if
(
arg
==
"--test-type"
)
{
if
(
++
i
<
argc
)
{
test_type
=
argv
[
i
];
}
}
else
if
(
arg
==
"--data-len"
)
{
if
(
++
i
<
argc
)
{
iss
.
clear
();
iss
.
str
(
argv
[
i
]);
iss
>>
dataLen
;
}
}
else
if
(
arg
==
"--num-iterations"
)
{
if
(
++
i
<
argc
)
{
iss
.
clear
();
iss
.
str
(
argv
[
i
]);
iss
>>
num_iterations
;
}
}
}
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
seconds
(
10
));
if
(
test_type
==
"allgather"
)
{
// while(!ipcsocket->getPthreadPool()->allTasksCompleted()) {}
test_allgather
(
ipcsocket
,
rank
,
size
,
dataLen
,
num_iterations
);
// printf("delete ipcsocket... rank=%d\n", rank);
}
else
if
(
test_type
==
"broadcast"
)
{
test_broadcast
(
ipcsocket
,
rank
,
size
,
dataLen
,
num_iterations
);
}
else
{
if
(
rank
==
0
)
{
std
::
cerr
<<
"Unknown test type: "
<<
test_type
<<
std
::
endl
;
}
}
delete
(
ipcsocket
)
;
delete
ipcsocket
;
MPI_Finalize
();
MPI_Finalize
();
return
0
;
return
0
;
}
}
/*
/*
单机执行
单机执行
SCCL_DEBUG_LEVEL=ABORT SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root -np 8 3_socket_mpi_data
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=GRAPH mpirun --allow-run-as-root -np 8 3_socket_mpi_data
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=GRAPH mpirun --allow-run-as-root -np 4 3_socket_mpi_data
*/
*/
examples/1_connection/3_sccl_ipc_socket/4_socket_mpi_data_sccl.cpp
deleted
100644 → 0
View file @
379c4128
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <vector> // 引入vector库
#include <thread> // 为了使用 std::this_thread::sleep_for
#include "mpi.h"
#include "net.h"
#include "ipc_socket.h"
#include "thread_pool.h"
using
namespace
sccl
;
typedef
class
sccl
::
hardware
::
net
::
ipc_socket
::
scclIpcSocket
scclIpcSocket_t
;
/**
 * Send a buffer to one destination rank through the given IPC socket object.
 *
 * Forwards all arguments to ipcsocket->scclIpcSocketSendData(). If that call
 * does not return scclSuccess, an error is printed with perror() and the MPI
 * job is aborted via MPI_Abort(MPI_COMM_WORLD, 1).
 *
 * @param ipcsocket  object providing scclIpcSocketSendData()
 * @param data       pointer to the bytes to send
 * @param dataLen    number of bytes to send
 * @param dst_rank   destination rank passed through to the socket call
 * @param dst_hash   destination hash passed through to the socket call
 */
template <typename T>
void send_data(T* ipcsocket, const void* data, size_t dataLen, int dst_rank, uint64_t dst_hash) {
    const auto status = ipcsocket->scclIpcSocketSendData(data, dataLen, dst_rank, dst_hash);
    if(status == scclSuccess) {
        return;
    }

    // Failure path: report and tear down the whole MPI job.
    perror("Failed to send data");
    MPI_Abort(MPI_COMM_WORLD, 1);
}
/**
 * Receive data into a caller-supplied buffer through the given IPC socket object.
 *
 * Forwards all arguments to ipcsocket->scclIpcSocketRecvData(). If that call
 * does not return scclSuccess, an error is printed with perror() and the MPI
 * job is aborted via MPI_Abort(MPI_COMM_WORLD, 1).
 *
 * @param ipcsocket    object providing scclIpcSocketRecvData()
 * @param buffer       destination buffer
 * @param bufferLen    capacity of the destination buffer in bytes
 * @param receivedLen  out-parameter handed to the socket call
 */
template <typename T>
void recv_data(T* ipcsocket, void* buffer, size_t bufferLen, size_t* receivedLen) {
    const bool received = (ipcsocket->scclIpcSocketRecvData(buffer, bufferLen, receivedLen) == scclSuccess);
    if(!received) {
        // Failure path: report and tear down the whole MPI job.
        perror("Failed to receive data");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
}
/**
 * MPI example: every rank enqueues one send task and one receive task per
 * peer rank on a thread pool, waits two seconds, then prints the contents of
 * its receive buffer.
 *
 * @return 0 after MPI_Finalize().
 */
int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int dst_hash = 12345;
    scclIpcSocket_t ipcsocket(rank, dst_hash);

    int sendDataLen = 256;
    std::vector<char> sendData(sendDataLen);
    std::vector<char> recvData(size * sendDataLen);
    // Initialised so the progress printf below never reads an indeterminate value.
    // NOTE(review): every receive task writes through this same pointer, and the
    // printf below reads it while tasks may still be running; consider one slot
    // per peer plus a real completion wait.
    size_t receivedLen = 0;

    // Fill the outgoing payload.
    snprintf(sendData.data(), sendData.size(), "Data from process %d", rank);

    auto pthpool = ThreadPool(size * 2);

    // Queue a send to, and a receive for, every other rank.
    for(int i = 0; i < size; ++i) {
        if(i != rank) {
            auto task_send = std::bind(send_data<scclIpcSocket_t>, &ipcsocket, sendData.data(), sendData.size(), i, dst_hash);
            pthpool.enqueue(task_send);

            auto task_recv = std::bind(recv_data<scclIpcSocket_t>, &ipcsocket, recvData.data() + i * sendDataLen, sendDataLen, &receivedLen);
            pthpool.enqueue(task_recv);
        }
    }

    printf("sendData.size()=%d, receivedLen=%d\n", sendDataLen, int(receivedLen));

    // Crude wait for the queued tasks; presumably long enough for the example - TODO confirm.
    std::this_thread::sleep_for(std::chrono::seconds(2));

    // Print the receive buffer, one slot per rank (same stride used when queuing the receives).
    for(int i = 0; i < size; ++i) {
        printf("Process %d received from process %d: %s\n", rank, i, recvData.data() + i * sendDataLen);
    }

    MPI_Finalize();
    return 0;
}
/*
单机执行
SCCL_DEBUG_LEVEL=ABORT SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root -np 8 3_socket_mpi_data
*/
examples/2_topo/1_demo_rocm/test_rocm_smi.cpp
→
examples/2_topo/1_demo_rocm/
1_
test_rocm_smi.cpp
View file @
571a75b5
File moved
examples/2_topo/1_demo_rocm/2_test_pci_info.cpp
0 → 100644
View file @
571a75b5
#include <sys/sysinfo.h>
#include <iostream>
/**
 * Print basic system information obtained from sysinfo(2): uptime, total RAM
 * and free RAM.
 *
 * @return 0 on success, 1 if sysinfo() fails.
 */
int main() {
    struct sysinfo info;
    if(sysinfo(&info) != 0) {
        std::cerr << "Failed to get system information." << std::endl;
        return 1; // signal the failure to the caller instead of exiting with 0
    }

    // sysinfo(2) reports memory sizes as multiples of mem_unit bytes; widen
    // before multiplying so the product cannot overflow a 32-bit unsigned long.
    const unsigned long long unit = info.mem_unit;

    std::cout << "Uptime: " << info.uptime << std::endl;
    std::cout << "Total RAM: " << info.totalram * unit << std::endl;
    std::cout << "Free RAM: " << info.freeram * unit << std::endl;
    // Further fields of struct sysinfo can be printed here.

    return 0;
}
examples/2_topo/1_demo_rocm/compile
_rocm_smi
.sh
→
examples/2_topo/1_demo_rocm/compile
1
.sh
View file @
571a75b5
hipcc
/public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/
test_rocm_smi.cpp
\
hipcc
1_
test_rocm_smi.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
-o
test_
topo
\
-o
1_
test_
rocm_smi
\
-std
=
c++17
-g
-O3
-fopenmp
-D__HIP_PLATFORM_HCC__
\
-std
=
c++17
-g
-O3
-fopenmp
-D__HIP_PLATFORM_HCC__
\
-I
./
-I
/usr/include
-I
/opt/dtk/include
\
-I
./
-I
/usr/include
-I
/opt/dtk/include
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include
\
...
...
examples/2_topo/1_demo_rocm/compile2.sh
0 → 100644
View file @
571a75b5
# Build 2_test_pci_info with hipcc.
# SCCL_ROOT is the project checkout; override it from the environment to build
# from another location. The default is the path this script originally hard-coded.
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"

hipcc 2_test_pci_info.cpp \
    -o 2_test_pci_info \
    -std=c++17 -g -O3 -fopenmp -D__HIP_PLATFORM_HCC__ \
    -I ./ -I /usr/include -I /opt/dtk/include \
    -I "${SCCL_ROOT}/src/include" \
    -I "${SCCL_ROOT}/src/hardware/net/" \
    -I "${SCCL_ROOT}/src/hardware/" \
    -I "${SCCL_ROOT}/src/hardware/topology/" \
    -I "${SCCL_ROOT}/src/hardware/topology/topo" \
    -I "${SCCL_ROOT}/src/utils/" \
    -I "${SCCL_ROOT}/src/hardware/topology/bootstrap/" \
    -L /usr/lib/x86_64-linux-gnu \
    -L /usr/lib/ \
    -lamdhip64 -lrocm_smi64
\ No newline at end of file
examples/2_topo/2_bootstrap/1_mpi_init.cpp
View file @
571a75b5
...
@@ -11,9 +11,6 @@ using namespace sccl;
...
@@ -11,9 +11,6 @@ using namespace sccl;
int
main
(
int
argc
,
char
*
argv
[])
{
int
main
(
int
argc
,
char
*
argv
[])
{
int
rank
,
nranks
;
int
rank
,
nranks
;
int
tag1
,
src
,
dst
,
cnt
;
MPI_Status
status
;
MPI_Init
(
&
argc
,
&
argv
);
MPI_Init
(
&
argc
,
&
argv
);
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
nranks
);
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
nranks
);
...
@@ -21,28 +18,28 @@ int main(int argc, char* argv[]) {
...
@@ -21,28 +18,28 @@ int main(int argc, char* argv[]) {
printf
(
"rank=%d, nranks=%d
\n
"
,
rank
,
nranks
);
printf
(
"rank=%d, nranks=%d
\n
"
,
rank
,
nranks
);
// ----------------------------------------------------------------------- //
//
//
----------------------------------------------------------------------- //
INFO
(
SCCL_LOG_TOPO
,
"Bootstrap ...
\n
"
);
//
INFO(SCCL_LOG_TOPO, "Bootstrap ...\n");
struct
scclRankInfo
*
rank_info
;
//
scclRankInfo
_t
* rank_info;
struct
sccl
::
hardware
::
topology
::
bootstrap
::
sccl
BootstrapComm
*
comm
;
//
struct sccl::hardware::topology::bootstrap::BootstrapComm* comm;
SCCLCHECK
(
scclCalloc
(
&
rank_info
,
1
));
//
SCCLCHECK(scclCalloc(&rank_info, 1));
SCCLCHECK
(
scclCalloc
(
&
comm
,
1
));
//
SCCLCHECK(scclCalloc(&comm, 1));
rank_info
->
rank
=
rank
;
//
rank_info->rank = rank;
rank_info
->
nRanks
=
nranks
;
//
rank_info->nRanks = nranks;
rank_info
->
localRanks
=
2
;
//
rank_info->localRanks = 2;
rank_info
->
hipDev
=
rank
%
rank_info
->
localRanks
;
//
rank_info->hipDev = rank % rank_info->localRanks;
auto
sccl_bootstrap
=
new
sccl
::
hardware
::
topology
::
bootstrap
::
sccl
Bootstrap
(
rank_info
,
comm
);
//
auto sccl_bootstrap = new sccl::hardware::topology::bootstrap::Bootstrap(rank_info, comm);
SCCLCHECK
(
sccl_bootstrap
->
bootstrapInitCheck
());
//
SCCLCHECK(sccl_bootstrap->bootstrapInitCheck());
sccl
::
hardware
::
topology
::
bootstrap
::
printUniqueInfo
(
comm
->
unique_info
);
//
sccl::hardware::topology::bootstrap::printUniqueInfo(comm->unique_info);
int
cuda_id
;
//
int cuda_id;
HIPCHECK
(
hipGetDevice
(
&
cuda_id
));
//
HIPCHECK(hipGetDevice(&cuda_id));
printf
(
"rank=%d, cuda_id=%d
\n
"
,
rank
,
cuda_id
);
//
printf("rank=%d, cuda_id=%d\n", rank, cuda_id);
MPI_Finalize
();
MPI_Finalize
();
}
}
...
...
examples/2_topo/2_bootstrap/2_mpi_init_mpi_init_step1_bootstrap.cpp
0 → 100644
View file @
571a75b5
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mpi.h>
#include "bootstrap.h"
#include "hardware.h"
using
namespace
sccl
;
typedef
sccl
::
hardware
::
topology
::
bootstrap
::
scclUniqueId
scclUniqueId
;
typedef
sccl
::
hardware
::
topology
::
bootstrap
::
BootstrapHandle_t
BootstrapHandle_t
;
typedef
sccl
::
hardware
::
topology
::
bootstrap
::
Bootstrap
Bootstrap
;
// 全局变量
struct
sccl
::
hardware
::
topology
::
bootstrap
::
BootstrapComm
bootstrap_comm
;
/**
 * First initialisation step: build a Bootstrap object from the root handle
 * carried in unique_id and run its init() against the file-level bootstrap_comm.
 *
 * @param unique_id  unique id whose storage is reinterpreted as a BootstrapHandle_t
 * @param rank       rank of the calling process
 * @param nRanks     total number of ranks
 * @return scclSuccess when the function runs to its end.
 *         NOTE(review): EQCHECK / SCCLCHECK are defined elsewhere; presumably they
 *         return early with an error code on failure - confirm.
 */
scclResult_t sccl_init_step1(const scclUniqueId* unique_id, int rank, int nRanks) {
    // -------------------------- 1. Obtain the address information of rank 0 -------------------------- //
    const BootstrapHandle_t* root_handle = reinterpret_cast<const BootstrapHandle_t*>(unique_id);
    EQCHECK(root_handle->magic, 0); // check whether the handle has been updated

    // -------------------------- 2. Initialise and gather the node information of all nodes ------------ //
    std::unique_ptr<Bootstrap> sccl_bootstrap(new Bootstrap(root_handle, rank, nRanks));
    SCCLCHECK(sccl_bootstrap->init(&bootstrap_comm));

    return scclSuccess;
}
// Capacity of the fixed-size neighbor array stored in every topoNode.
constexpr int topoNodeMaxNeighbors = 16;

// One node of the topology graph.
struct topoNode {
    uint64_t id;                                             // identifier of this graph node
    int type;                                                // kind of graph node
    int numaId;                                              // node id
    char busIdStr[17] = {};                                  // bus id string, e.g. "00000000:00:00.0"
    int speed;                                               // speed
    int width;                                               // width (described as bandwidth in the original note)
    char cpuAffinity[36] = {};                               // CPU affinity
    std::array<uint64_t, topoNodeMaxNeighbors> neighbors;    // neighboring graph nodes
    size_t neighborCount;                                    // number of neighboring graph nodes
};
using topoNode_t = topoNode;
/**
 * MPI example for the first initialisation step (bootstrap).
 *
 * Starts MPI, prints the sizes of several bootstrap/topology structures,
 * broadcasts the unique id created on rank 0 to every rank, runs
 * sccl_init_step1(), prints the current HIP device, then finalises SCCL and MPI.
 *
 * @return 0 when the whole sequence completes.
 *         NOTE(review): SCCLCHECK / HIPCHECK are defined elsewhere; presumably
 *         they leave main early with an error code on failure - confirm.
 */
int main(int argc, char* argv[]) {
    // -------------------------- 1. Start MPI ----------------------------------- //
    MPI_Init(&argc, &argv);
    int rank, nRanks;
    MPI_Comm_size(MPI_COMM_WORLD, &nRanks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("rank=%d, nRanks=%d\n", rank, nRanks);

    int nLocalRanks = 2;
    BootstrapHandle_t uqid;
    // sizeof yields size_t, so %zu is the matching conversion specifier.
    printf("uqid size=%zu\n", sizeof(uqid));

    sccl::hardware::topology::bootstrap::scclRankInfo_t rankinfo;
    sccl::hardware::topology::bootstrap::scclNodeInfo_t nodeinfo(nLocalRanks);
    topoNode_t topo_node;

    printf("rankinfo size=%zu\n", sizeof(rankinfo));
    printf("rankinfo cpu size=%zu\n", sizeof(rankinfo.cpu));
    printf("rankinfo gpu size=%zu\n", sizeof(rankinfo.gpu));
    printf("rankinfo net size=%zu\n", sizeof(rankinfo.net));
    // NOTE(review): nodeinfo.size is printed with %d; its declared type is not visible here - confirm.
    printf("nodeinfo size=%zu, stu size=%d\n", sizeof(nodeinfo), nodeinfo.size);
    printf("topo_node size=%zu\n", sizeof(topo_node));

    // -------------------------- 2. Obtain the unique_id (mainly the socket address) ------------------ //
    scclUniqueId unique_id;
    if(rank == 0) {
        SCCLCHECK(sccl::hardware::scclGetUniqueId(&unique_id));
    }
    MPI_Bcast(&unique_id, sizeof(scclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD);

    // -------------------------- 3. Initialise from the broadcast unique_id --------------------------- //
    // Check the result like the other SCCL calls in this function instead of discarding it.
    SCCLCHECK(sccl_init_step1(&unique_id, rank, nRanks));

    int cuda_id;
    HIPCHECK(hipGetDevice(&cuda_id));
    printf("rank=%d, cuda_id=%d\n", rank, cuda_id);

    MPI_Barrier(MPI_COMM_WORLD);
    SCCLCHECK(sccl::hardware::sccl_finalize());

    MPI_Finalize();
    return 0;
}
/*
单机执行
SCCL_DEBUG_LEVEL=ABORT mpirun --allow-run-as-root -np 4 2_mpi_init_mpi_init_step1_bootstrap
SCCL_DEBUG_LEVEL=INFO SCCL_DEBUG_SUBSYS=ALL mpirun --allow-run-as-root -np 2 2_mpi_init_mpi_init_step1_bootstrap
跨机执行
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root --hostfile hostfile2 -np 4 ./2_mpi_init_mpi_init_step1_bootstrap
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root --hostfile hostfile -np 16 ./2_mpi_init_mpi_init_step1_bootstrap
*/
examples/2_topo/2_bootstrap/3_mpi_init_mpi_init_step2_graph.cpp
0 → 100644
View file @
571a75b5
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mpi.h>
#include "bootstrap.h"
#include "hardware.h"
using
namespace
sccl
;
/**
 * MPI example for the second initialisation step (graph).
 *
 * Starts MPI, prints the sizes of several bootstrap structures, broadcasts
 * the unique id created on rank 0 to every rank, calls
 * sccl::hardware::sccl_init(), then finalises SCCL and MPI.
 *
 * @return 0 when the whole sequence completes.
 *         NOTE(review): SCCLCHECK is defined elsewhere; presumably it leaves
 *         main early with an error code on failure - confirm.
 */
int main(int argc, char* argv[]) {
    // -------------------------- 1. Start MPI ----------------------------------- //
    MPI_Init(&argc, &argv);
    int rank, nRanks;
    MPI_Comm_size(MPI_COMM_WORLD, &nRanks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("rank=%d, nRanks=%d\n", rank, nRanks);

    // Not referenced again in this function.
    sccl::hardware::topology::bootstrap::BootstrapHandle_t uqid;
    sccl::hardware::topology::bootstrap::scclRankInfo_t rankinfo;
    sccl::hardware::topology::bootstrap::scclNodeInfo_t nodeinfo(/*nLocalRanks*/ 2);

    // sizeof yields size_t, so %zu is the matching conversion specifier.
    printf("rankinfo size=%zu\n", sizeof(rankinfo));
    printf("rankinfo cpu size=%zu\n", sizeof(rankinfo.cpu));
    printf("rankinfo gpu size=%zu\n", sizeof(rankinfo.gpu));
    printf("rankinfo net size=%zu\n", sizeof(rankinfo.net));
    // NOTE(review): nodeinfo.totalByteSize is printed with %d; its declared type is not visible here - confirm.
    printf("nodeinfo size=%zu, stu size=%d\n", sizeof(nodeinfo), nodeinfo.totalByteSize);

    // -------------------------- 2. Obtain the unique_id (mainly the socket address) ------------------ //
    typedef sccl::hardware::topology::bootstrap::scclUniqueId scclUniqueId;
    scclUniqueId unique_id;
    if(rank == 0) {
        SCCLCHECK(sccl::hardware::scclGetUniqueId(&unique_id));
    }
    MPI_Bcast(&unique_id, sizeof(scclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD);

    // -------------------------- 3. Initialise from the broadcast unique_id --------------------------- //
    SCCLCHECK(sccl::hardware::sccl_init(&unique_id, rank, nRanks));

    SCCLCHECK(sccl::hardware::sccl_finalize());

    MPI_Finalize();
    return 0;
}
/*
单机执行
SCCL_DEBUG_LEVEL=WARN mpirun --allow-run-as-root -np 4 3_mpi_init_mpi_init_step2_graph
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=ALL mpirun --allow-run-as-root -np 2 3_mpi_init_mpi_init_step2_graph
跨机执行
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=GRAPH mpirun --allow-run-as-root --hostfile hostfile2 -np 2 ./3_mpi_init_mpi_init_step2_graph
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=GRAPH mpirun --allow-run-as-root --hostfile hostfile2 -np 4 ./3_mpi_init_mpi_init_step2_graph
SCCL_DEBUG_LEVEL=WARN SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root --hostfile hostfile -np 16 ./3_mpi_init_mpi_init_step2_graph
*/
examples/2_topo/2_bootstrap/compile_mpi.sh
→
examples/2_topo/2_bootstrap/compile_mpi
1
.sh
View file @
571a75b5
...
@@ -10,6 +10,8 @@ hipcc ./1_mpi_init.cpp \
...
@@ -10,6 +10,8 @@ hipcc ./1_mpi_init.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/mpi/mpiwrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/mpi/mpisymbols.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
...
@@ -31,6 +33,8 @@ hipcc ./1_mpi_init.cpp \
...
@@ -31,6 +33,8 @@ hipcc ./1_mpi_init.cpp \
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/mpi
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/mpi
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
...
...
examples/2_topo/2_bootstrap/compile_mpi2_init_step1.sh
0 → 100644
View file @
571a75b5
hipcc ./2_mpi_init_mpi_init_step1_bootstrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/ibvsymbols.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/ibvwrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/net_ib.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/net_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/ipc_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/physical_links.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/thread_pool.cpp
\
-o
2_mpi_init_mpi_init_step1_bootstrap
\
-std
=
c++17
-g
-O3
-fopenmp
-DROC_SHMEM
-D__HIP_PLATFORM_HCC__
-Wno-return-type
\
-I
./
-I
/usr/include
-I
/opt/dtk/include
\
-I
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
-L
/opt/dtk/lib
-lamdhip64
-lrocm-core
-lrocm_smi64
-pthread
\
-L
/usr/lib/x86_64-linux-gnu
-libverbs
-lrdmacm
\
-L
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib
-lmpi
# # \
# # -L /public/home/lishen/Code/rocSHMEM/3rd_party/install/ucx/lib -lucs -lucp -luct -lucm
# # export HSA_FORCE_FINE_GRAIN_PCIE="1"
# # export iommu=pt
# hipcc ./2_mpi_init_mpi_init_step1_bootstrap.cpp \
# -o 2_mpi_init_mpi_init_step1_bootstrap \
# -std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ -Wno-return-type \
# -I ./ -I /usr/include -I /opt/dtk/include \
# -I /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/ \
# -L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
# -L /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib -lmpi \
# -L /opt/dtk/lib -lamdhip64 -lrocm-core -lrocm_smi64 -pthread
examples/2_topo/2_bootstrap/compile_mpi3_init_step2.sh
0 → 100644
View file @
571a75b5
hipcc ./3_mpi_init_mpi_init_step2_graph.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/ibvsymbols.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/ibvwrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/net_ib.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/net_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/ipc_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/physical_links.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/thread_pool.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/graph.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/paths.cpp
\
-o
3_mpi_init_mpi_init_step2_graph
\
-std
=
c++17
-g
-O3
-fopenmp
-DROC_SHMEM
-D__HIP_PLATFORM_HCC__
-Wno-return-type
\
-I
./
-I
/usr/include
-I
/opt/dtk/include
\
-I
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/graph/
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
-L
/opt/dtk/lib
-lamdhip64
-lrocm-core
-lrocm_smi64
-pthread
\
-L
/usr/lib/x86_64-linux-gnu
-libverbs
-lrdmacm
\
-L
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib
-lmpi
# # \
# # -L /public/home/lishen/Code/rocSHMEM/3rd_party/install/ucx/lib -lucs -lucp -luct -lucm
# # export HSA_FORCE_FINE_GRAIN_PCIE="1"
# # export iommu=pt
# hipcc ./3_mpi_init_mpi_init_step2_graph.cpp \
# -o 3_mpi_init_mpi_init_step2_graph \
# -std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ -Wno-return-type \
# -I ./ -I /usr/include -I /opt/dtk/include \
# -I /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/ \
# -L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
# -L /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib -lmpi \
# -L /opt/dtk/lib -lamdhip64 -lrocm-core -lrocm_smi64 -pthread
examples/2_topo/2_bootstrap/hostfile2
View file @
571a75b5
node037 slots=2
node037 slots=2
node038 slots=2
node038 slots=2
\ No newline at end of file
src/hardware/hardware.cpp
View file @
571a75b5
...
@@ -5,75 +5,41 @@
...
@@ -5,75 +5,41 @@
#include "base.h"
#include "base.h"
#include "hardware_utils.h"
#include "hardware_utils.h"
#include "bootstrap.h"
#include "bootstrap.h"
#include "graph.h"
#include "hardware.h"
namespace
sccl
{
namespace
sccl
{
namespace
hardware
{
namespace
hardware
{
namespace
topology
{
namespace
bootstrap
{
// 全局变量,全部节点的信息
// 全局变量,全部节点的信息
s
truct
BootstrapComm
bootstrap_comm
;
s
ccl
::
hardware
::
topology
::
bootstrap
::
BootstrapComm
_t
bootstrap_comm
;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
scclResult_t
scclGetUniqueId
(
scclUniqueId
*
unique_id
)
{
scclResult_t
scclGetUniqueId
(
scclUniqueId
*
unique_id
)
{
auto
handle
=
reinterpret_cast
<
struct
BootstrapHandle
*>
(
unique_id
);
auto
handle
=
reinterpret_cast
<
BootstrapHandle
_t
*>
(
unique_id
);
NEQCHECK
(
sizeof
(
struct
BootstrapHandle
),
SCCL_UNIQUE_ID_BYTES
);
NEQCHECK
(
sizeof
(
BootstrapHandle
_t
),
SCCL_UNIQUE_ID_BYTES
);
SCCLCHECK
(
bootstrapGetUniqueId
(
handle
));
SCCLCHECK
(
topology
::
bootstrap
::
bootstrapGetUniqueId
(
handle
));
return
scclSuccess
;
return
scclSuccess
;
}
}
scclResult_t
sccl_init
(
const
scclUniqueId
*
unique_id
,
int
rank
,
int
nRanks
)
{
scclResult_t
sccl_init
(
const
scclUniqueId
*
unique_id
,
int
rank
,
int
nRanks
)
{
// -------------------------- 1.获取0号rank的地址信息 ----------------------------------- //
// -------------------------- 1.获取0号rank的地址信息 ----------------------------------- //
auto
root_handle
=
reinterpret_cast
<
const
struct
BootstrapHandle
*>
(
unique_id
);
auto
root_handle
=
reinterpret_cast
<
const
BootstrapHandle
_t
*>
(
unique_id
);
EQCHECK
(
root_handle
->
magic
,
0
);
// 检查handle是否已经更新
EQCHECK
(
root_handle
->
magic
,
0
);
// 检查handle是否已经更新
// -------------------------- 2.初始化获取所有节点的node信息 ----------------------------------- //
// -------------------------- 2.初始化获取所有节点的node信息 ----------------------------------- //
auto
sccl_bootstrap
=
std
::
make_unique
<
Bootstrap
>
(
root_handle
,
rank
,
nRanks
);
auto
sccl_bootstrap
=
std
::
make_unique
<
topology
::
bootstrap
::
Bootstrap
>
(
root_handle
,
rank
,
nRanks
);
SCCLCHECK
(
sccl_bootstrap
->
init
(
&
bootstrap_comm
));
SCCLCHECK
(
sccl_bootstrap
->
init
(
&
bootstrap_comm
));
// // -------------------------- 3.MPI allgather设置unique_id的整合 ----------------------------------- //
// -------------------------- 3.MPI 建图 ----------------------------------- //
auto
sccl_graph
=
std
::
make_unique
<
topology
::
graph
::
Graph
>
(
rank
,
nRanks
);
// auto unique_ids_chr = reinterpret_cast<const char*>(unique_ids);
printf
(
"init pos 2
\n
"
);
// 计算通信路径
sccl_graph
->
calculateCommunicationPaths
(
&
bootstrap_comm
);
printf
(
"init pos 3
\n
"
);
// // -------------------------- 3.MPI allgather设置unique_id的整合 ----------------------------------- //
// // -------------------------- 3.MPI allgather设置unique_id的整合 ----------------------------------- //
// std::vector<scclUniqueId> unique_id_vec(nRanks);
// MPI_Allgather(&unique_id, sizeof(scclUniqueId), MPI_BYTE, &unique_id_vec[0], sizeof(scclUniqueId), MPI_BYTE, MPI_COMM_WORLD);
// for(int i = 0; i < nRanks; ++i) {
// auto root_handle = reinterpret_cast<const struct BootstrapHandle*>(unique_ids_chr + i * sizeof(struct BootstrapHandle));
// printf("rank=%d, i=%d, unique_ids hosthash=%lu\n", root_handle->rank, i, root_handle->hostHash);
// }
// ByteSpan<struct BootstrapHandle> unique_ids_span(unique_ids_chr, nRanks * sizeof(struct BootstrapHandle));
// // -------------------------- 2.设置基础信息 ----------------------------------- //
// INFO(SCCL_LOG_TOPO, "Bootstrap ...\n");
// struct scclRankInfo rank_info;
// rank_info.rank = rank;
// rank_info.nRanks = nRanks;
// // 在每个进程中设置 root_handle 的值
// root_handle.rank = rank_info->rank;
// root_handle.hostHash = getHostHash();
// scclSocketAddress_t localSocketAddr = sccl_bootstrap->getLocalSocketAddr();
// memcpy(&root_handle.addr, &localSocketAddr, sizeof(scclSocketAddress_t));
// #if 1
// char line[100];
// sprintf(line, "pos 55: rank=%d", rank);
// SCCLCHECK(hardware::net::printSocketAddr(&root_handle.addr, line));
// printf("root_handle.hostHash rank=%d, hash=%lu\n", rank, root_handle.hostHash);
// #endif
// // -------------------------- 3.收集所有进程的 root_handle 信息 ----------------------------------- //
// std::vector<char> recvBuffer(nRanks * sendBuffer.size());
// SCCLCHECK(mpi::wrap_mpi_allgather(sendBuffer.data(), sendBuffer.size(), MPI_BYTE, recvBuffer.data(), sendBuffer.size(), MPI_BYTE, MPI_COMM_WORLD));
// -------------------------- 4.设置各个节点的基础信息 ----------------------------------- //
// SCCLCHECK(sccl_bootstrap->bootstrapInit(rank_info, recvBuffer.data()));
// -------------------------- 5.根据各个节点的基础信息计算topo结果 ----------------------------------- //
// -------------------------- 5.根据各个节点的基础信息计算topo结果 ----------------------------------- //
...
@@ -84,14 +50,12 @@ scclResult_t sccl_finalize() {
...
@@ -84,14 +50,12 @@ scclResult_t sccl_finalize() {
// 设置一些全局变量的重置和销毁
// 设置一些全局变量的重置和销毁
// 设置socket等硬件监听的关闭
// 设置socket等硬件监听的关闭
// void BootstrapComm::destroy() {
// void BootstrapComm::destroy() {
if
(
bootstrap_comm
.
nRanks
>
0
)
{
//
if(bootstrap_comm.nRanks > 0) {
bootstrap_comm
.
destroy
();
//
bootstrap_comm.destroy();
}
//
}
return
scclSuccess
;
return
scclSuccess
;
}
}
}
// namespace bootstrap
}
// namespace topology
}
// namespace hardware
}
// namespace hardware
}
// namespace sccl
}
// namespace sccl
src/hardware/hardware.h
View file @
571a75b5
...
@@ -6,15 +6,14 @@
...
@@ -6,15 +6,14 @@
namespace
sccl
{
namespace
sccl
{
namespace
hardware
{
namespace
hardware
{
namespace
topology
{
namespace
bootstrap
{
typedef
topology
::
bootstrap
::
scclUniqueId
scclUniqueId
;
typedef
topology
::
bootstrap
::
BootstrapHandle_t
BootstrapHandle_t
;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
scclResult_t
scclGetUniqueId
(
scclUniqueId
*
unique_id
);
scclResult_t
scclGetUniqueId
(
scclUniqueId
*
unique_id
);
scclResult_t
sccl_init
(
const
scclUniqueId
*
unique_id
,
int
rank
,
int
nRanks
);
scclResult_t
sccl_init
(
const
scclUniqueId
*
unique_id
,
int
rank
,
int
nRanks
);
scclResult_t
sccl_finalize
();
scclResult_t
sccl_finalize
();
}
// namespace bootstrap
}
// namespace topology
}
// namespace hardware
}
// namespace hardware
}
// namespace sccl
}
// namespace sccl
src/hardware/hardware_utils.h
View file @
571a75b5
...
@@ -6,9 +6,8 @@
...
@@ -6,9 +6,8 @@
namespace
sccl
{
namespace
sccl
{
namespace
hardware
{
namespace
hardware
{
namespace
ops
{
////
}
// namespace ops
// 实现类似于std::span的功能,将字节数组转换为类型数组
}
// namespace hardware
}
// namespace hardware
}
// namespace sccl
}
// namespace sccl
src/hardware/net/ipc_socket/ipc_socket.cpp
View file @
571a75b5
This diff is collapsed.
Click to expand it.
src/hardware/net/ipc_socket/ipc_socket.h
View file @
571a75b5
...
@@ -37,7 +37,7 @@ struct DataPackage {
...
@@ -37,7 +37,7 @@ struct DataPackage {
};
};
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
class
scclIpcSocket
{
typedef
class
scclIpcSocket
{
public:
public:
// 构造函数和析构函数
// 构造函数和析构函数
scclIpcSocket
(
int
localRank
,
int
nlocalRanks
,
uint64_t
hash
,
volatile
uint32_t
*
abortFlag
=
nullptr
);
scclIpcSocket
(
int
localRank
,
int
nlocalRanks
,
uint64_t
hash
,
volatile
uint32_t
*
abortFlag
=
nullptr
);
...
@@ -62,19 +62,16 @@ public:
...
@@ -62,19 +62,16 @@ public:
// 通过Unix域套接字发送/接收数据到指定目标
// 通过Unix域套接字发送/接收数据到指定目标
scclResult_t
scclIpcSocketSendData
(
const
void
*
data
,
size_t
dataLen
,
int
dst_rank
);
scclResult_t
scclIpcSocketSendData
(
const
void
*
data
,
size_t
dataLen
,
int
dst_rank
);
scclResult_t
scclIpcSocketRecvData
(
void
*
buffer
,
size_t
bufferLen
,
size_t
*
receivedLen
);
scclResult_t
scclIpcSocketRecvData
(
void
*
buffer
,
size_t
bufferLen
,
size_t
*
receivedLen
,
int
*
src_rank
);
// 通过Unix域套接字发送/接收数据到指定目标,
并发送ack确保发送成功
// 通过Unix域套接字发送/接收数据到指定目标,
有ACK信息
scclResult_t
scclIpcSocketSendDataWithAck
(
const
void
*
data
,
size_t
dataLen
,
int
dst_rank
);
scclResult_t
scclIpcSocketSendDataWithAck
(
const
void
*
data
,
size_t
dataLen
,
int
dst_rank
);
scclResult_t
scclIpcSocketRecvData
AndSend
Ack
(
void
*
buffer
,
size_t
bufferLen
,
size_t
*
receivedLen
,
int
src_rank
);
scclResult_t
scclIpcSocketRecvData
With
Ack
(
void
*
buffer
,
size_t
bufferLen
,
size_t
*
receivedLen
,
int
*
src_rank
);
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
// local rank内的allgather操作
。
保证接收顺序
// local rank内的allgather操作
,
保证接收顺序
scclResult_t
scclIpcSocketAllgather
(
const
void
*
sendData
,
void
*
recvData
,
size_t
dataLen
);
scclResult_t
scclIpcSocketAllgather
(
const
void
*
sendData
,
void
*
recvData
,
size_t
dataLen
);
// local rank内的allgather操作。为了性能,不保证接收顺序,所以发送的信息中需要添加进程ID
scclResult_t
scclIpcSocketAllgatherSync
(
const
void
*
sendData
,
void
*
recvData
,
size_t
dataLen
);
// local rank内的broadcast操作
// local rank内的broadcast操作
scclResult_t
scclIpcSocketBroadcast
(
void
*
data
,
size_t
dataLen
,
int
root
);
scclResult_t
scclIpcSocketBroadcast
(
void
*
data
,
size_t
dataLen
,
int
root
);
...
@@ -82,6 +79,12 @@ private:
...
@@ -82,6 +79,12 @@ private:
// 初始化IPC套接字
// 初始化IPC套接字
scclResult_t
scclIpcSocketInit
(
volatile
uint32_t
*
abortFlag
);
scclResult_t
scclIpcSocketInit
(
volatile
uint32_t
*
abortFlag
);
scclResult_t
getScclIpcSocknameStr
(
int
rank
,
uint64_t
hash
,
char
*
out_str
,
int
*
out_len
);
scclResult_t
getScclIpcSocknameStr
(
int
rank
,
uint64_t
hash
,
char
*
out_str
,
int
*
out_len
);
// 通过Unix域套接字发送/接收数据到指定目标,不加锁执行
scclResult_t
scclIpcSocketSendDataBasic
(
const
void
*
data
,
size_t
dataLen
,
int
dst_rank
);
scclResult_t
scclIpcSocketRecvDataBasic
(
void
*
buffer
,
size_t
bufferLen
,
size_t
*
receivedLen
);
// 通过Unix域套接字发送/接收数据到指定目标,不加锁执行
scclResult_t
scclIpcSocketSendDataAndRank
(
const
void
*
data
,
size_t
dataLen
,
int
dst_rank
);
scclResult_t
scclIpcSocketRecvDataAndRank
(
void
*
buffer
,
size_t
bufferLen
,
size_t
*
receivedLen
,
int
*
src_rank
);
private:
private:
// 定义并初始化一个 scclIpcSocket 结构体,用于处理 IPC 套接字连接
// 定义并初始化一个 scclIpcSocket 结构体,用于处理 IPC 套接字连接
...
@@ -100,6 +103,7 @@ private:
...
@@ -100,6 +103,7 @@ private:
// 线程池指针
// 线程池指针
ThreadPool
*
pthread_pool
=
nullptr
;
ThreadPool
*
pthread_pool
=
nullptr
;
// 设置超时时间为无限长
// 设置超时时间为无限长
int
timeoutMs
=
-
1
;
int
timeoutMs
=
-
1
;
...
@@ -107,7 +111,7 @@ private:
...
@@ -107,7 +111,7 @@ private:
static
constexpr
int
ACK_SIZE
=
8
;
static
constexpr
int
ACK_SIZE
=
8
;
// 假设 CHUNK_SIZE 是一个合适的块大小,例如 64KB
// 假设 CHUNK_SIZE 是一个合适的块大小,例如 64KB
static
constexpr
size_t
CHUNK_SIZE
=
64
*
1024
;
static
constexpr
size_t
CHUNK_SIZE
=
64
*
1024
;
};
}
scclIpcSocket_t
;
}
// namespace ipc_socket
}
// namespace ipc_socket
}
// namespace net
}
// namespace net
...
...
src/hardware/net/net_ib/net_ib.cpp
View file @
571a75b5
...
@@ -1154,8 +1154,10 @@ scclResult_t scclNetIb::getProperties(int dev, scclNetProperties_t* props) {
...
@@ -1154,8 +1154,10 @@ scclResult_t scclNetIb::getProperties(int dev, scclNetProperties_t* props) {
if
(
scclIbGdrSupport
(
dev
)
==
scclSuccess
)
{
if
(
scclIbGdrSupport
(
dev
)
==
scclSuccess
)
{
props
->
ptrSupport
|=
SCCL_PTR_CUDA
;
// GDR support via nv_peermem
props
->
ptrSupport
|=
SCCL_PTR_CUDA
;
// GDR support via nv_peermem
}
}
if
(
scclIbDmaBufSupport
(
dev
)
==
scclSuccess
)
{
if
(
getDmaBufEnable
()
!=
0
)
{
props
->
ptrSupport
|=
SCCL_PTR_DMABUF
;
// GDR support via DMA-BUF
if
(
scclIbDmaBufSupport
(
dev
)
==
scclSuccess
)
{
props
->
ptrSupport
|=
SCCL_PTR_DMABUF
;
// GDR support via DMA-BUF
}
}
}
props
->
speed
=
scclIbDevs
[
dev
].
speed
;
props
->
speed
=
scclIbDevs
[
dev
].
speed
;
props
->
latency
=
0
;
// Not set
props
->
latency
=
0
;
// Not set
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment