Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishen01
Sccl
Commits
a4ac3320
Commit
a4ac3320
authored
Jul 07, 2025
by
lishen
Browse files
通过线程池实现ipcsocket,满足节点内通信
parent
d9d23f34
Changes
132
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
271 additions
and
1645 deletions
+271
-1645
examples/1_connection/3_sccl_ipc_socket/4_socket_mpi_data_sccl.cpp
...1_connection/3_sccl_ipc_socket/4_socket_mpi_data_sccl.cpp
+76
-0
examples/1_connection/3_sccl_ipc_socket/compile1.sh
examples/1_connection/3_sccl_ipc_socket/compile1.sh
+35
-0
examples/1_connection/3_sccl_ipc_socket/compile2.sh
examples/1_connection/3_sccl_ipc_socket/compile2.sh
+35
-0
examples/1_connection/3_sccl_ipc_socket/compile3.sh
examples/1_connection/3_sccl_ipc_socket/compile3.sh
+35
-0
examples/1_connection/3_sccl_ipc_socket/testfile.txt
examples/1_connection/3_sccl_ipc_socket/testfile.txt
+2
-0
examples/1_connection/3_socket_comm/client.cpp
examples/1_connection/3_socket_comm/client.cpp
+0
-54
examples/1_connection/3_socket_comm/get_ip.cpp
examples/1_connection/3_socket_comm/get_ip.cpp
+0
-44
examples/1_connection/3_socket_comm/server.cpp
examples/1_connection/3_socket_comm/server.cpp
+0
-81
examples/1_connection/3_socket_comm/socket.cpp
examples/1_connection/3_socket_comm/socket.cpp
+0
-905
examples/1_connection/3_socket_comm/socket.h
examples/1_connection/3_socket_comm/socket.h
+0
-235
examples/1_connection/3_socket_comm/test_socket_itf.cpp
examples/1_connection/3_socket_comm/test_socket_itf.cpp
+0
-281
examples/2_topo/0_demo_topo/compile_topo.sh
examples/2_topo/0_demo_topo/compile_topo.sh
+7
-7
examples/2_topo/1_demo_rocm/compile_rocm_smi.sh
examples/2_topo/1_demo_rocm/compile_rocm_smi.sh
+2
-1
examples/2_topo/1_demo_rocm/test_rocm_smi.cpp
examples/2_topo/1_demo_rocm/test_rocm_smi.cpp
+4
-4
examples/2_topo/2_bootstrap/1_mpi_init.cpp
examples/2_topo/2_bootstrap/1_mpi_init.cpp
+24
-5
examples/2_topo/2_bootstrap/compile_mpi.sh
examples/2_topo/2_bootstrap/compile_mpi.sh
+27
-12
examples/2_topo/2_bootstrap/hostfile
examples/2_topo/2_bootstrap/hostfile
+2
-0
examples/2_topo/2_bootstrap/hostfile2
examples/2_topo/2_bootstrap/hostfile2
+2
-0
examples/get_cpp_files.py
examples/get_cpp_files.py
+18
-0
src/hardware/comm.h
src/hardware/comm.h
+2
-16
No files found.
examples/1_connection/3_sccl_ipc_socket/4_socket_mpi_data_sccl.cpp
0 → 100644
View file @
a4ac3320
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <vector> // 引入vector库
#include <thread> // 为了使用 std::this_thread::sleep_for
#include "mpi.h"
#include "net.h"
#include "ipc_socket.h"
#include "thread_pool.h"
using
namespace
sccl
;
// Short alias for the IPC socket class used throughout this example.
using scclIpcSocket_t = class sccl::hardware::net::ipc_socket::scclIpcSocket;
/**
 * @brief Send one buffer through the IPC socket; abort the whole MPI job if the send fails.
 *
 * @tparam T         Socket type; must provide scclIpcSocketSendData(data, dataLen, dst_rank, dst_hash).
 * @param ipcsocket  Socket object the send is issued on.
 * @param data       First byte of the payload.
 * @param dataLen    Payload length in bytes.
 * @param dst_rank   Destination rank, forwarded unchanged to the socket call.
 * @param dst_hash   Hash value, forwarded unchanged to the socket call.
 */
template <typename T>
void send_data(T* ipcsocket, const void* data, size_t dataLen, int dst_rank, uint64_t dst_hash)
{
    const auto status = ipcsocket->scclIpcSocketSendData(data, dataLen, dst_rank, dst_hash);
    if(status == scclSuccess) {
        return;
    }

    // Failure path: report and tear down every rank.
    perror("Failed to send data");
    MPI_Abort(MPI_COMM_WORLD, 1);
}
/**
 * @brief Receive one message from the IPC socket; abort the whole MPI job if the receive fails.
 *
 * @tparam T           Socket type; must provide scclIpcSocketRecvData(buffer, bufferLen, receivedLen).
 * @param ipcsocket    Socket object the receive is issued on.
 * @param buffer       Destination storage for the incoming bytes.
 * @param bufferLen    Capacity of @p buffer in bytes.
 * @param receivedLen  Forwarded to the socket call, which is expected to store the received length there.
 */
template <typename T>
void recv_data(T* ipcsocket, void* buffer, size_t bufferLen, size_t* receivedLen)
{
    const auto status = ipcsocket->scclIpcSocketRecvData(buffer, bufferLen, receivedLen);
    if(status == scclSuccess) {
        return;
    }

    // Failure path: report and tear down every rank.
    perror("Failed to receive data");
    MPI_Abort(MPI_COMM_WORLD, 1);
}
/**
 * @brief Example driver: every rank sends a short text message to every other rank and posts
 *        one receive per peer, all through scclIpcSocket and a thread pool.
 *
 * Each receive task gets its own slot in both the data buffer and the length array, so the
 * concurrently running tasks never write to the same object.
 *
 * @param argc  Argument count, handed to MPI_Init.
 * @param argv  Argument vector, handed to MPI_Init.
 * @return 0 on normal completion.
 */
int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int dst_hash = 12345;
    scclIpcSocket_t ipcsocket(rank, dst_hash);

    int sendDataLen = 256;
    std::vector<char> sendData(sendDataLen);
    std::vector<char> recvData(size * sendDataLen);
    // One zero-initialised length per receive slot. The vector is never resized, so the
    // element addresses handed to the tasks stay valid.
    std::vector<size_t> receivedLens(size);

    // Fill the outgoing message.
    snprintf(sendData.data(), sendData.size(), "Data from process %d", rank);

    auto pthpool = ThreadPool(size * 2);

    // Queue one send and one receive for every other rank.
    for(int i = 0; i < size; ++i) {
        if(i != rank) {
            auto task_send = std::bind(send_data<scclIpcSocket_t>, &ipcsocket, sendData.data(), sendData.size(), i, dst_hash);
            pthpool.enqueue(task_send);

            // The receive call names no source rank; slot i simply holds the data of the
            // receive task that was queued for iteration i.
            auto task_recv =
                std::bind(recv_data<scclIpcSocket_t>, &ipcsocket, recvData.data() + i * sendDataLen, sendDataLen, &receivedLens[i]);
            pthpool.enqueue(task_recv);
        }
    }

    // NOTE(review): this fixed delay is the only thing that lets the queued tasks finish before
    // the results are read; the ThreadPool API is not visible here, so no explicit wait is used.
    std::this_thread::sleep_for(std::chrono::seconds(2));

    // Print what was received (the slot for this rank itself stays empty).
    for(int i = 0; i < size; ++i) {
        if(i != rank) {
            printf("sendData.size()=%d, receivedLen=%d\n", sendDataLen, int(receivedLens[i]));
        }
        printf("Process %d received from process %d: %s\n", rank, i, recvData.data() + i * sendDataLen);
    }

    MPI_Finalize();
    return 0;
}
/*
Single-node run:
SCCL_DEBUG_LEVEL=ABORT SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root -np 8 3_socket_mpi_data
*/
examples/1_connection/3_sccl_ipc_socket/compile1.sh
0 → 100644
View file @
a4ac3320
#!/bin/sh
# Build the 1_socket_mpi_fd example with hipcc.
#
# The install locations default to the original developer layout and can be
# overridden from the environment:
#   SCCL_ROOT       - SCCL checkout (contains src/)
#   SCCL_OMPI_ROOT  - Open MPI install prefix (contains include/ and lib/)
#   SCCL_DTK_ROOT   - DTK install prefix (contains include/ and lib/)
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"
SCCL_OMPI_ROOT="${SCCL_OMPI_ROOT:-/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi}"
SCCL_DTK_ROOT="${SCCL_DTK_ROOT:-/opt/dtk}"
SRC="${SCCL_ROOT}/src"

hipcc ./1_socket_mpi_fd.cpp \
    "${SRC}/hardware/hardware_utils.cpp" \
    "${SRC}/hardware/net/net_ib/ibvsymbols.cpp" \
    "${SRC}/hardware/net/net_ib/ibvwrap.cpp" \
    "${SRC}/hardware/net/net_ib/net_ib.cpp" \
    "${SRC}/hardware/net/net_socket/net_socket.cpp" \
    "${SRC}/hardware/net/net_socket/socket.cpp" \
    "${SRC}/hardware/net/net.cpp" \
    "${SRC}/hardware/net/net_utils.cpp" \
    "${SRC}/hardware/net/rocm_wrap.cpp" \
    "${SRC}/hardware/net/ipc_socket/ipc_socket.cpp" \
    "${SRC}/utils/archinfo.cpp" \
    "${SRC}/utils/param.cpp" \
    "${SRC}/utils/utils.cpp" \
    "${SRC}/utils/thread_pool.cpp" \
    -o 1_socket_mpi_fd \
    -std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ -Wno-return-type \
    -I ./ -I /usr/include -I "${SCCL_DTK_ROOT}/include" \
    -I "${SCCL_OMPI_ROOT}/include/" \
    -I "${SRC}" \
    -I "${SRC}/utils/" \
    -I "${SRC}/include/" \
    -I "${SRC}/hardware/net/net_ib/" \
    -I "${SRC}/hardware/net/net_socket/" \
    -I "${SRC}/hardware/net/ipc_socket/" \
    -I "${SRC}/hardware/net/" \
    -I "${SRC}/hardware/topology/bootstrap/" \
    -I "${SRC}/hardware/" \
    -I "${SRC}/hardware/topology/" \
    -L "${SCCL_ROOT}" \
    -L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
    -L "${SCCL_OMPI_ROOT}/lib" -lmpi \
    -L "${SCCL_DTK_ROOT}/lib" -lamdhip64 -lrocm-core -lrocm_smi64 -pthread
examples/1_connection/3_sccl_ipc_socket/compile2.sh
0 → 100644
View file @
a4ac3320
#!/bin/sh
# Build the 2_socket_mpi_fd_pthpool example with hipcc.
#
# The install locations default to the original developer layout and can be
# overridden from the environment:
#   SCCL_ROOT       - SCCL checkout (contains src/)
#   SCCL_OMPI_ROOT  - Open MPI install prefix (contains include/ and lib/)
#   SCCL_DTK_ROOT   - DTK install prefix (contains include/ and lib/)
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"
SCCL_OMPI_ROOT="${SCCL_OMPI_ROOT:-/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi}"
SCCL_DTK_ROOT="${SCCL_DTK_ROOT:-/opt/dtk}"
SRC="${SCCL_ROOT}/src"

hipcc ./2_socket_mpi_fd_pthpool.cpp \
    "${SRC}/hardware/hardware_utils.cpp" \
    "${SRC}/hardware/net/net_ib/ibvsymbols.cpp" \
    "${SRC}/hardware/net/net_ib/ibvwrap.cpp" \
    "${SRC}/hardware/net/net_ib/net_ib.cpp" \
    "${SRC}/hardware/net/net_socket/net_socket.cpp" \
    "${SRC}/hardware/net/net_socket/socket.cpp" \
    "${SRC}/hardware/net/net.cpp" \
    "${SRC}/hardware/net/net_utils.cpp" \
    "${SRC}/hardware/net/rocm_wrap.cpp" \
    "${SRC}/hardware/net/ipc_socket/ipc_socket.cpp" \
    "${SRC}/utils/archinfo.cpp" \
    "${SRC}/utils/param.cpp" \
    "${SRC}/utils/utils.cpp" \
    "${SRC}/utils/thread_pool.cpp" \
    -o 2_socket_mpi_fd_pthpool \
    -std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ -Wno-return-type \
    -I ./ -I /usr/include -I "${SCCL_DTK_ROOT}/include" \
    -I "${SCCL_OMPI_ROOT}/include/" \
    -I "${SRC}" \
    -I "${SRC}/utils/" \
    -I "${SRC}/include/" \
    -I "${SRC}/hardware/net/net_ib/" \
    -I "${SRC}/hardware/net/net_socket/" \
    -I "${SRC}/hardware/net/ipc_socket/" \
    -I "${SRC}/hardware/net/" \
    -I "${SRC}/hardware/topology/bootstrap/" \
    -I "${SRC}/hardware/" \
    -I "${SRC}/hardware/topology/" \
    -L "${SCCL_ROOT}" \
    -L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
    -L "${SCCL_OMPI_ROOT}/lib" -lmpi \
    -L "${SCCL_DTK_ROOT}/lib" -lamdhip64 -lrocm-core -lrocm_smi64 -pthread
examples/1_connection/3_sccl_ipc_socket/compile3.sh
0 → 100644
View file @
a4ac3320
#!/bin/sh
# Build the 3_socket_mpi_data example with hipcc.
#
# The install locations default to the original developer layout and can be
# overridden from the environment:
#   SCCL_ROOT       - SCCL checkout (contains src/)
#   SCCL_OMPI_ROOT  - Open MPI install prefix (contains include/ and lib/)
#   SCCL_DTK_ROOT   - DTK install prefix (contains include/ and lib/)
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"
SCCL_OMPI_ROOT="${SCCL_OMPI_ROOT:-/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi}"
SCCL_DTK_ROOT="${SCCL_DTK_ROOT:-/opt/dtk}"
SRC="${SCCL_ROOT}/src"

hipcc ./3_socket_mpi_data.cpp \
    "${SRC}/hardware/hardware_utils.cpp" \
    "${SRC}/hardware/net/net_ib/ibvsymbols.cpp" \
    "${SRC}/hardware/net/net_ib/ibvwrap.cpp" \
    "${SRC}/hardware/net/net_ib/net_ib.cpp" \
    "${SRC}/hardware/net/net_socket/net_socket.cpp" \
    "${SRC}/hardware/net/net_socket/socket.cpp" \
    "${SRC}/hardware/net/net.cpp" \
    "${SRC}/hardware/net/net_utils.cpp" \
    "${SRC}/hardware/net/rocm_wrap.cpp" \
    "${SRC}/hardware/net/ipc_socket/ipc_socket.cpp" \
    "${SRC}/utils/archinfo.cpp" \
    "${SRC}/utils/param.cpp" \
    "${SRC}/utils/utils.cpp" \
    "${SRC}/utils/thread_pool.cpp" \
    -o 3_socket_mpi_data \
    -std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ -Wno-return-type \
    -I ./ -I /usr/include -I "${SCCL_DTK_ROOT}/include" \
    -I "${SCCL_OMPI_ROOT}/include/" \
    -I "${SRC}" \
    -I "${SRC}/utils/" \
    -I "${SRC}/include/" \
    -I "${SRC}/hardware/net/net_ib/" \
    -I "${SRC}/hardware/net/net_socket/" \
    -I "${SRC}/hardware/net/ipc_socket/" \
    -I "${SRC}/hardware/net/" \
    -I "${SRC}/hardware/topology/bootstrap/" \
    -I "${SRC}/hardware/" \
    -I "${SRC}/hardware/topology/" \
    -L "${SCCL_ROOT}" \
    -L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
    -L "${SCCL_OMPI_ROOT}/lib" -lmpi \
    -L "${SCCL_DTK_ROOT}/lib" -lamdhip64 -lrocm-core -lrocm_smi64 -pthread
examples/1_connection/3_sccl_ipc_socket/testfile.txt
0 → 100644
View file @
a4ac3320
hello, ipc socket!
examples/1_connection/3_socket_comm/client.cpp
deleted
100644 → 0
View file @
d9d23f34
#include <iostream>
#include <string>
#include <cstring>
#include <unistd.h>
#include <arpa/inet.h>
/**
 * @brief Connect to a TCP server, send one greeting and print the reply.
 *
 * Any failure (socket creation, address parsing, connect, send, read) is reported on
 * std::cerr and terminates the process with EXIT_FAILURE after closing the socket.
 *
 * @param server_ip    Dotted-decimal IPv4 address of the server.
 * @param server_port  TCP port of the server (host byte order).
 */
void start_client(const std::string& server_ip, int server_port)
{
    char buffer[1024] = {0};
    const std::string message = "你好,服务器!";

    // Create the socket descriptor.
    const int sock = socket(AF_INET, SOCK_STREAM, 0);
    if(sock < 0) {
        std::cerr << "Socket creation error" << std::endl;
        exit(EXIT_FAILURE);
    }

    // Zero-initialise so that padding bytes are not left indeterminate.
    struct sockaddr_in serv_addr{};
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_port   = htons(server_port);

    // Convert the textual IPv4 address to binary form.
    if(inet_pton(AF_INET, server_ip.c_str(), &serv_addr.sin_addr) <= 0) {
        std::cerr << "Invalid address/ Address not supported" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }

    // Connect to the server.
    if(connect(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
        std::cerr << "Connection Failed" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }

    // Send the greeting.
    if(send(sock, message.c_str(), message.length(), 0) < 0) {
        std::cerr << "Send failed" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }
    std::cout << "消息已发送" << std::endl;

    // Read the reply; keep one byte free so the buffer is always NUL-terminated
    // when it is streamed as a C string below.
    const ssize_t valread = read(sock, buffer, sizeof(buffer) - 1);
    if(valread < 0) {
        std::cerr << "Read failed" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }
    buffer[valread] = '\0';
    std::cout << "收到的响应: " << buffer << std::endl;

    // Close the connection.
    close(sock);
}
/// Entry point: runs the client once against the fixed endpoint 10.16.1.37:6842.
int main()
{
    const std::string host = "10.16.1.37";
    const int port         = 6842;

    start_client(host, port);
    return EXIT_SUCCESS;
}
\ No newline at end of file
examples/1_connection/3_socket_comm/get_ip.cpp
deleted
100644 → 0
View file @
d9d23f34
#include <iostream>
#include <ifaddrs.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <stdlib.h>
#include <netdb.h>
#include <unistd.h>
#include <ifaddrs.h>
#include <net/if.h>
#include <vector>
#include <utility>
#include <unordered_set>
#include <unistd.h>
#include <sys/syscall.h>
#define NI_MAXHOST 1025
/**
 * @brief Print the name and numeric IPv4 address of every local interface entry
 *        returned by getifaddrs().
 *
 * Exits with EXIT_FAILURE if the interface list cannot be obtained. An entry whose
 * address cannot be converted to text is reported on std::cerr and skipped.
 */
void get_ip_addresses()
{
    struct ifaddrs* ifaddr = NULL;
    if(getifaddrs(&ifaddr) == -1) {
        perror("getifaddrs");
        exit(EXIT_FAILURE);
    }

    char host[NI_MAXHOST];
    for(struct ifaddrs* ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
        // Only entries that carry an IPv4 address are of interest.
        if(ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET)
            continue;

        const int rc = getnameinfo(ifa->ifa_addr, sizeof(struct sockaddr_in), host, sizeof(host), NULL, 0, NI_NUMERICHOST);
        if(rc != 0) {
            // host is not filled in on failure, so it must not be printed.
            std::cerr << "getnameinfo failed for " << ifa->ifa_name << ": " << gai_strerror(rc) << std::endl;
            continue;
        }
        std::cout << "Interface: " << ifa->ifa_name << " Address: " << host << std::endl;
    }

    freeifaddrs(ifaddr);
}
/// Entry point: lists the local IPv4 interface addresses once and exits.
int main()
{
    get_ip_addresses();
    return EXIT_SUCCESS;
}
\ No newline at end of file
examples/1_connection/3_socket_comm/server.cpp
deleted
100644 → 0
View file @
d9d23f34
#include <iostream>
#include <string>
#include <cstring>
#include <unistd.h>
#include <arpa/inet.h>
#include <ifaddrs.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
/**
 * @brief Run a single-connection TCP echo-acknowledge server on port 6842.
 *
 * Binds to all local interfaces, accepts one client, and for every chunk of data
 * received prints it and answers with a fixed acknowledgement. The loop ends when
 * the peer closes the connection or an I/O error occurs. Set-up failures are
 * reported with perror() and terminate the process with EXIT_FAILURE.
 */
void start_server()
{
    char buffer[1024] = {0};
    const std::string message = "消息已收到";

    // Create the listening socket; socket() reports failure with -1 (0 is a valid descriptor).
    const int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if(server_fd < 0) {
        perror("socket failed");
        exit(EXIT_FAILURE);
    }

    // Bind address and port.
    struct sockaddr_in address{};
    address.sin_family      = AF_INET;
    address.sin_addr.s_addr = INADDR_ANY; // accept connections on every local address
    address.sin_port        = htons(6842);
    if(bind(server_fd, (struct sockaddr*)&address, sizeof(address)) < 0) {
        perror("bind failed");
        close(server_fd);
        exit(EXIT_FAILURE);
    }

    // Query the port that was actually bound.
    socklen_t len = sizeof(address);
    if(getsockname(server_fd, (struct sockaddr*)&address, &len) == -1) {
        perror("getsockname failed");
        close(server_fd);
        exit(EXIT_FAILURE);
    }
    int port = ntohs(address.sin_port);
    std::cout << "服务器已启动,端口: " << port << std::endl;

    // Listen for connections.
    if(listen(server_fd, 3) < 0) {
        perror("listen");
        close(server_fd);
        exit(EXIT_FAILURE);
    }
    std::cout << "等待连接..." << std::endl;

    // Accept one client connection.
    socklen_t addrlen    = sizeof(address);
    const int new_socket = accept(server_fd, (struct sockaddr*)&address, &addrlen);
    if(new_socket < 0) {
        perror("accept");
        close(server_fd);
        exit(EXIT_FAILURE);
    }

    while(true) {
        // Leave one byte free so the data can be NUL-terminated before printing.
        const ssize_t valread = read(new_socket, buffer, sizeof(buffer) - 1);
        if(valread == 0) {
            break; // peer closed the connection
        }
        if(valread < 0) {
            perror("read");
            break;
        }
        buffer[valread] = '\0';
        std::cout << "收到的消息: " << buffer << std::endl;

        if(send(new_socket, message.c_str(), message.length(), 0) < 0) {
            perror("send");
            break;
        }
    }

    // Close the connection.
    close(new_socket);
    close(server_fd);
}
/// Entry point: runs the server until its single client disconnects.
int main()
{
    start_server();
    return EXIT_SUCCESS;
}
\ No newline at end of file
examples/1_connection/3_socket_comm/socket.cpp
deleted
100644 → 0
View file @
d9d23f34
This diff is collapsed.
Click to expand it.
examples/1_connection/3_socket_comm/socket.h
deleted
100644 → 0
View file @
d9d23f34
#pragma once
#include "debug.h"
#include "check.h"
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <fcntl.h>
#include <poll.h>
using
namespace
sccl
;
struct
netIf
{
char
prefix
[
64
];
int
port
;
};
// Per-thread switch read by SCCLCHECKGOTO: its failure INFO line is emitted only while this is 0.
// Being `static` in a header, every translation unit that includes this file gets its own copy.
static thread_local int scclDebugNoWarn = 0;

// Run `call`; on a -1 result log a warning and return scclSystemError from the enclosing function.
// The result itself is discarded.
#define SYSCHECK(call, name)             \
    do {                                 \
        int retval;                      \
        SYSCHECKVAL(call, name, retval); \
    } while(false)

// Same as SYSCHECK, but the result of `call` is left in the caller-supplied `retval`.
#define SYSCHECKVAL(call, name, retval)                            \
    do {                                                           \
        SYSCHECKSYNC(call, name, retval);                          \
        if(retval == -1) {                                         \
            WARN("Call to " name " failed : %s", strerror(errno)); \
            return scclSystemError;                                \
        }                                                          \
    } while(false)

// Evaluate `call` into `retval`, repeating for as long as it returns -1 with errno set to
// EINTR, EWOULDBLOCK or EAGAIN. Any other outcome leaves the loop.
#define SYSCHECKSYNC(call, name, retval)                                                       \
    do {                                                                                       \
        retval = call;                                                                         \
        if(retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) {      \
            INFO(SCCL_LOG_CODEALL, "Call to " name " returned %s, retrying", strerror(errno)); \
        } else {                                                                               \
            break;                                                                             \
        }                                                                                      \
    } while(true)

// Return scclSystemError from the enclosing function when `statement` equals `value`.
// NOTE(review): this and the *GOTO macros below end in `while(0);` -- the built-in semicolon
// means a use followed by its own ';' expands to two statements.
#define EQCHECK(statement, value)                                                                             \
    do {                                                                                                      \
        if((statement) == value) {                                                                            \
            /* Print the back trace*/                                                                         \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, scclSystemError, strerror(errno)); \
            return scclSystemError;                                                                           \
        }                                                                                                     \
    } while(0);

// When `statement` differs from `value`: set RES to scclSystemError, log, and jump to `label`.
#define NEQCHECKGOTO(statement, value, RES, label)                                                \
    do {                                                                                          \
        if((statement) != value) {                                                                \
            /* Print the back trace*/                                                             \
            RES = scclSystemError;                                                                \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
            goto label;                                                                           \
        }                                                                                         \
    } while(0);

// When `statement` evaluates to -1: set RES to scclSystemError, log, and jump to `label`.
#define SYSCHECKGOTO(statement, RES, label)                                                       \
    do {                                                                                          \
        if((statement) == -1) {                                                                   \
            /* Print the back trace*/                                                             \
            RES = scclSystemError;                                                                \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
            goto label;                                                                           \
        }                                                                                         \
    } while(0);

// Store the result of `call` in RES; jump to `label` unless it is scclSuccess or scclInProgress.
// A "check pass" INFO line is logged on the non-failing path.
#define SCCLCHECKGOTO(call, RES, label)                                                 \
    do {                                                                                \
        RES = call;                                                                     \
        if(RES != scclSuccess && RES != scclInProgress) {                               \
            /* Print the back trace*/                                                   \
            if(scclDebugNoWarn == 0)                                                    \
                INFO(SCCL_LOG_CODEALL, "%s:%d -> %d", __FILE__, __LINE__, RES);         \
            goto label;                                                                 \
        }                                                                               \
        INFO(SCCL_LOG_CODEALL, "check pass %s:%d -> %d", __FILE__, __LINE__, RES);      \
    } while(0);

// When `statement` equals `value`: set RES to scclSystemError, log, and jump to `label`.
#define EQCHECKGOTO(statement, value, RES, label)                                                 \
    do {                                                                                          \
        if((statement) == value) {                                                                \
            /* Print the back trace*/                                                             \
            RES = scclSystemError;                                                                \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
            goto label;                                                                           \
        }                                                                                         \
    } while(0);
/**
 * @brief Parse a comma-separated list of "name" or "name:port" items into ifList.
 *
 * For every non-empty name an entry is written: `prefix` receives the name and `port`
 * receives atoi() of the text after ':' or -1 when no ':' follows the name. Names longer
 * than the prefix buffer are truncated to fit instead of overrunning it.
 *
 * @param string   NUL-terminated list text; NULL yields 0.
 * @param ifList   Output array with room for at least @p maxList entries.
 * @param maxList  Maximum number of entries to write; values <= 0 yield 0 and write nothing.
 * @return Number of entries written to @p ifList.
 */
static int parseStringList(const char* string, struct netIf* ifList, int maxList)
{
    if(!string || !ifList || maxList <= 0)
        return 0;

    // Highest index usable for a name character; the last byte is reserved for '\0'.
    const int maxPrefixLen = (int)sizeof(ifList[0].prefix) - 1;

    const char* ptr = string;
    int ifNum       = 0; // entries completed so far
    int ifC         = 0; // characters stored for the entry being built
    char c;
    do {
        c = *ptr;
        if(c == ':') {
            if(ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port        = atoi(ptr + 1);
                ifNum++;
                ifC = 0;
            }
            // Skip the port text up to the next separator or the end of the string.
            while(c != ',' && c != '\0')
                c = *(++ptr);
        } else if(c == ',' || c == '\0') {
            if(ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port        = -1;
                ifNum++;
                ifC = 0;
            }
        } else if(ifC < maxPrefixLen) {
            ifList[ifNum].prefix[ifC] = c;
            ifC++;
        }
        // Characters beyond maxPrefixLen are dropped so the terminator always fits.
        ptr++;
    } while(ifNum < maxList && c);
    return ifNum;
}
/**
 * @brief Compare an interface name against a reference pattern.
 *
 * @param string      Name to test.
 * @param ref         Reference name or prefix.
 * @param matchExact  true: both strings must be identical (the terminator is part of the
 *                    comparison); false: only the first strlen(ref) characters are compared.
 * @return true when the compared ranges are equal.
 */
static bool matchIf(const char* string, const char* ref, bool matchExact)
{
    int compareLen;
    if(matchExact) {
        // +1 pulls the '\0' into the comparison so a longer ref cannot match.
        compareLen = strlen(string) + 1;
    } else {
        compareLen = strlen(ref);
    }
    return 0 == strncmp(string, ref, compareLen);
}
/**
 * @brief Decide whether two port numbers are compatible.
 *
 * -1 on either side acts as a wildcard; otherwise the ports must be equal.
 *
 * @param port1  First port, or -1.
 * @param port2  Second port, or -1.
 * @return true when the ports match under the rule above.
 */
static bool matchPort(const int port1, const int port2)
{
    return port1 == -1 || port2 == -1 || port1 == port2;
}
/**
 * @brief Test a name/port pair against a list of filter entries.
 *
 * @param string      Interface name to test.
 * @param port        Port to test.
 * @param ifList      Filter entries.
 * @param listSize    Number of entries in @p ifList; 0 means "no filter" and always matches.
 * @param matchExact  Forwarded to matchIf().
 * @return true when the list is empty or at least one entry matches both name and port.
 */
static bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact)
{
    // Make an exception for the case where no user list is defined
    if(listSize == 0)
        return true;

    int idx = 0;
    while(idx < listSize) {
        const bool nameOk = matchIf(string, ifList[idx].prefix, matchExact);
        if(nameOk && matchPort(port, ifList[idx].port))
            return true;
        ++idx;
    }
    return false;
}
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
#define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV)
#define SCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
// Storage for one socket address, viewable through any of the three sockaddr layouts.
union scclSocketAddress {
    struct sockaddr sa;       // generic view
    struct sockaddr_in sin;   // IPv4 view
    struct sockaddr_in6 sin6; // IPv6 view
};
enum
scclSocketState
{
scclSocketStateNone
=
0
,
scclSocketStateInitialized
=
1
,
scclSocketStateAccepting
=
2
,
scclSocketStateAccepted
=
3
,
scclSocketStateConnecting
=
4
,
scclSocketStateConnectPolling
=
5
,
scclSocketStateConnected
=
6
,
scclSocketStateReady
=
7
,
scclSocketStateClosed
=
8
,
scclSocketStateError
=
9
,
scclSocketStateNum
=
10
};
enum
scclSocketType
{
scclSocketTypeUnknown
=
0
,
scclSocketTypeBootstrap
=
1
,
scclSocketTypeProxy
=
2
,
scclSocketTypeNetSocket
=
3
,
scclSocketTypeNetIb
=
4
};
// State of one socket handled by the scclSocket* functions declared below.
struct scclSocket {
    int fd;                       // descriptor; set by a successful listen/connect/accept (see the function comments below)
    int acceptFd;                 // NOTE(review): presumably the listening descriptor while accepting -- confirm in socket.cpp
    int timedOutRetries;          // retry counter; presumably bounded by RETRY_TIMEDOUT_TIMES -- confirm
    int refusedRetries;           // retry counter; presumably bounded by RETRY_REFUSED_TIMES -- confirm
    union scclSocketAddress addr; // local address to listen on, or peer address to connect to / accepted from
    volatile uint32_t* abortFlag; // optional flag supplied to scclSocketInit(); may be NULL
    int asyncFlag;                // supplied to scclSocketInit(); 0 by default
    enum scclSocketState state;   // current state, one of scclSocketState
    int salen;                    // NOTE(review): presumably the length of the active member of addr -- confirm
    uint64_t magic;               // supplied to scclSocketInit(); defaults to SCCL_SOCKET_MAGIC
    enum scclSocketType type;     // supplied to scclSocketInit(); defaults to scclSocketTypeUnknown
};
const
char
*
scclSocketToString
(
union
scclSocketAddress
*
addr
,
char
*
buf
,
const
int
numericHostForm
=
1
);
scclResult_t
scclSocketGetAddrFromString
(
union
scclSocketAddress
*
ua
,
const
char
*
ip_port_pair
);
int
scclFindInterfaceMatchSubnet
(
char
*
ifNames
,
union
scclSocketAddress
*
localAddrs
,
union
scclSocketAddress
*
remoteAddr
,
int
ifNameMaxSize
,
int
maxIfs
);
int
scclFindInterfaces
(
char
*
ifNames
,
union
scclSocketAddress
*
ifAddrs
,
int
ifNameMaxSize
,
int
maxIfs
);
// Initialize a socket
scclResult_t
scclSocketInit
(
struct
scclSocket
*
sock
,
union
scclSocketAddress
*
addr
=
NULL
,
uint64_t
magic
=
SCCL_SOCKET_MAGIC
,
enum
scclSocketType
type
=
scclSocketTypeUnknown
,
volatile
uint32_t
*
abortFlag
=
NULL
,
int
asyncFlag
=
0
);
// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
scclResult_t
scclSocketListen
(
struct
scclSocket
*
sock
);
scclResult_t
scclSocketGetAddr
(
struct
scclSocket
*
sock
,
union
scclSocketAddress
*
addr
);
// Connect to sock->addr. sock->fd is set after a successful call.
scclResult_t
scclSocketConnect
(
struct
scclSocket
*
sock
,
int
portReuse
=
0
);
// Return socket connection state.
scclResult_t
scclSocketReady
(
struct
scclSocket
*
sock
,
int
*
running
);
// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
scclResult_t
scclSocketAccept
(
struct
scclSocket
*
sock
,
struct
scclSocket
*
ulistenSock
);
scclResult_t
scclSocketGetFd
(
struct
scclSocket
*
sock
,
int
*
fd
);
scclResult_t
scclSocketSetFd
(
int
fd
,
struct
scclSocket
*
sock
);
#define SCCL_SOCKET_SEND 0
#define SCCL_SOCKET_RECV 1
scclResult_t
scclSocketProgress
(
int
op
,
struct
scclSocket
*
sock
,
void
*
ptr
,
int
size
,
int
*
offset
);
scclResult_t
scclSocketWait
(
int
op
,
struct
scclSocket
*
sock
,
void
*
ptr
,
int
size
,
int
*
offset
);
scclResult_t
scclSocketSend
(
struct
scclSocket
*
sock
,
void
*
ptr
,
int
size
);
scclResult_t
scclSocketRecv
(
struct
scclSocket
*
sock
,
void
*
ptr
,
int
size
);
scclResult_t
scclSocketTryRecv
(
struct
scclSocket
*
sock
,
void
*
ptr
,
int
size
,
int
*
closed
,
bool
blocking
);
scclResult_t
scclSocketClose
(
struct
scclSocket
*
sock
);
examples/1_connection/3_socket_comm/test_socket_itf.cpp
deleted
100644 → 0
View file @
d9d23f34
#include "socket.h"
#include "debug.h"
#include "check.h"
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <fcntl.h>
#include <poll.h>
using
namespace
sccl
;
#define MAX_REQUESTS 8
#define MAX_THREADS 16
#define MAX_SOCKETS 64
// One chunk of a request, processed by a helper thread (persistentSocketThread).
struct scclNetSocketTask {
    int op;                  // SCCL_SOCKET_SEND or SCCL_SOCKET_RECV, passed to scclSocketProgress()
    void* data;              // start of this chunk
    int size;                // chunk length in bytes
    struct scclSocket* sock; // socket this chunk is transferred on
    int offset;              // bytes transferred so far; the chunk is complete when offset == size
    int used;                // 0 = slot free, 1 = slot holds an active task
    scclResult_t result;     // last result of scclSocketProgress() for this chunk
};
struct
scclNetSocketTaskQueue
{
int
next
;
int
len
;
struct
scclNetSocketTask
*
tasks
;
};
// One send/receive request as tracked by scclNetSocketTest().
struct scclNetSocketRequest {
    int op;                                        // SCCL_SOCKET_SEND or SCCL_SOCKET_RECV
    void* data;                                    // user buffer
    int size;                                      // buffer size; replaced by the exchanged size once known
    struct scclSocket* ctrlSock;                   // socket used for the size exchange (and the data when there are no sub-tasks)
    int offset;                                    // bytes transferred on ctrlSock so far
    int used;                                      // 0 = idle/finished, 1 = size exchange pending, 2 = size exchanged
    struct scclNetSocketComm* comm;                // owning communicator
    struct scclNetSocketTask* tasks[MAX_SOCKETS];  // sub-tasks this request was split into
    int nSubs;                                     // number of valid entries in tasks
};
// Everything one helper thread needs; passed to persistentSocketThread() as its argument.
struct scclNetSocketThreadResources {
    struct scclNetSocketTaskQueue threadTaskQueue; // this thread's task slots
    int stop;                                      // non-zero makes the thread return
    struct scclNetSocketComm* comm;                // owning communicator
    pthread_mutex_t threadLock;                    // protects the wait on threadCond
    pthread_cond_t threadCond;                     // signalled when a new task is queued
};
// A communicator: one control socket plus up to MAX_SOCKETS data sockets served by helper threads.
struct scclNetSocketComm {
    struct scclSocket ctrlSock;                                       // control socket
    struct scclSocket socks[MAX_SOCKETS];                             // data sockets; the first nSocks are in use
    int dev;                                                          // device index; used here only in the helper thread name
    int hipDev;                                                       // HIP device index; used here only in the helper thread name
    int nSocks;                                                       // number of data sockets
    int nThreads;                                                     // number of helper threads
    int nextSock;                                                     // round-robin index of the data socket for the next task
    struct scclNetSocketRequest requests[MAX_REQUESTS];               // request slots
    pthread_t helperThread[MAX_THREADS];                              // helper thread handles
    struct scclNetSocketThreadResources threadResources[MAX_THREADS]; // per-thread state, indexed like helperThread
};
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#define MIN_CHUNKSIZE (64 * 1024)
/**
 * @brief Allocate a zero-filled array of @p nelem objects of type T.
 *
 * calloc() is used so that the element-count/size multiplication is overflow-checked
 * by the allocator and the memory is zeroed in the same call.
 *
 * @tparam T        Element type.
 * @param ptr       Receives the new array on success; left untouched on failure.
 * @param nelem     Number of elements.
 * @param filefunc  Call-site identification supplied by the scclCalloc macro; used in the failure message.
 * @param line      Call-site line supplied by the scclCalloc macro; used in the failure message.
 * @return scclSuccess on success, scclSystemError when the allocation fails.
 */
template <typename T>
scclResult_t scclCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
{
    void* p = calloc(nelem, sizeof(T));
    if(p == NULL) {
        WARN("%s:%d : failed to allocate %zu elements of %zu bytes", filefunc, line, nelem, sizeof(T));
        return scclSystemError;
    }
    *ptr = (T*)p;
    return scclSuccess;
}
#define scclCalloc(...) scclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
/**
 * @brief Give a thread a printf-style formatted name.
 *
 * Only active when _GNU_SOURCE is defined; otherwise the call does nothing.
 * The formatted name is truncated to fit a 16-byte buffer (terminator included).
 *
 * @param thread  Thread to rename.
 * @param fmt     printf-style format string, followed by its arguments.
 */
void scclSetThreadName(pthread_t thread, const char* fmt, ...)
{
#ifdef _GNU_SOURCE
    char label[16];

    va_list ap;
    va_start(ap, fmt);
    vsnprintf(label, sizeof(label), fmt, ap);
    va_end(ap);

    pthread_setname_np(thread, label);
#endif
}
/**
 * @brief Helper-thread body: repeatedly drives every active task in this thread's queue
 *        with scclSocketProgress() and sleeps on the condition variable when idle.
 *
 * @param args_  Pointer to this thread's scclNetSocketThreadResources.
 * @return Always NULL; returns when resource->stop is set or a task reports an error.
 */
void* persistentSocketThread(void* args_)
{
    struct scclNetSocketThreadResources* resource = (struct scclNetSocketThreadResources*)args_;
    struct scclNetSocketComm* comm                = resource->comm;
    struct scclNetSocketTaskQueue* myQueue        = &resource->threadTaskQueue;
    // NOTE(review): integer division -- this is 0 when nSocks < nThreads, in which case the
    // outer for loop below never advances (i += 0). Confirm callers guarantee nSocks >= nThreads.
    int nSocksPerThread = comm->nSocks / comm->nThreads;
    while(1) {
        int idle = 1; // cleared as soon as any task made progress in this pass
        int mark = myQueue->next; // mark newest task seen
        // Walk the queue in groups of nSocksPerThread slots.
        for(int i = 0; i < myQueue->len; i += nSocksPerThread) {
            int repeat;
            // Keep working on the current group until none of its tasks is unfinished.
            do {
                repeat = 0;
                for(int j = 0; j < nSocksPerThread; j++) {
                    struct scclNetSocketTask* r = myQueue->tasks + i + j;
                    if(r != NULL && r->used == 1 && r->offset < r->size) {
                        r->result = scclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
                        if(r->result != scclSuccess) {
                            // The error stays in r->result; this thread ends here.
                            WARN("NET/Socket : socket progress error");
                            return NULL;
                        }
                        idle = 0;
                        if(r->offset < r->size)
                            repeat = 1;
                    }
                }
            } while(repeat);
        }
        if(idle) {
            pthread_mutex_lock(&resource->threadLock);
            while(mark == myQueue->next && resource->stop == 0) { // no new tasks, wait
                pthread_cond_wait(&resource->threadCond, &resource->threadLock);
            }
            pthread_mutex_unlock(&resource->threadLock);
        }
        if(resource->stop)
            return NULL;
    }
}
/**
 * @brief Reserve the next task slot of a helper thread, fill it in and wake that thread.
 *
 * The helper thread is chosen as comm->nextSock % comm->nThreads. On first use of a thread
 * its task queue, mutex and condition variable are created and the thread is started.
 *
 * @param comm  Communicator owning the sockets and helper threads.
 * @param op    SCCL_SOCKET_SEND or SCCL_SOCKET_RECV.
 * @param data  Start of the chunk to transfer.
 * @param size  Chunk length in bytes.
 * @param req   Receives the reserved task on success.
 * @return scclSuccess when a slot was reserved; scclInternalError when the next slot is
 *         still in use. An allocation failure is handed to SCCLCHECK.
 */
scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void* data, int size, struct scclNetSocketTask** req)
{
    int tid                                  = comm->nextSock % comm->nThreads;
    struct scclNetSocketThreadResources* res = comm->threadResources + tid;
    struct scclNetSocketTaskQueue* queue     = &res->threadTaskQueue;
    // create helper threads and prepare per-thread task queue
    if(queue->tasks == NULL) {
        // each request can be divided up to nSocks tasks, and
        // these tasks are distributed to nThreads threads,
        // we need to make sure each thread queue has enough slots for MAX_REQUESTS
        queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads);
        SCCLCHECK(scclCalloc(&queue->tasks, queue->len));
        queue->next = 0;
        res->comm   = comm;
        pthread_mutex_init(&res->threadLock, NULL);
        pthread_cond_init(&res->threadCond, NULL);
        // NOTE(review): the results of the pthread_* calls are not checked.
        pthread_create(comm->helperThread + tid, NULL, persistentSocketThread, res);
        scclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == SCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->hipDev);
    }
    struct scclNetSocketTask* r = queue->tasks + queue->next;
    if(r->used == 0) {
        r->op     = op;
        r->data   = data;
        r->size   = size;
        r->sock   = comm->socks + comm->nextSock;
        r->offset = 0;
        r->result = scclSuccess;
        // Advance the round-robin socket index for the next task.
        comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
        r->used        = 1;
        *req           = r;
        // Publish the new task: moving `next` is what the helper thread's wait loop watches.
        pthread_mutex_lock(&res->threadLock);
        queue->next = (queue->next + 1) % queue->len;
        pthread_cond_signal(&res->threadCond);
        pthread_mutex_unlock(&res->threadLock);
        return scclSuccess;
    }
    WARN("NET/Socket : unable to allocate subtasks");
    return scclInternalError;
}
/**
 * @brief Test and advance a socket request.
 *
 * What happens depends on request->used:
 * - used == 0: nothing is done; the call returns scclSuccess with *done == 0.
 * - used == 1: the message size is exchanged on the control socket. Once the size is known
 *              the request moves to used == 2 and, when the communicator has data sockets,
 *              is split into sub-tasks for the helper threads.
 * - used == 2: completion is checked -- either all sub-tasks have finished, or (without
 *              sub-tasks) the data is progressed on the control socket by the calling thread.
 *
 * @param request  Pointer to the scclNetSocketRequest to test.
 * @param done     Output: 1 when the request has completed, 0 otherwise.
 * @param size     Optional output (may be NULL): the transferred size, written on completion.
 * @return scclSuccess, scclInternalError for a NULL request, scclInvalidUsage when a receive
 *         would be truncated, or the error of a failed sub-task / socket call.
 */
scclResult_t scclNetSocketTest(void* request, int* done, int* size)
{
    *done                          = 0;
    struct scclNetSocketRequest* r = (struct scclNetSocketRequest*)request;
    if(r == NULL) {
        INFO(SCCL_LOG_CODEALL, "NET/Socket : test called with NULL request");
        return scclInternalError;
    }
    INFO(SCCL_LOG_CODEALL, "NET/Socket : test called request used:%d\n", r->used);
    if(r->used == 1) { /* try to send/recv size */
        int data   = r->size;
        int offset = 0;
        SCCLCHECK(scclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset));

        if(offset == 0)
            return scclSuccess; /* Not ready -- retry later */

        // Not sure we could ever receive less than 4 bytes, but just in case ...
        if(offset < sizeof(int))
            SCCLCHECK(scclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset));

        // Check size is less or equal to the size provided by the user
        if(r->op == SCCL_SOCKET_RECV && data > r->size) {
            char line[SOCKET_NAME_MAXLEN + 1];
            union scclSocketAddress addr;
            scclSocketGetAddr(r->ctrlSock, &addr);
            WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
there may be a mismatch in collective sizes or environment settings (e.g. SCCL_PROTO, SCCL_ALGO) between ranks",
                 scclSocketToString(&addr, line),
                 data,
                 r->size);
            return scclInvalidUsage;
        }
        r->size   = data;
        r->offset = 0;
        r->used   = 2; // done exchanging size
        // divide into subtasks
        int chunkOffset = 0, i = 0;
        if(r->comm->nSocks > 0) {
            // each request can be divided up to nSocks tasks
            int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
            while(chunkOffset < r->size) {
                int chunkSize = std::min(taskSize, r->size - chunkOffset);
                SCCLCHECK(scclNetSocketGetTask(r->comm, r->op, (char*)(r->data) + chunkOffset, chunkSize, r->tasks + i++));
                chunkOffset += chunkSize;
            }
        }
        r->nSubs = i;
    }
    if(r->used == 2) { // already exchanged size
        if(r->nSubs > 0) {
            int nCompleted = 0;
            for(int i = 0; i < r->nSubs; i++) {
                struct scclNetSocketTask* sub = r->tasks[i];
                // A failed sub-task fails the whole request.
                if(sub->result != scclSuccess)
                    return sub->result;
                if(sub->offset == sub->size)
                    nCompleted++;
            }
            if(nCompleted == r->nSubs) {
                if(size)
                    *size = r->size;
                *done   = 1;
                r->used = 0;
                // Release the task slots so the helper threads' queues can reuse them.
                for(int i = 0; i < r->nSubs; i++) {
                    struct scclNetSocketTask* sub = r->tasks[i];
                    sub->used                     = 0;
                }
            }
        } else { // progress request using main thread
            if(r->offset < r->size) {
                SCCLCHECK(scclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset));
            }
            if(r->offset == r->size) {
                if(size)
                    *size = r->size;
                *done   = 1;
                r->used = 0;
            }
        }
    }
    return scclSuccess;
}
/**
 * Smoke test for the socket request path: builds a single SEND request with a
 * 1024-byte payload, initialises it through scclSocketInit and polls it once
 * with scclNetSocketTest, printing "done" when the request reports completion.
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return 0 on success, 1 when a buffer cannot be allocated, or the value
 *         propagated by SCCLCHECK when one of the SCCL calls fails.
 */
int main(int argc, char* argv[]) {
    // Zero-initialise the request so that fields this test does not set explicitly
    // do not hold indeterminate values when the library reads them.
    struct scclNetSocketRequest* request = (struct scclNetSocketRequest*)calloc(1, sizeof(struct scclNetSocketRequest));
    if (request == NULL) {
        fprintf(stderr, "failed to allocate scclNetSocketRequest\n");
        return 1;
    }

    request->op   = SCCL_SOCKET_SEND;
    request->used = 1;
    request->size = 1024;
    request->data = (char*)malloc(request->size);
    if (request->data == NULL) {
        fprintf(stderr, "failed to allocate request payload\n");
        free(request);
        return 1;
    }

    // NOTE(review): scclNetSocketTest reads r->comm->nSocks once the size exchange
    // has finished; with comm == NULL that path would dereference a null pointer.
    // Confirm whether this test is expected to get that far.
    request->ctrlSock = NULL;
    request->comm     = NULL;
    request->nSubs    = 0;

    // `done` must start at 0: it is read below even if the callee returns
    // without writing to it.
    int done      = 0;
    int sizes[32] = {0};

    printf("test\n");
    INFO(SCCL_LOG_CODEALL, "test INFO");

    // SCCLCHECK is defined elsewhere; presumably it returns from main on failure,
    // in which case the buffers are reclaimed at process exit.
    SCCLCHECK(scclSocketInit(request));
    SCCLCHECK(scclNetSocketTest(request, &done, sizes));

    if (done) {
        printf("done\n");
    }

    free(request->data);
    free(request);
    return 0;
}
\ No newline at end of file
examples/2_topo/0_demo_topo/compile_topo.sh
View file @
a4ac3320
...
@@ -6,11 +6,11 @@ hipcc ./test_topo.cpp \
...
@@ -6,11 +6,11 @@ hipcc ./test_topo.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cc
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cc
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cc
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cc
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/nvmlwrap.cc
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/nvmlwrap.cc
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
device
/ibvsymbols.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
net_ib
/ibvsymbols.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
device
/ibvwrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
net_ib
/ibvwrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
device
/net_ib.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
net_ib
/net_ib.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
hos
t/socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
net_socke
t/socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
hos
t/net_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
net_socke
t/net_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp
\
...
@@ -25,8 +25,8 @@ hipcc ./test_topo.cpp \
...
@@ -25,8 +25,8 @@ hipcc ./test_topo.cpp \
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
device
/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
net_ib
/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
hos
t/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
net_socke
t/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
-L
/usr/lib/x86_64-linux-gnu
-L
/usr/lib/
\
-L
/usr/lib/x86_64-linux-gnu
-L
/usr/lib/
\
...
...
examples/2_topo/1_demo_rocm/compile_rocm_smi.sh
View file @
a4ac3320
hipcc /public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/test_rocm_smi.cpp
\
hipcc /public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/test_rocm_smi.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/rocm_smi_wrap.c
c
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
bootstrap/
rocm_smi_wrap.c
pp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
-o
test_topo
\
-o
test_topo
\
-std
=
c++17
-g
-O3
-fopenmp
-D__HIP_PLATFORM_HCC__
\
-std
=
c++17
-g
-O3
-fopenmp
-D__HIP_PLATFORM_HCC__
\
...
@@ -11,6 +11,7 @@ hipcc /public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/test
...
@@ -11,6 +11,7 @@ hipcc /public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/test
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/
\
-L
/usr/lib/x86_64-linux-gnu
\
-L
/usr/lib/x86_64-linux-gnu
\
-L
/usr/lib/
\
-L
/usr/lib/
\
-lamdhip64
-lrocm_smi64
-lamdhip64
-lrocm_smi64
\ No newline at end of file
examples/2_topo/1_demo_rocm/test_rocm_smi.cpp
View file @
a4ac3320
...
@@ -11,21 +11,21 @@ using namespace sccl;
...
@@ -11,21 +11,21 @@ using namespace sccl;
int
main
(
int
argc
,
char
**
argv
)
{
int
main
(
int
argc
,
char
**
argv
)
{
printf
(
"hello world
\n
"
);
printf
(
"hello world
\n
"
);
(
void
)
rocm_smi_init
();
(
void
)
sccl
::
hardware
::
topology
::
bootstrap
::
rocm_smi_init
();
uint32_t
num_devs
;
uint32_t
num_devs
;
(
void
)
rocm_smi_getNumDevice
(
&
num_devs
);
(
void
)
sccl
::
hardware
::
topology
::
bootstrap
::
rocm_smi_getNumDevice
(
&
num_devs
);
printf
(
"num_devs=%d
\n
"
,
num_devs
);
printf
(
"num_devs=%d
\n
"
,
num_devs
);
uint32_t
deviceIndex
=
0
;
uint32_t
deviceIndex
=
0
;
char
bus0
[
100
]
=
"bus0"
;
char
bus0
[
100
]
=
"bus0"
;
(
void
)
rocm_smi_getDevicePciBusIdString
(
deviceIndex
,
bus0
,
100
);
(
void
)
sccl
::
hardware
::
topology
::
bootstrap
::
rocm_smi_getDevicePciBusIdString
(
deviceIndex
,
bus0
,
100
);
printf
(
"bus0=%s
\n
"
,
bus0
);
printf
(
"bus0=%s
\n
"
,
bus0
);
RSMI_IO_LINK_TYPE
rsmi_type
;
RSMI_IO_LINK_TYPE
rsmi_type
;
int
hops
,
count
;
int
hops
,
count
;
(
void
)
rocm_smi_getLinkInfo
(
0
,
8
,
&
rsmi_type
,
&
hops
,
&
count
);
(
void
)
sccl
::
hardware
::
topology
::
bootstrap
::
rocm_smi_getLinkInfo
(
0
,
8
,
&
rsmi_type
,
&
hops
,
&
count
);
printf
(
"rsmi_type=%d, hops=%d, count=%d
\n
"
,
rsmi_type
,
hops
,
count
);
printf
(
"rsmi_type=%d, hops=%d, count=%d
\n
"
,
rsmi_type
,
hops
,
count
);
// struct sccl::hardware::topology::topo::scclXml* xml;
// struct sccl::hardware::topology::topo::scclXml* xml;
...
...
examples/2_topo/2_bootstrap/1_mpi_init.cpp
View file @
a4ac3320
...
@@ -4,7 +4,8 @@
...
@@ -4,7 +4,8 @@
#include <stdint.h>
#include <stdint.h>
#include "mpi.h"
#include "mpi.h"
#include "net.h"
#include "net.h"
#include "bootstrap_net.h"
#include "bootstrap.h"
#include "hardware_utils.h"
using
namespace
sccl
;
using
namespace
sccl
;
...
@@ -23,17 +24,35 @@ int main(int argc, char* argv[]) {
...
@@ -23,17 +24,35 @@ int main(int argc, char* argv[]) {
// ----------------------------------------------------------------------- //
// ----------------------------------------------------------------------- //
INFO
(
SCCL_LOG_TOPO
,
"Bootstrap ...
\n
"
);
INFO
(
SCCL_LOG_TOPO
,
"Bootstrap ...
\n
"
);
struct
scclRankInfo
*
rank_info
;
struct
sccl
::
hardware
::
topology
::
bootstrap
::
scclBootstrapComm
*
comm
;
(
void
)
sccl
::
hardware
::
topology
::
bootstrap
::
bootstrap_net
::
bootstrapNetInit
();
SCCLCHECK
(
scclCalloc
(
&
rank_info
,
1
));
SCCLCHECK
(
scclCalloc
(
&
comm
,
1
));
rank_info
->
rank
=
rank
;
rank_info
->
nRanks
=
nranks
;
rank_info
->
localRanks
=
2
;
rank_info
->
hipDev
=
rank
%
rank_info
->
localRanks
;
auto
sccl_bootstrap
=
new
sccl
::
hardware
::
topology
::
bootstrap
::
scclBootstrap
(
rank_info
,
comm
);
SCCLCHECK
(
sccl_bootstrap
->
bootstrapInitCheck
());
sccl
::
hardware
::
topology
::
bootstrap
::
printUniqueInfo
(
comm
->
unique_info
);
int
cuda_id
;
HIPCHECK
(
hipGetDevice
(
&
cuda_id
));
printf
(
"rank=%d, cuda_id=%d
\n
"
,
rank
,
cuda_id
);
MPI_Finalize
();
MPI_Finalize
();
}
}
/*
/*
单机执行
单机执行
SCCL_DEBUG_LEVEL=
SCCL_LOG_
ABORT mpirun --allow-run-as-root -np
2
1_mpi_init
SCCL_DEBUG_LEVEL=ABORT mpirun --allow-run-as-root -np
4
1_mpi_init
SCCL_DEBUG_LEVEL=
SCCL_LOG_
INFO SCCL_DEBUG_
POS=SCCL_LOG_CODE
ALL mpirun --allow-run-as-root -np 2 1_mpi_init
SCCL_DEBUG_LEVEL=INFO SCCL_DEBUG_
SUBSYS=
ALL mpirun --allow-run-as-root -np 2 1_mpi_init
跨机执行
跨机执行
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root --hostfile hostfile -np 16 ./1_mpi_init
SCCL_DEBUG_LEVEL=ABORT mpirun --allow-run-as-root --hostfile hostfile -np 16 ./1_mpi_init
SCCL_DEBUG_LEVEL=ABORT SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root --hostfile hostfile2 -np 4 ./1_mpi_init
*/
*/
examples/2_topo/2_bootstrap/compile_mpi.sh
View file @
a4ac3320
hipcc ./1_mpi_init.cpp
\
hipcc ./1_mpi_init.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvsymbols.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/hardware_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvwrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/ibvsymbols.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/net_ib.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/ibvwrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/net_ib.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/net_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/net_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/ipc_socket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/ipcsocket.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/proxy.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/rocm_smi_wrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/archinfo.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/utils.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap.cpp
\
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/thread_pool.cpp
\
-o
1_mpi_init
\
-o
1_mpi_init
\
-std
=
c++17
-g
-O3
-fopenmp
-DROC_SHMEM
-D__HIP_PLATFORM_HCC__
\
-std
=
c++17
-g
-O3
-fopenmp
-DROC_SHMEM
-D__HIP_PLATFORM_HCC__
-Wno-return-type
\
-I
./
-I
/usr/include
-I
/opt/dtk/include
\
-I
./
-I
/usr/include
-I
/opt/dtk/include
\
-I
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/
\
-I
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_ib/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ipc_socket/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/
\
-I
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
-L
/public/home/lishen/Code/rocSHMEM/SCCL_v1
\
-L
/usr/lib/x86_64-linux-gnu
-libverbs
-lrdmacm
\
-L
/usr/lib/x86_64-linux-gnu
-libverbs
-lrdmacm
\
-L
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib
-lmpi
-L
/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib
-lmpi
\
-L
/opt/dtk/lib
-lamdhip64
-lrocm-core
-lrocm_smi64
-pthread
export
HSA_FORCE_FINE_GRAIN_PCIE
=
"1"
export
iommu
=
pt
examples/2_topo/2_bootstrap/hostfile
0 → 100644
View file @
a4ac3320
node037 slots=8
node038 slots=8
\ No newline at end of file
examples/2_topo/2_bootstrap/hostfile2
0 → 100644
View file @
a4ac3320
node037 slots=2
node038 slots=2
\ No newline at end of file
examples/get_cpp_files.py
0 → 100644
View file @
a4ac3320
import
os
import
glob
from
pathlib
import
Path
def find_cpp_files(directory):
    """Return the paths of all ``*.cpp`` files under ``directory``, searched recursively.

    Each path is returned as a string, in the order ``Path.rglob`` yields it.
    """
    root = Path(directory)
    return list(map(str, root.rglob('*.cpp')))
def main(src_path="/public/home/lishen/Code/rocSHMEM/SCCL_v1/src"):
    """Print every ``*.cpp`` file under ``src_path``, sorted, one per line.

    Each path is followed by a backslash so the output can be pasted into a
    multi-line shell compile command.

    Args:
        src_path: Root directory to scan. Defaults to the original hard-coded
            SCCL source tree, so calling ``main()`` behaves as before.
    """
    for cpp_file in sorted(find_cpp_files(src_path)):
        print(cpp_file + '\\')


if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default source root.
    main(*sys.argv[1:2])
src/hardware/comm.h
View file @
a4ac3320
#pragma once
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <stdint.h>
#include <stdint.h>
#include "base.h"
#include "base.h"
#include "topo.h"
namespace
sccl
{
namespace
sccl
{
namespace
hardware
{
namespace
hardware
{
// Struct scclUniqueInfo: stores the information of each communication node.
struct scclUniqueInfo {
    int rank;           // Global rank of the current node
    int nRanks;         // Total number of nodes
    int localRank;      // Rank of the current node within its local compute node
    int localRanks;     // Total number of nodes within the local compute node
    int cudaDev;        // CUDA device ID
    int gdrSupport;     // Whether GPU direct registration (GDR) is supported
    uint64_t hostHash;  // Host hash value
    uint64_t pidHash;   // Process ID hash value
    int64_t busId;      // Bus ID
};
// // 定义结构体 scclCommBase,用于存储通信基础信息
// // 定义结构体 scclCommBase,用于存储通信基础信息
// struct scclCommBase {
// struct scclCommBase {
// struct scclUniqueInfo* peerInfo; // 指向 peerInfo 结构体的指针,存储所有节点的信息
// struct scclUniqueInfo* peerInfo; // 指向 peerInfo 结构体的指针,存储所有节点的信息
...
...
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment