Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishen01
Sccl
Commits
d9d23f34
Commit
d9d23f34
authored
Jun 20, 2025
by
lishen
Browse files
Initial Code for SCCL_v1
parent
57df3737
Changes
99
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2201 additions
and
0 deletions
+2201
-0
CMakeLists.txt
CMakeLists.txt
+234
-0
README.md
README.md
+9
-0
examples/0_demos/1_thread/compile.sh
examples/0_demos/1_thread/compile.sh
+4
-0
examples/0_demos/1_thread/thread.cpp
examples/0_demos/1_thread/thread.cpp
+27
-0
examples/1_connection/1_rdma_comm/1_simple.cpp
examples/1_connection/1_rdma_comm/1_simple.cpp
+38
-0
examples/1_connection/1_rdma_comm/2_mpi_get.cpp
examples/1_connection/1_rdma_comm/2_mpi_get.cpp
+127
-0
examples/1_connection/1_rdma_comm/3_rdma_info.cpp
examples/1_connection/1_rdma_comm/3_rdma_info.cpp
+49
-0
examples/1_connection/1_rdma_comm/compile_mpi.sh
examples/1_connection/1_rdma_comm/compile_mpi.sh
+23
-0
examples/1_connection/1_rdma_comm/compile_rdma_info.sh
examples/1_connection/1_rdma_comm/compile_rdma_info.sh
+6
-0
examples/1_connection/1_rdma_comm/compile_simple.sh
examples/1_connection/1_rdma_comm/compile_simple.sh
+21
-0
examples/1_connection/1_rdma_comm/hostfile
examples/1_connection/1_rdma_comm/hostfile
+2
-0
examples/1_connection/3_socket_comm/client.cpp
examples/1_connection/3_socket_comm/client.cpp
+54
-0
examples/1_connection/3_socket_comm/compile.sh
examples/1_connection/3_socket_comm/compile.sh
+10
-0
examples/1_connection/3_socket_comm/get_ip.cpp
examples/1_connection/3_socket_comm/get_ip.cpp
+44
-0
examples/1_connection/3_socket_comm/server.cpp
examples/1_connection/3_socket_comm/server.cpp
+81
-0
examples/1_connection/3_socket_comm/socket.cpp
examples/1_connection/3_socket_comm/socket.cpp
+905
-0
examples/1_connection/3_socket_comm/socket.h
examples/1_connection/3_socket_comm/socket.h
+235
-0
examples/1_connection/3_socket_comm/test_socket_itf.cpp
examples/1_connection/3_socket_comm/test_socket_itf.cpp
+281
-0
examples/2_topo/0_demo_topo/compile_topo.sh
examples/2_topo/0_demo_topo/compile_topo.sh
+33
-0
examples/2_topo/0_demo_topo/compile_xml.sh
examples/2_topo/0_demo_topo/compile_xml.sh
+18
-0
No files found.
CMakeLists.txt
0 → 100644
View file @
d9d23f34
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Oldest CMake release this project is written against.
cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)

###############################################################################
# AVOID IN SOURCE BUILD
###############################################################################
# Abort configuration when the top-level source tree is also the build tree.
# The operands are quoted so they are compared as strings and never
# re-interpreted as variable names. The whole explanation is emitted as a
# single FATAL_ERROR so it cannot be separated from the error itself.
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" AND
   "${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
  message(FATAL_ERROR
    "In-source build\n"
    "Building from the source directory is not supported.\n"
    "Please remove 'CMakeCache.txt' and 'CMakeFiles' "
    "and build from a separate directory.")
endif()
###############################################################################
# CONFIGURATION OPTIONS
###############################################################################
# User-facing build switches. Every entry is a boolean cache variable; the
# names are kept as-is because they form the project's configure interface.

# --- Diagnostics -------------------------------------------------------------
option(DEBUG "Enable debug trace" OFF)
option(PROFILE "Enable statistics and timing support" OFF)

# --- Conduits and device features --------------------------------------------
option(USE_RO "Enable RO conduit." ON)
option(USE_IPC "Enable IPC support (using HIP)" OFF)
option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
option(USE_WF_COAL "Enable wavefront message coalescing" OFF)

# --- Heap selection ----------------------------------------------------------
option(USE_COHERENT_HEAP "Enable support for coherent systems" OFF)
option(USE_MANAGED_HEAP "Enable managed memory" OFF)
option(USE_HOST_HEAP "Enable host memory using malloc/free" OFF)
option(USE_HIP_HOST_HEAP "Enable host memory using hip api" OFF)

# --- Device memory allocator -------------------------------------------------
option(USE_ALLOC_DLMALLOC "Enable dlmalloc device memory allocator" ON)
option(USE_ALLOC_POW2BINS "Enable legacy Pow2Bins device memory allocator" OFF)

# --- Miscellaneous behaviour -------------------------------------------------
option(USE_FUNC_CALL "Force compiler to use function calls on library API" OFF)
option(USE_SHARED_CTX "Request support for shared ctx between WG" OFF)
option(USE_SINGLE_NODE "Enable single node support only." OFF)
option(USE_HOST_SIDE_HDP_FLUSH "Use a polling thread to flush the HDP cache on the host." OFF)

# --- What gets built ---------------------------------------------------------
option(BUILD_FUNCTIONAL_TESTS "Build the functional tests" ON)
option(BUILD_EXAMPLES "Build the examples" ON)
option(BUILD_UNIT_TESTS "Build the unit tests" ON)
option(BUILD_TESTS_ONLY "Build only tests. Used to link against rocSHMEM in a ROCm Release" OFF)
option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)

# Generate rocshmem_config.h in the build tree from the template. The paths
# are spelled out in full; they resolve to the same locations the relative
# form used.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/cmake/rocshmem_config.h.in"
  "${CMAKE_CURRENT_BINARY_DIR}/rocshmem_config.h"
)
###############################################################################
# GLOBAL COMPILE FLAGS
###############################################################################
# ROCm installation prefix: the ROCM_PATH environment variable takes
# precedence, otherwise /opt/rocm is used. The value is a cache entry, so a
# ROCM_PATH already present in the cache is left untouched.
set(rocm_path_default "/opt/rocm")
if(DEFINED ENV{ROCM_PATH})
  set(rocm_path_default "$ENV{ROCM_PATH}")
endif()
set(ROCM_PATH "${rocm_path_default}" CACHE STRING "ROCm install directory")
unset(rocm_path_default)

# Default to the hipcc shipped with ROCm unless a compiler was already chosen.
if(NOT DEFINED CMAKE_CXX_COMPILER)
  set(CMAKE_CXX_COMPILER "${ROCM_PATH}/bin/hipcc")
endif()

# Strict C++17, no compiler extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Flags used for the Debug configuration.
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb")

# In tests-only mode an existing installation is used: take its location from
# the ROCSHMEM_HOME environment variable, or assume it lives under ROCM_PATH.
if(BUILD_TESTS_ONLY)
  if(NOT DEFINED ENV{ROCSHMEM_HOME})
    message("Environment variable ROCSHMEM_HOME is not set.")
    message("Assuming that rocSHMEM is installed at ${ROCM_PATH}.")
    set(ROCSHMEM_HOME "${ROCM_PATH}")
  else()
    set(ROCSHMEM_HOME "$ENV{ROCSHMEM_HOME}")
  endif()
endif()

# Look for the ROCM package under the resolved prefix (not REQUIRED here).
find_package(ROCM PATHS ${ROCM_PATH})
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF)

# Project-local helper module, expected to provide rocm_local_targets()
# (its presence is probed with if(COMMAND ...) further down).
include(cmake/rocm_local_targets.cmake)

# GPU architecture list used when nothing else is requested or detected.
set(DEFAULT_GPUS gfx936)
###############################################################################
# PROJECT
###############################################################################
# ROCm CMake build tools and the helper modules used below.
find_package(ROCmCMakeBuildTools)
foreach(rocm_module IN ITEMS ROCMCreatePackage ROCMInstallTargets ROCMCheckTargetIds)
  include(${rocm_module})
endforeach()

# Version is declared once for the ROCm tooling and once for the project.
rocm_setup_version(VERSION 2.0.0)
project(rocshmem VERSION 2.0.0 LANGUAGES CXX)

# Applied to every target defined from this directory downwards.
add_compile_options(-Wno-return-type)
###############################################################################
# CREATE ROCSHMEM LIBRARY
###############################################################################
# Everything needed to build the library itself; skipped when only the tests
# are built against an installed copy.
if(NOT BUILD_TESTS_ONLY)
  # The library target and its namespaced alias. Sources are attached by the
  # src/ subdirectory.
  add_library(${PROJECT_NAME})
  add_library(roc::${PROJECT_NAME} ALIAS ${PROJECT_NAME})

  add_subdirectory(src)

  #############################################################################
  # SET GPU ARCHITECTURES
  #############################################################################
  # Optionally replace the default architecture list with the GPUs found on
  # this machine (only possible when rocm_local_targets() is available).
  if(BUILD_LOCAL_GPU_TARGET_ONLY)
    message(STATUS "Building only for local GPU target")
    if(COMMAND rocm_local_targets)
      rocm_local_targets(DEFAULT_GPUS)
    else()
      message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
    endif()
  endif()

  # Cache entry: a GPU_TARGETS value supplied by the user wins over the default.
  set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")

  # Filter the requested targets down to what the toolchain supports.
  if(COMMAND rocm_check_target_ids)
    message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
    rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
  else()
    message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
    set(SUPPORTED_GPUS ${DEFAULT_GPUS})
  endif()

  set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.")
  message(STATUS "Compiling for ${COMPILING_TARGETS}")

  # One --offload-arch flag per target. The list starts empty so that a value
  # inherited from an enclosing scope cannot leak into the flags.
  set(offload_flags "")
  foreach(gpu_arch ${COMPILING_TARGETS})
    list(APPEND offload_flags --offload-arch=${gpu_arch})
  endforeach()
  add_compile_options(${offload_flags})

  #############################################################################
  # PACKAGE DEPENDENCIES
  #############################################################################
  find_package(MPI REQUIRED)
  find_package(hip REQUIRED)
  find_package(hsa-runtime64 REQUIRED)

  set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
  set(THREADS_PREFER_PTHREAD_FLAG TRUE)
  find_package(Threads REQUIRED)

  #############################################################################
  # LINKING AND INCLUDE DIRECTORIES
  #############################################################################
  # rocshmem_config.h is generated by configure_file() in this directory, so
  # it lives in CMAKE_CURRENT_BINARY_DIR. CMAKE_BINARY_DIR only coincides with
  # that when this project is the top-level one.
  target_include_directories(
    ${PROJECT_NAME}
    PUBLIC
      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
      $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}> # rocshmem_config.h
      $<INSTALL_INTERFACE:include>
      ${MPI_CXX_HEADER_DIR}
  )

  target_link_libraries(
    ${PROJECT_NAME}
    PUBLIC
      Threads::Threads
      ${MPI_mpi_LIBRARY}
      ${MPI_mpicxx_LIBRARY}
      hip::device
      hip::host
      hsa-runtime64::hsa-runtime64
  )
endif()
###############################################################################
# TEST SUBDIRECTORIES
###############################################################################
# The tests directory is always entered; the examples are optional.
add_subdirectory(tests)

if(BUILD_EXAMPLES)
  add_subdirectory(examples)
endif()

if(NOT BUILD_TESTS_ONLY)
  #############################################################################
  # INSTALL
  #############################################################################
  # ROCMInstallTargets and ROCMCreatePackage are already included in the
  # PROJECT section; the includes are repeated here so this section keeps
  # working on its own.
  include(ROCMInstallTargets)
  include(ROCMCreatePackage)

  rocm_install(TARGETS rocshmem)

  # Public headers. The path is relative to this file's own directory so that
  # it stays correct when the project is added as a subdirectory of another.
  rocm_install(
    DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include/"
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
  )

  # Generated configuration header: written by configure_file() into this
  # directory's binary dir, so it is picked up from there.
  rocm_install(
    FILES "${CMAKE_CURRENT_BINARY_DIR}/rocshmem_config.h"
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocshmem
  )

  rocm_package_add_dependencies(
    DEPENDS
      hsa-rocr
      hip-runtime-amd
      rocm-dev
  )

  rocm_export_targets(
    TARGETS roc::rocshmem
    NAMESPACE roc::
  )

  rocm_create_package(
    NAME "rocSHMEM"
    DESCRIPTION "ROCm OpenSHMEM (rocSHMEM)"
    MAINTAINER "rocSHMEM Maintainer <rocshmem-maintainer@amd.com>"
  )
endif()
README.md
0 → 100644
View file @
d9d23f34
# HCU Collective Communication Library (HcuCCL)
所有和汇编、builtin指令相关的内容都在 `device/utils` 文件夹中,上层应用直接使用。
SCCL的整体框架如下

SCCL的topo信息获取过程如下

examples/0_demos/1_thread/compile.sh
0 → 100644
View file @
d9d23f34
# Build the pthread demo (thread.cpp) into ./thread with hipcc.
hipcc ./thread.cpp \
    -o thread \
    -std=c++17 \
    -g -O3 \
    -fopenmp \
    -DROC_SHMEM \
    -D__HIP_PLATFORM_HCC__
examples/0_demos/1_thread/thread.cpp
0 → 100644
View file @
d9d23f34
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// Thread entry point: prints the C string handed over through `arg` and
// finishes with a NULL result.
void* thread_function(void* arg)
{
    char* text = static_cast<char*>(arg);
    printf("Thread is running with argument: %s\n", text);
    return NULL;
}
// Starts one worker thread running thread_function, waits for it to finish
// and exits with status 0 (EXIT_FAILURE if the thread cannot be created or
// joined).
int main()
{
    pthread_t thread_id;
    const char* message = "Hello, World!";

    // Create the thread. pthread_create reports failure through its return
    // value and does not set errno, so the code is printed directly instead
    // of going through perror().
    int result = pthread_create(&thread_id, NULL, thread_function, (void*)message);
    if(result != 0) {
        fprintf(stderr, "Thread creation failed: error %d\n", result);
        exit(EXIT_FAILURE);
    }

    printf("Thread created successfully\n");

    // Wait for the thread to finish before leaving main.
    result = pthread_join(thread_id, NULL);
    if(result != 0) {
        fprintf(stderr, "Thread join failed: error %d\n", result);
        exit(EXIT_FAILURE);
    }

    return 0;
}
\ No newline at end of file
examples/1_connection/1_rdma_comm/1_simple.cpp
0 → 100644
View file @
d9d23f34
#include <iostream>
#include "net.h"
using
namespace
sccl
;
// Initialises the IB network backend, reports how many devices it exposes and
// prints the properties of device 0.
// Returns 0 on success, 1 when the backend is unavailable or has no device.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    INFO(SCCL_LOG_CODEALL, "Hello, World!");

    // ----------------------------------------------------------------------- //
    // To exercise the socket backend instead, pass sccl::hardware::net::NET_SOCKET.
    auto scclNet = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB);
    if(!scclNet) {
        fprintf(stderr, "failed to initialise the network backend\n");
        return 1;
    }

    // Start from 0 so a failed devices() call cannot leave garbage behind.
    int n_ib = 0;
    scclNet->devices(&n_ib);
    printf("device num=%d\n", n_ib);

    // Device 0 is queried below; make sure it exists first.
    if(n_ib <= 0) {
        fprintf(stderr, "no network device found\n");
        return 1;
    }

    sccl::hardware::net::scclNetProperties_t props;
    scclNet->getProperties(0, &props);
    printf("device name=%s\n", props.name);
    printf("device pciPath=%s\n", props.pciPath);
    printf("device guid=%lu\n", props.guid);
    printf("device ptrSupport=%d\n", props.ptrSupport);
    printf("device speed=%d\n", props.speed);
    printf("device port=%d\n", props.port);
    printf("device latency=%f\n", props.latency);
    printf("device maxComms=%d\n", props.maxComms);
    printf("device maxRecvs=%d\n", props.maxRecvs);

    // Finished successfully.
    return 0;
}
// HIP_VISIBLE_DEVICES=1 ./1_simple
examples/1_connection/1_rdma_comm/2_mpi_get.cpp
0 → 100644
View file @
d9d23f34
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "mpi.h"
#include "net.h"
using
namespace
sccl
;
// int main(int argc, char* argv[]) {
// INFO(SCCL_LOG_CODEALL, "Hello, World!");
// // SCCLCHECK(scclSystemError);
// // SCCLCHECK(sccl::hardware::net::device::scclIbInit());
// // SCCLCHECK(sccl::hardware::net::device::scclIbGetDevicesNum(&n_ib));
// // printf("device num=%d\n", n_ib);
// // ----------------------------------------------------------------------- //
// // auto scclNet = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB);
// auto scclNet = sccl::hardware::net::initNet(sccl::hardware::net::NET_SOCKET);
// sccl::hardware::net::scclNetProperties_t props;
// int n_ib;
// scclNet->devices(&n_ib);
// printf("device num=%d\n", n_ib);
// scclNet->getProperties(0, &props);
// printf("device name=%s\n", props.name);
// printf("device pciPath=%s\n", props.pciPath);
// printf("device guid=%lu\n", props.guid);
// printf("device ptrSupport=%d\n", props.ptrSupport);
// printf("device speed=%d\n", props.speed);
// printf("device port=%d\n", props.port);
// printf("device latency=%f\n", props.latency);
// printf("device maxComms=%d\n", props.maxComms);
// printf("device maxRecvs=%d\n", props.maxRecvs);
// // 程序成功执行,返回0
// return 0;
// }
// MPI example: every rank initialises the IB network backend, picks the
// device whose index is (rank modulo device count) and prints that device's
// properties together with the PCI bus id of the HIP device with the same
// index.
// Returns 0 on success, EXIT_FAILURE when no network device is available.
int main(int argc, char* argv[])
{
    int rank   = 0;
    int nranks = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("rank=%d, nranks=%d\n", rank, nranks);

    // ----------------------------------------------------------------------- //
    // To exercise the socket backend instead, pass sccl::hardware::net::NET_SOCKET.
    auto scclNet = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB);
    if(!scclNet) {
        fprintf(stderr, "rank=%d: failed to initialise the network backend\n", rank);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    int n_ib = 0;
    scclNet->devices(&n_ib);
    // The device count is used as a divisor below, so it must be positive.
    if(n_ib <= 0) {
        fprintf(stderr, "rank=%d: no network device found\n", rank);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    const int local_rank = rank % n_ib;
    sccl::hardware::net::scclNetProperties_t props;
    scclNet->getProperties(local_rank, &props);

    // The HIP device index mirrors the network device index.
    const int hip_dev = local_rank;
    // Pre-filled template; its size (including the terminator) is the buffer
    // size handed to hipDeviceGetPCIBusId.
    char busIdStr[] = "00000000:00:00.0";
    (void)hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hip_dev);

    printf("rank=%d/%d, n_ib=%d, device name=%s, bus_id=%s, pciPath=%s,guid=%lu, ptrSupport=%d, speed=%d, port=%d, latency=%f, maxComms=%d, maxRecvs=%d\n",
           rank,
           nranks,
           n_ib,
           props.name,
           busIdStr,
           props.pciPath,
           props.guid,
           props.ptrSupport,
           props.speed,
           props.port,
           props.latency,
           props.maxComms,
           props.maxRecvs);

    MPI_Finalize();
    return 0;
}
/*
单机执行
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root -np 8 2_mpi_get
SCCL_DEBUG_LEVEL=SCCL_LOG_INFO mpirun --allow-run-as-root -np 8 2_mpi_get
跨机执行
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root --hostfile hostfile -np 16 ./2_mpi_get
*/
examples/1_connection/1_rdma_comm/3_rdma_info.cpp
0 → 100644
View file @
d9d23f34
#include <stdio.h>
#include <infiniband/verbs.h>
// Walks every RDMA device returned by ibv_get_device_list(), queries port 1 of
// each and prints whether that port is connected.
// A port counts as connected when its logical state is IBV_PORT_ACTIVE and its
// physical state equals 5 (the value the original code treats as "connected").
// Devices that cannot be opened or queried are reported on stderr and skipped.
void check_network_connections()
{
    const int kPhysStateConnected = 5; // physical-state value meaning "port connected"
    const int port_num            = 1; // only port 1 is inspected

    // Fetch the device list.
    int num_devices              = 0;
    struct ibv_device** dev_list = ibv_get_device_list(&num_devices);
    if(!dev_list) {
        fprintf(stderr, "Failed to get IB device list\n");
        return;
    }

    // Visit each device in turn.
    for(int i = 0; i < num_devices; i++) {
        const char* dev_name = ibv_get_device_name(dev_list[i]);

        struct ibv_context* context = ibv_open_device(dev_list[i]);
        if(!context) {
            fprintf(stderr, "Failed to open device %s\n", dev_name);
            continue;
        }

        struct ibv_port_attr port_attr;
        if(ibv_query_port(context, port_num, &port_attr)) {
            fprintf(stderr, "Failed to query port %d attributes on device %s\n", port_num, dev_name);
            ibv_close_device(context);
            continue;
        }

        // Check both the logical and the physical port state.
        if(port_attr.state == IBV_PORT_ACTIVE && port_attr.phys_state == kPhysStateConnected) {
            printf("Device %s, Port %d is connected.\n", dev_name, port_num);
        } else {
            printf("Device %s, Port %d is not connected.\n", dev_name, port_num);
        }

        ibv_close_device(context);
    }

    ibv_free_device_list(dev_list);
}
// Entry point: runs the port check once. Command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    check_network_connections();
    return 0;
}
examples/1_connection/1_rdma_comm/compile_mpi.sh
0 → 100644
View file @
d9d23f34
#!/bin/sh
# Build the 2_mpi_get example together with the SCCL network sources.
#
# Locations can be overridden from the environment; the defaults are the paths
# this script originally hard-coded:
#   SCCL_ROOT      root of the SCCL_v1 source tree
#   SCCL_MPI_HOME  Open MPI installation prefix (include/ and lib/ are used)
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"
SCCL_MPI_HOME="${SCCL_MPI_HOME:-/public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi}"
SCCL_NET_DIR="${SCCL_ROOT}/src/hardware/net"

hipcc ./2_mpi_get.cpp \
    "${SCCL_NET_DIR}/device/ibvsymbols.cpp" \
    "${SCCL_NET_DIR}/device/ibvwrap.cpp" \
    "${SCCL_NET_DIR}/device/net_ib.cpp" \
    "${SCCL_NET_DIR}/host/socket.cpp" \
    "${SCCL_NET_DIR}/host/net_socket.cpp" \
    "${SCCL_NET_DIR}/net_utils.cpp" \
    "${SCCL_NET_DIR}/rocm_wrap.cpp" \
    "${SCCL_ROOT}/src/utils/param.cpp" \
    -o 2_mpi_get \
    -std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ \
    -I ./ -I /usr/include -I /opt/dtk/include \
    -I "${SCCL_MPI_HOME}/include/" \
    -I "${SCCL_ROOT}/src" \
    -I "${SCCL_ROOT}/src/utils/" \
    -I "${SCCL_ROOT}/src/include/" \
    -I "${SCCL_NET_DIR}/device/" \
    -I "${SCCL_NET_DIR}/host/" \
    -I "${SCCL_NET_DIR}/" \
    -L "${SCCL_ROOT}" \
    -L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
    -L "${SCCL_MPI_HOME}/lib" -lmpi
examples/1_connection/1_rdma_comm/compile_rdma_info.sh
0 → 100644
View file @
d9d23f34
# Build the 3_rdma_info example and link it against libibverbs and librdmacm.
hipcc ./3_rdma_info.cpp \
    -o 3_rdma_info \
    -std=c++17 \
    -g -O3 \
    -fopenmp \
    -DROC_SHMEM \
    -D__HIP_PLATFORM_HCC__ \
    -I ./ \
    -I /usr/include \
    -I /opt/dtk/include \
    -L /usr/lib/x86_64-linux-gnu \
    -libverbs \
    -lrdmacm
examples/1_connection/1_rdma_comm/compile_simple.sh
0 → 100644
View file @
d9d23f34
#!/bin/sh
# Build the 1_simple example together with the SCCL network sources.
#
# SCCL_ROOT (root of the SCCL_v1 source tree) can be overridden from the
# environment; the default is the path this script originally hard-coded.
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"
SCCL_NET_DIR="${SCCL_ROOT}/src/hardware/net"

hipcc ./1_simple.cpp \
    "${SCCL_NET_DIR}/device/ibvsymbols.cpp" \
    "${SCCL_NET_DIR}/device/ibvwrap.cpp" \
    "${SCCL_NET_DIR}/device/net_ib.cpp" \
    "${SCCL_NET_DIR}/host/socket.cpp" \
    "${SCCL_NET_DIR}/host/net_socket.cpp" \
    "${SCCL_NET_DIR}/net_utils.cpp" \
    "${SCCL_NET_DIR}/rocm_wrap.cpp" \
    "${SCCL_ROOT}/src/utils/param.cpp" \
    -o 1_simple \
    -std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ \
    -I ./ -I /usr/include -I /opt/dtk/include \
    -I "${SCCL_ROOT}/src" \
    -I "${SCCL_ROOT}/src/utils/" \
    -I "${SCCL_ROOT}/src/include/" \
    -I "${SCCL_NET_DIR}/device/" \
    -I "${SCCL_NET_DIR}/host/" \
    -I "${SCCL_NET_DIR}/" \
    -L "${SCCL_ROOT}" \
    -L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm
examples/1_connection/1_rdma_comm/hostfile
0 → 100644
View file @
d9d23f34
node037 slots=8
node038 slots=8
\ No newline at end of file
examples/1_connection/3_socket_comm/client.cpp
0 → 100644
View file @
d9d23f34
#include <iostream>
#include <string>
#include <cstring>
#include <unistd.h>
#include <arpa/inet.h>
// Connects to a TCP server, sends one greeting and prints the reply.
//   server_ip   - IPv4 address in dotted-decimal text form
//   server_port - TCP port of the server
// Any failure (socket creation, address parsing, connect, send, read) is
// reported on stderr and terminates the process with EXIT_FAILURE.
void start_client(const std::string& server_ip, int server_port)
{
    char buffer[1024]         = {0};
    const std::string message = "你好,服务器!";

    // Create the socket file descriptor.
    const int sock = socket(AF_INET, SOCK_STREAM, 0);
    if(sock < 0) {
        std::cerr << "Socket creation error" << std::endl;
        exit(EXIT_FAILURE);
    }

    // Zero the whole structure so no field is left with stack garbage.
    struct sockaddr_in serv_addr;
    memset(&serv_addr, 0, sizeof(serv_addr));
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_port   = htons(server_port);

    // Convert the textual IPv4 address into its binary form.
    if(inet_pton(AF_INET, server_ip.c_str(), &serv_addr.sin_addr) <= 0) {
        std::cerr << "Invalid address/ Address not supported" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }

    // Connect to the server.
    if(connect(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
        std::cerr << "Connection Failed" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }

    // Send the greeting.
    if(send(sock, message.c_str(), message.length(), 0) < 0) {
        std::cerr << "Send failed" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }
    std::cout << "消息已发送" << std::endl;

    // Receive the reply. One byte is held back so the data can always be
    // NUL-terminated before it is streamed as a C string.
    const ssize_t valread = read(sock, buffer, sizeof(buffer) - 1);
    if(valread < 0) {
        std::cerr << "Read failed" << std::endl;
        close(sock);
        exit(EXIT_FAILURE);
    }
    buffer[valread] = '\0';
    std::cout << "收到的响应: " << buffer << std::endl;

    // Close the connection.
    close(sock);
}
// Entry point: talks to the server at the fixed address 10.16.1.37:6842.
int main()
{
    start_client("10.16.1.37", 6842);
    return 0;
}
\ No newline at end of file
examples/1_connection/3_socket_comm/compile.sh
0 → 100644
View file @
d9d23f34
#!/bin/sh
# Build test_socket_itf from the local socket sources.
#
# SCCL_ROOT (root of the SCCL_v1 source tree) can be overridden from the
# environment; the default is the path this script originally hard-coded.
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"

hipcc ./test_socket_itf.cpp \
    ./socket.cpp \
    -o test_socket_itf \
    -std=c++17 --offload-arch=gfx936 -g -O3 -fopenmp -D__HIP_PLATFORM_HCC__ \
    -I ./ \
    -I /usr/include \
    -I /opt/dtk/include \
    -I "${SCCL_ROOT}/src/" \
    -L /usr/lib/x86_64-linux-gnu -lpthread -lrt
\ No newline at end of file
examples/1_connection/3_socket_comm/get_ip.cpp
0 → 100644
View file @
d9d23f34
#include <iostream>
#include <ifaddrs.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <stdlib.h>
#include <netdb.h>
#include <unistd.h>
#include <ifaddrs.h>
#include <net/if.h>
#include <vector>
#include <utility>
#include <unordered_set>
#include <unistd.h>
#include <sys/syscall.h>
#define NI_MAXHOST 1025
// Prints the name and numeric IPv4 address of every interface entry returned
// by getifaddrs(). Entries without an address or with a non-IPv4 address are
// skipped. Exits with EXIT_FAILURE if the interface list cannot be obtained.
void get_ip_addresses()
{
    struct ifaddrs* ifaddr = NULL;
    char host[NI_MAXHOST];

    if(getifaddrs(&ifaddr) == -1) {
        perror("getifaddrs");
        exit(EXIT_FAILURE);
    }

    for(struct ifaddrs* ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
        // Only IPv4 entries are of interest.
        if(ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET)
            continue;

        // NI_NUMERICHOST: produce the numeric form of the address.
        const int rc = getnameinfo(ifa->ifa_addr, sizeof(struct sockaddr_in), host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
        if(rc != 0) {
            // `host` holds no valid text in this case, so it is not printed.
            std::cerr << "getnameinfo failed for " << ifa->ifa_name << ": " << gai_strerror(rc) << std::endl;
            continue;
        }

        std::cout << "Interface: " << ifa->ifa_name << " Address: " << host << std::endl;
    }

    freeifaddrs(ifaddr);
}
// Entry point: lists the local IPv4 interface addresses once, then exits.
int main()
{
    get_ip_addresses();
    return 0;
}
\ No newline at end of file
examples/1_connection/3_socket_comm/server.cpp
0 → 100644
View file @
d9d23f34
#include <iostream>
#include <string>
#include <cstring>
#include <unistd.h>
#include <arpa/inet.h>
#include <ifaddrs.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
// Minimal single-connection TCP echo-style server on port 6842.
// Binds to all local addresses, accepts one client, and for every chunk of
// data received prints it and answers with a fixed acknowledgement. The loop
// ends when the client closes the connection or an I/O error occurs.
// Set-up failures are reported with perror() and end the process with
// EXIT_FAILURE.
void start_server()
{
    char buffer[1024]         = {0};
    const std::string message = "消息已收到";

    // Create the socket file descriptor. socket() signals failure with -1;
    // 0 is a valid descriptor.
    const int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if(server_fd < 0) {
        perror("socket failed");
        exit(EXIT_FAILURE);
    }

    // Bind address and port.
    struct sockaddr_in address;
    memset(&address, 0, sizeof(address));
    address.sin_family      = AF_INET;
    address.sin_addr.s_addr = INADDR_ANY; // listen on every local address
    address.sin_port        = htons(6842);

    if(bind(server_fd, (struct sockaddr*)&address, sizeof(address)) < 0) {
        perror("bind failed");
        close(server_fd);
        exit(EXIT_FAILURE);
    }

    // Read back the port that was actually bound.
    socklen_t len = sizeof(address);
    if(getsockname(server_fd, (struct sockaddr*)&address, &len) == -1) {
        perror("getsockname failed");
        close(server_fd);
        exit(EXIT_FAILURE);
    }
    const int port = ntohs(address.sin_port);
    std::cout << "服务器已启动,端口: " << port << std::endl;

    // Listen for connections.
    if(listen(server_fd, 3) < 0) {
        perror("listen");
        close(server_fd);
        exit(EXIT_FAILURE);
    }
    std::cout << "等待连接..." << std::endl;

    // Accept one client connection.
    socklen_t addrlen    = sizeof(address);
    const int new_socket = accept(server_fd, (struct sockaddr*)&address, &addrlen);
    if(new_socket < 0) {
        perror("accept");
        close(server_fd);
        exit(EXIT_FAILURE);
    }

    while(true) {
        // Receive data. One byte is held back so the chunk can always be
        // NUL-terminated before it is streamed as a C string.
        const ssize_t valread = read(new_socket, buffer, sizeof(buffer) - 1);
        if(valread < 0) {
            perror("read");
            break;
        }
        if(valread == 0) {
            break; // peer closed the connection
        }
        buffer[valread] = '\0';
        std::cout << "收到的消息: " << buffer << std::endl;

        if(send(new_socket, message.c_str(), message.length(), 0) < 0) {
            perror("send");
            break;
        }
    }

    // Close the connection and the listening socket.
    close(new_socket);
    close(server_fd);
}
// Entry point: runs the server until its single client disconnects.
int main()
{
    start_server();
    return 0;
}
\ No newline at end of file
examples/1_connection/3_socket_comm/socket.cpp
0 → 100644
View file @
d9d23f34
This diff is collapsed.
Click to expand it.
examples/1_connection/3_socket_comm/socket.h
0 → 100644
View file @
d9d23f34
#pragma once
#include "debug.h"
#include "check.h"
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <fcntl.h>
#include <poll.h>
using
namespace
sccl
;
// One entry of a parsed interface list: an interface-name prefix plus a port.
// parseStringList stores -1 in `port` when the entry carries no ":port" part.
struct netIf
{
    char prefix[64]; // NUL-terminated interface name (or name prefix)
    int port;        // port number, or -1 when unspecified
};
// Per-thread switch consulted by SCCLCHECKGOTO: while it is non-zero the
// failure trace of a failing call is not logged.
static thread_local int scclDebugNoWarn = 0;

// Runs `call` through SYSCHECKVAL with a throw-away result variable.
// On failure the enclosing function returns scclSystemError.
#define SYSCHECK(call, name)             \
    do {                                 \
        int retval;                      \
        SYSCHECKVAL(call, name, retval); \
    } while(false)

// Runs `call` (retrying as described for SYSCHECKSYNC), stores its result in
// `retval`, and on -1 logs a warning and returns scclSystemError from the
// enclosing function.
#define SYSCHECKVAL(call, name, retval)                            \
    do {                                                           \
        SYSCHECKSYNC(call, name, retval);                          \
        if(retval == -1) {                                         \
            WARN("Call to " name " failed : %s", strerror(errno)); \
            return scclSystemError;                                \
        }                                                          \
    } while(false)

// Evaluates `call` into `retval`, repeating it for as long as it returns -1
// with errno set to EINTR, EWOULDBLOCK or EAGAIN.
#define SYSCHECKSYNC(call, name, retval)                                                       \
    do {                                                                                       \
        retval = call;                                                                         \
        if(retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) {      \
            INFO(SCCL_LOG_CODEALL, "Call to " name " returned %s, retrying", strerror(errno)); \
        } else {                                                                               \
            break;                                                                             \
        }                                                                                      \
    } while(true)

// The macros below keep their original trailing ';' after `while(0)` so that
// existing call sites, with or without their own ';', continue to compile.

// Returns scclSystemError from the enclosing function when `statement`
// evaluates to `value`.
#define EQCHECK(statement, value)                                                                             \
    do {                                                                                                      \
        if((statement) == value) {                                                                            \
            /* Print the back trace*/                                                                         \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, scclSystemError, strerror(errno)); \
            return scclSystemError;                                                                           \
        }                                                                                                     \
    } while(0);

// Sets RES to scclSystemError and jumps to `label` when `statement` does NOT
// evaluate to `value`.
#define NEQCHECKGOTO(statement, value, RES, label)                                                \
    do {                                                                                          \
        if((statement) != value) {                                                                \
            /* Print the back trace*/                                                             \
            RES = scclSystemError;                                                                \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
            goto label;                                                                           \
        }                                                                                         \
    } while(0);

// Sets RES to scclSystemError and jumps to `label` when `statement`
// evaluates to -1.
#define SYSCHECKGOTO(statement, RES, label)                                                       \
    do {                                                                                          \
        if((statement) == -1) {                                                                   \
            /* Print the back trace*/                                                             \
            RES = scclSystemError;                                                                \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
            goto label;                                                                           \
        }                                                                                         \
    } while(0);

// Stores the result of `call` in RES and jumps to `label` unless it is
// scclSuccess or scclInProgress. The failure trace is logged only while
// scclDebugNoWarn is 0; a passing check is always logged.
#define SCCLCHECKGOTO(call, RES, label)                                            \
    do {                                                                           \
        RES = call;                                                                \
        if(RES != scclSuccess && RES != scclInProgress) {                          \
            /* Print the back trace*/                                              \
            if(scclDebugNoWarn == 0)                                               \
                INFO(SCCL_LOG_CODEALL, "%s:%d -> %d", __FILE__, __LINE__, RES);    \
            goto label;                                                            \
        }                                                                          \
        INFO(SCCL_LOG_CODEALL, "check pass %s:%d -> %d", __FILE__, __LINE__, RES); \
    } while(0);

// Sets RES to scclSystemError and jumps to `label` when `statement`
// evaluates to `value`.
#define EQCHECKGOTO(statement, value, RES, label)                                                 \
    do {                                                                                          \
        if((statement) == value) {                                                                \
            /* Print the back trace*/                                                             \
            RES = scclSystemError;                                                                \
            INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
            goto label;                                                                           \
        }                                                                                         \
    } while(0);
// Splits a comma-separated list such as "eth0,ib:1,mlx5_0" into netIf entries.
//   string  - list to parse; NULL yields 0 entries
//   ifList  - output array with room for at least maxList entries
//   maxList - capacity of ifList
// Each item is "<prefix>" or "<prefix>:<port>". The port is converted with
// atoi(); items without a port get port -1. Empty items are skipped.
// A prefix longer than the prefix buffer is truncated to fit.
// Returns the number of entries written (at most maxList).
static int parseStringList(const char* string, struct netIf* ifList, int maxList)
{
    // Nothing to parse, or nowhere to store a result.
    if(!string || !ifList || maxList <= 0)
        return 0;

    // Longest prefix that still leaves room for the terminating '\0'.
    const int maxPrefixLen = (int)sizeof(ifList[0].prefix) - 1;

    const char* ptr = string;
    int ifNum       = 0; // entries completed so far
    int ifC         = 0; // characters stored in the current prefix
    char c;
    do {
        c = *ptr;
        if(c == ':') {
            // End of the prefix; a port number follows.
            if(ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port        = atoi(ptr + 1);
                ifNum++;
                ifC = 0;
            }
            // Skip the port text up to the next separator or the end.
            while(c != ',' && c != '\0')
                c = *(++ptr);
        } else if(c == ',' || c == '\0') {
            // End of an item that has no port.
            if(ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port        = -1;
                ifNum++;
                ifC = 0;
            }
        } else if(ifC < maxPrefixLen) {
            ifList[ifNum].prefix[ifC] = c;
            ifC++;
        }
        // Characters beyond maxPrefixLen are dropped rather than written past
        // the end of the prefix buffer.
        ptr++;
    } while(ifNum < maxList && c);
    return ifNum;
}
// Compares an interface name against a reference name.
//   matchExact == true  : the two strings must be equal (the comparison
//                         includes the terminating '\0' of `string`)
//   matchExact == false : `string` only has to start with `ref`
static bool matchIf(const char* string, const char* ref, bool matchExact)
{
    int compareLen;
    if(matchExact) {
        compareLen = strlen(string) + 1;
    } else {
        compareLen = strlen(ref);
    }
    const bool same = (strncmp(string, ref, compareLen) == 0);
    return same;
}
// Two ports match when they are equal or when either one is the wildcard -1.
static bool matchPort(const int port1, const int port2)
{
    return port1 == -1 || port2 == -1 || port1 == port2;
}
static
bool
matchIfList
(
const
char
*
string
,
int
port
,
struct
netIf
*
ifList
,
int
listSize
,
bool
matchExact
)
{
// Make an exception for the case where no user list is defined
if
(
listSize
==
0
)
return
true
;
for
(
int
i
=
0
;
i
<
listSize
;
i
++
)
{
if
(
matchIf
(
string
,
ifList
[
i
].
prefix
,
matchExact
)
&&
matchPort
(
port
,
ifList
[
i
].
port
))
{
return
true
;
}
}
return
false
;
}
// Sizing constants. NOTE(review): presumably the maximum number of interfaces
// and the per-interface name buffer size passed to the scclFindInterfaces*
// functions — confirm against socket.cpp.
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
// Connection retry tuning. Note that RETRY_REFUSED_TIMES is written as a
// floating-point literal (2e4).
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
// Buffer size large enough for a host name plus a service name (netdb.h limits).
#define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV)
// Default value of the `magic` argument of scclSocketInit.
#define SCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
// Socket address storage that can be viewed as a generic, an IPv4 or an IPv6
// address.
union scclSocketAddress
{
    struct sockaddr sa;       // generic view
    struct sockaddr_in sin;   // IPv4 view
    struct sockaddr_in6 sin6; // IPv6 view
};
// States a scclSocket can be in. The numeric values are fixed explicitly;
// scclSocketStateNum is the number of states.
enum scclSocketState
{
    scclSocketStateNone           = 0,
    scclSocketStateInitialized    = 1,
    scclSocketStateAccepting      = 2,
    scclSocketStateAccepted       = 3,
    scclSocketStateConnecting     = 4,
    scclSocketStateConnectPolling = 5,
    scclSocketStateConnected      = 6,
    scclSocketStateReady          = 7,
    scclSocketStateClosed         = 8,
    scclSocketStateError          = 9,
    scclSocketStateNum            = 10
};
// Tag recording what a scclSocket is used for; scclSocketTypeUnknown is the
// default used by scclSocketInit.
enum scclSocketType
{
    scclSocketTypeUnknown   = 0,
    scclSocketTypeBootstrap = 1,
    scclSocketTypeProxy     = 2,
    scclSocketTypeNetSocket = 3,
    scclSocketTypeNetIb     = 4
};
// Bookkeeping for one socket handled by the scclSocket* functions.
// NOTE(review): the per-field notes marked "presumably" are inferred from the
// field names and the prototypes in this header — confirm against socket.cpp.
struct scclSocket
{
    int fd;                         // socket file descriptor (set by listen/connect/accept per the prototype comments)
    int acceptFd;                   // presumably the listening descriptor used while accepting
    int timedOutRetries;            // presumably counts retries after a timed-out connect
    int refusedRetries;             // presumably counts retries after a refused connect
    union scclSocketAddress addr;   // local or remote address, depending on the call
    volatile uint32_t* abortFlag;   // optional flag supplied to scclSocketInit (may be NULL)
    int asyncFlag;                  // asynchronous-mode flag supplied to scclSocketInit
    enum scclSocketState state;     // current connection state
    int salen;                      // presumably the length of the address held in `addr`
    uint64_t magic;                 // magic number supplied to scclSocketInit
    enum scclSocketType type;       // usage tag supplied to scclSocketInit
};
const
char
*
scclSocketToString
(
union
scclSocketAddress
*
addr
,
char
*
buf
,
const
int
numericHostForm
=
1
);
scclResult_t
scclSocketGetAddrFromString
(
union
scclSocketAddress
*
ua
,
const
char
*
ip_port_pair
);
int
scclFindInterfaceMatchSubnet
(
char
*
ifNames
,
union
scclSocketAddress
*
localAddrs
,
union
scclSocketAddress
*
remoteAddr
,
int
ifNameMaxSize
,
int
maxIfs
);
int
scclFindInterfaces
(
char
*
ifNames
,
union
scclSocketAddress
*
ifAddrs
,
int
ifNameMaxSize
,
int
maxIfs
);
// Initialize a socket.
// Every argument except `sock` is optional: no address, the default magic
// SCCL_SOCKET_MAGIC, type scclSocketTypeUnknown, no abort flag, asyncFlag 0.
scclResult_t scclSocketInit(struct scclSocket* sock,
                            union scclSocketAddress* addr = NULL,
                            uint64_t magic = SCCL_SOCKET_MAGIC,
                            enum scclSocketType type = scclSocketTypeUnknown,
                            volatile uint32_t* abortFlag = NULL,
                            int asyncFlag = 0);

// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
scclResult_t scclSocketListen(struct scclSocket* sock);

// Copy the address associated with `sock` into `addr`.
scclResult_t scclSocketGetAddr(struct scclSocket* sock, union scclSocketAddress* addr);

// Connect to sock->addr. sock->fd is set after a successful call.
// NOTE(review): the exact effect of `portReuse` (default 0) is defined by the implementation -- confirm.
scclResult_t scclSocketConnect(struct scclSocket* sock, int portReuse = 0);

// Return socket connection state.
// The state is reported through `*running`.
scclResult_t scclSocketReady(struct scclSocket* sock, int* running);

// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
scclResult_t scclSocketAccept(struct scclSocket* sock, struct scclSocket* ulistenSock);

// Read the file descriptor of `sock` into `*fd`.
scclResult_t scclSocketGetFd(struct scclSocket* sock, int* fd);

// Store `fd` as the file descriptor of `sock`.
scclResult_t scclSocketSetFd(int fd, struct scclSocket* sock);
// Values of the `op` argument of scclSocketProgress() / scclSocketWait().
#define SCCL_SOCKET_SEND 0
#define SCCL_SOCKET_RECV 1

// Advance a send or receive of `size` bytes at `ptr`; `*offset` tracks how many
// bytes have been transferred so far. Callers in this code base treat an
// unchanged `*offset` as "not ready yet" and call again later.
// NOTE(review): presumably non-blocking -- confirm against the implementation.
scclResult_t scclSocketProgress(int op, struct scclSocket* sock, void* ptr, int size, int* offset);

// Same arguments as scclSocketProgress().
// NOTE(review): presumably blocks until `*offset` reaches `size`; callers use it
// to finish a partially completed transfer -- confirm.
scclResult_t scclSocketWait(int op, struct scclSocket* sock, void* ptr, int size, int* offset);

// Send `size` bytes from `ptr` over `sock`.
// NOTE(review): presumably blocking until everything is sent -- confirm.
scclResult_t scclSocketSend(struct scclSocket* sock, void* ptr, int size);

// Receive `size` bytes into `ptr` from `sock`.
// NOTE(review): presumably blocking until everything is received -- confirm.
scclResult_t scclSocketRecv(struct scclSocket* sock, void* ptr, int size);

// Attempt to receive `size` bytes into `ptr`.
// NOTE(review): presumably `*closed` reports that the peer closed the connection
// and `blocking` selects whether to wait for data -- confirm.
scclResult_t scclSocketTryRecv(struct scclSocket* sock, void* ptr, int size, int* closed, bool blocking);

// Close `sock`.
scclResult_t scclSocketClose(struct scclSocket* sock);
examples/1_connection/3_socket_comm/test_socket_itf.cpp
0 → 100644
View file @
d9d23f34
#include "socket.h"
#include "debug.h"
#include "check.h"
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <fcntl.h>
#include <poll.h>
using
namespace
sccl
;
// Capacity of scclNetSocketComm::requests, and the per-socket multiplier used
// when sizing each helper thread's task queue.
#define MAX_REQUESTS 8
// Capacity of scclNetSocketComm::helperThread and ::threadResources.
#define MAX_THREADS 16
// Capacity of scclNetSocketComm::socks and scclNetSocketRequest::tasks.
#define MAX_SOCKETS 64
// One chunk of a request, progressed by a helper thread on a single data socket.
struct scclNetSocketTask {
    int op;                  // SCCL_SOCKET_SEND or SCCL_SOCKET_RECV, forwarded to scclSocketProgress
    void* data;              // start of this chunk inside the request buffer
    int size;                // chunk length in bytes
    struct scclSocket* sock; // data socket carrying this chunk
    int offset;              // bytes transferred so far; the task is complete when offset == size
    int used;                // 0 = free slot, 1 = claimed by scclNetSocketGetTask
    scclResult_t result;     // result of the latest scclSocketProgress call on this task
};
struct
scclNetSocketTaskQueue
{
int
next
;
int
len
;
struct
scclNetSocketTask
*
tasks
;
};
// A send/receive request as polled by scclNetSocketTest().
struct scclNetSocketRequest {
    int op;                                       // SCCL_SOCKET_SEND or SCCL_SOCKET_RECV
    void* data;                                   // user buffer
    int size;                                     // buffer size; replaced by the exchanged message size
    struct scclSocket* ctrlSock;                  // socket used for the size exchange (and the data when there are no subtasks)
    int offset;                                   // bytes transferred on ctrlSock when the calling thread moves the data
    int used;                                     // 1 = size exchange pending, 2 = size exchanged, 0 = completed
    struct scclNetSocketComm* comm;               // owning communicator
    struct scclNetSocketTask* tasks[MAX_SOCKETS]; // subtasks created for this request
    int nSubs;                                    // number of valid entries in `tasks`
};
// State shared between one helper thread and the threads that queue work for it.
struct scclNetSocketThreadResources {
    struct scclNetSocketTaskQueue threadTaskQueue; // tasks serviced by this helper thread
    int stop;                                      // non-zero makes persistentSocketThread return
    struct scclNetSocketComm* comm;                // communicator the thread works for
    pthread_mutex_t threadLock;                    // held while advancing threadTaskQueue.next and while waiting
    pthread_cond_t threadCond;                     // signalled when a new task is queued
};
// Communicator: a control socket, a set of data sockets and the helper threads
// that progress tasks on them.
struct scclNetSocketComm {
    struct scclSocket ctrlSock;                                        // control socket
    struct scclSocket socks[MAX_SOCKETS];                              // data sockets; the first nSocks are in use
    int dev;                                                           // device index (only used in the helper-thread name in this file)
    int hipDev;                                                        // HIP device index (only used in the helper-thread name in this file)
    int nSocks;                                                        // number of data sockets
    int nThreads;                                                      // number of helper threads
    int nextSock;                                                      // index of the socket assigned to the next task (wraps modulo nSocks)
    struct scclNetSocketRequest requests[MAX_REQUESTS];                // request slots
    pthread_t helperThread[MAX_THREADS];                               // helper thread handles, created lazily
    struct scclNetSocketThreadResources threadResources[MAX_THREADS];  // per-thread queue and synchronisation state
};
// Integer division of x by y rounded up (for positive operands). Both arguments
// are evaluated more than once, so do not pass expressions with side effects.
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
// Lower bound (64 KiB) on the size of a subtask when a request is split across sockets.
#define MIN_CHUNKSIZE (64 * 1024)
/**
 * Allocate a zero-initialised array of `nelem` objects of type T.
 *
 * @param ptr       receives the new array on success; untouched on failure
 * @param nelem     number of elements to allocate
 * @param filefunc  caller location, reported when the allocation fails
 * @param line      caller line number, reported when the allocation fails
 * @return scclSuccess, or scclSystemError when the memory cannot be obtained
 *
 * Normally invoked through the scclCalloc() macro, which fills in the caller location.
 */
template <typename T>
scclResult_t scclCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
    // calloc() zero-fills the block and fails cleanly when nelem * sizeof(T)
    // would overflow, instead of silently allocating a too-small buffer.
    void* p = calloc(nelem, sizeof(T));
    if (p == NULL) {
        WARN("Failed to malloc %zu elements of %zu bytes (%s:%d)", nelem, sizeof(T), filefunc, line);
        return scclSystemError;
    }
    *ptr = (T*)p;
    return scclSuccess;
}
// Convenience wrapper: scclCalloc(&ptr, nelem) records the call site automatically.
#define scclCalloc(...) scclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
/**
 * Give `thread` a printf-style formatted name.
 *
 * The formatted text is truncated to fit a 16-byte buffer (15 characters plus
 * the terminating NUL). Compiles to a no-op when _GNU_SOURCE is not defined.
 *
 * @param thread  thread to rename
 * @param fmt     printf-style format string, followed by its arguments
 */
void scclSetThreadName(pthread_t thread, const char* fmt, ...) {
#ifdef _GNU_SOURCE
    char label[16];
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(label, sizeof(label), fmt, ap);
    va_end(ap);

    pthread_setname_np(thread, label);
#endif
}
/**
 * Helper-thread entry point: progresses the socket tasks queued for this thread.
 *
 * The queue is scanned in groups of `nSocks / nThreads` slots. Each group is
 * progressed repeatedly until none of its claimed tasks has bytes left, then the
 * scan moves on. When a full scan finds nothing to do, the thread sleeps on the
 * condition variable until a new task is queued or `stop` is set.
 *
 * @param args_  pointer to this thread's scclNetSocketThreadResources
 * @return always NULL; the thread exits when `stop` is set or when a task fails
 *         (the failure is recorded in that task's `result`)
 */
void* persistentSocketThread(void* args_) {
    struct scclNetSocketThreadResources* resource = (struct scclNetSocketThreadResources*)args_;
    struct scclNetSocketComm* comm = resource->comm;
    struct scclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue;
    int nSocksPerThread = comm->nSocks / comm->nThreads;
    // With fewer sockets than threads the integer division yields 0; a zero
    // stride would make the scan loop below spin forever without ever reaching
    // the idle wait or the stop check.
    if (nSocksPerThread < 1) nSocksPerThread = 1;
    while (1) {
        int idle = 1;
        int mark = myQueue->next; // mark newest task seen
        for (int i = 0; i < myQueue->len; i += nSocksPerThread) {
            int repeat;
            do {
                repeat = 0;
                // `i + j < len` keeps the last group inside the queue when len
                // is not a multiple of the group size.
                for (int j = 0; j < nSocksPerThread && i + j < myQueue->len; j++) {
                    struct scclNetSocketTask* r = myQueue->tasks + i + j;
                    if (r != NULL && r->used == 1 && r->offset < r->size) {
                        r->result = scclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
                        if (r->result != scclSuccess) {
                            WARN("NET/Socket : socket progress error");
                            return NULL;
                        }
                        idle = 0;
                        if (r->offset < r->size) repeat = 1;
                    }
                }
            } while (repeat);
        }
        if (idle) {
            pthread_mutex_lock(&resource->threadLock);
            while (mark == myQueue->next && resource->stop == 0) {
                // no new tasks, wait
                pthread_cond_wait(&resource->threadCond, &resource->threadLock);
            }
            pthread_mutex_unlock(&resource->threadLock);
        }
        if (resource->stop) return NULL;
    }
}
/**
 * Claim a task slot for one chunk of a request and wake the helper thread that owns it.
 *
 * The chunk is bound to data socket `comm->nextSock` and queued on helper thread
 * `nextSock % nThreads`. On the first use of a helper thread its task queue,
 * mutex, condition variable and the thread itself are created.
 *
 * @param comm  communicator providing the sockets and helper threads
 * @param op    SCCL_SOCKET_SEND or SCCL_SOCKET_RECV
 * @param data  start of the chunk
 * @param size  chunk length in bytes
 * @param req   receives the claimed task on success
 * @return scclSuccess; scclInternalError when the communicator has no
 *         sockets/threads or the next slot is still in use; scclSystemError
 *         when the helper thread cannot be created; or the allocation error
 */
scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void* data, int size, struct scclNetSocketTask** req) {
    // Both counts are used as divisors below.
    if (comm->nThreads <= 0 || comm->nSocks <= 0) {
        WARN("NET/Socket : unable to allocate subtasks");
        return scclInternalError;
    }
    int tid = comm->nextSock % comm->nThreads;
    struct scclNetSocketThreadResources* res = comm->threadResources + tid;
    struct scclNetSocketTaskQueue* queue = &res->threadTaskQueue;
    // create helper threads and prepare per-thread task queue
    if (queue->tasks == NULL) {
        // each request can be divided up to nSocks tasks, and
        // these tasks are distributed to nThreads threads,
        // we need to make sure each thread queue has enough slots for MAX_REQUESTS
        queue->len = MAX_REQUESTS * DIVUP(comm->nSocks, comm->nThreads);
        SCCLCHECK(scclCalloc(&queue->tasks, queue->len));
        queue->next = 0;
        res->comm = comm;
        pthread_mutex_init(&res->threadLock, NULL);
        pthread_cond_init(&res->threadCond, NULL);
        int err = pthread_create(comm->helperThread + tid, NULL, persistentSocketThread, res);
        if (err != 0) {
            // Without a helper thread nobody would ever progress this queue.
            // Undo the setup so a later call can retry instead of queueing
            // tasks that never complete.
            WARN("NET/Socket : unable to create helper thread %d (error %d)", tid, err);
            pthread_cond_destroy(&res->threadCond);
            pthread_mutex_destroy(&res->threadLock);
            free(queue->tasks);
            queue->tasks = NULL;
            return scclSystemError;
        }
        scclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == SCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->hipDev);
    }
    struct scclNetSocketTask* r = queue->tasks + queue->next;
    if (r->used == 0) {
        r->op = op;
        r->data = data;
        r->size = size;
        r->sock = comm->socks + comm->nextSock;
        r->offset = 0;
        r->result = scclSuccess;
        comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
        r->used = 1;
        *req = r;
        // Publish the new slot and wake the helper thread if it is waiting.
        pthread_mutex_lock(&res->threadLock);
        queue->next = (queue->next + 1) % queue->len;
        pthread_cond_signal(&res->threadCond);
        pthread_mutex_unlock(&res->threadLock);
        return scclSuccess;
    }
    WARN("NET/Socket : unable to allocate subtasks");
    return scclInternalError;
}
/**
 * @brief Poll a socket request and drive it towards completion.
 *
 * The work done depends on the request state stored in `used`:
 * - used == 1: exchange the message size over the control socket. Once the size
 *   is known the request moves to state 2 and, when the communicator has data
 *   sockets, is split into subtasks that helper threads progress.
 * - used == 2: check the subtasks for completion or, when there are none,
 *   progress the transfer on the control socket from the calling thread.
 * Any other value of `used` is left untouched and the call reports "not done".
 * On completion `used` is reset to 0 and the subtask slots are released.
 *
 * @param request pointer to a scclNetSocketRequest
 * @param done    output: 1 when the request has completed, 0 otherwise
 * @param size    optional output (may be NULL): number of bytes transferred, written on completion
 * @return scclSuccess, scclInternalError for a NULL request, scclInvalidUsage
 *         when an incoming message is larger than the receive buffer, or the
 *         error reported by a socket operation or subtask
 */
scclResult_t scclNetSocketTest(void* request, int* done, int* size) {
    *done = 0;
    struct scclNetSocketRequest* r = (struct scclNetSocketRequest*)request;
    if (r == NULL) {
        INFO(SCCL_LOG_CODEALL, "NET/Socket : test called with NULL request");
        return scclInternalError;
    }
    INFO(SCCL_LOG_CODEALL, "NET/Socket : test called request used:%d\n", r->used);
    if (r->used == 1) { /* try to send/recv size */
        int data = r->size;
        int offset = 0;
        SCCLCHECK(scclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset));
        if (offset == 0) return scclSuccess; /* Not ready -- retry later */
        // Not sure we could ever receive less than 4 bytes, but just in case ...
        if (offset < (int)sizeof(int)) SCCLCHECK(scclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset));
        // Check size is less or equal to the size provided by the user
        if (r->op == SCCL_SOCKET_RECV && data > r->size) {
            char line[SOCKET_NAME_MAXLEN + 1];
            union scclSocketAddress addr;
            scclSocketGetAddr(r->ctrlSock, &addr);
            WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, "
                 "there may be a mismatch in collective sizes or environment settings (e.g. SCCL_PROTO, SCCL_ALGO) between ranks",
                 scclSocketToString(&addr, line),
                 data,
                 r->size);
            return scclInvalidUsage;
        }
        r->size = data;
        r->offset = 0;
        // Count subtasks as they are created, so nSubs never refers to stale
        // entries if creating one of them fails part-way through.
        r->nSubs = 0;
        r->used = 2; // done exchanging size
        // divide into subtasks; a request without a communicator (or without
        // data sockets) is progressed by the calling thread instead
        if (r->comm != NULL && r->comm->nSocks > 0) {
            // each request can be divided up to nSocks tasks
            int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
            int chunkOffset = 0;
            while (chunkOffset < r->size) {
                int chunkSize = std::min(taskSize, r->size - chunkOffset);
                SCCLCHECK(scclNetSocketGetTask(r->comm, r->op, (char*)(r->data) + chunkOffset, chunkSize, r->tasks + r->nSubs));
                r->nSubs++;
                chunkOffset += chunkSize;
            }
        }
    }
    if (r->used == 2) { // already exchanged size
        if (r->nSubs > 0) {
            int nCompleted = 0;
            for (int i = 0; i < r->nSubs; i++) {
                struct scclNetSocketTask* sub = r->tasks[i];
                if (sub->result != scclSuccess) return sub->result;
                if (sub->offset == sub->size) nCompleted++;
            }
            if (nCompleted == r->nSubs) {
                if (size) *size = r->size;
                *done = 1;
                r->used = 0;
                // hand the task slots back to their queues
                for (int i = 0; i < r->nSubs; i++) {
                    struct scclNetSocketTask* sub = r->tasks[i];
                    sub->used = 0;
                }
            }
        } else { // progress request using main thread
            if (r->offset < r->size) {
                SCCLCHECK(scclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset));
            }
            if (r->offset == r->size) {
                if (size) *size = r->size;
                *done = 1;
                r->used = 0;
            }
        }
    }
    return scclSuccess;
}
int
main
(
int
argc
,
char
*
argv
[])
{
struct
scclNetSocketRequest
*
request
=
(
struct
scclNetSocketRequest
*
)
malloc
(
sizeof
(
struct
scclNetSocketRequest
));
request
->
op
=
SCCL_SOCKET_SEND
;
request
->
used
=
1
;
request
->
size
=
1024
;
request
->
data
=
(
char
*
)
malloc
(
request
->
size
);
request
->
ctrlSock
=
NULL
;
request
->
comm
=
NULL
;
request
->
nSubs
=
0
;
int
done
;
int
sizes
[
32
];
printf
(
"test
\n
"
);
INFO
(
SCCL_LOG_CODEALL
,
"test INFO"
);
SCCLCHECK
(
scclSocketInit
(
request
));
SCCLCHECK
(
scclNetSocketTest
(
request
,
&
done
,
sizes
));
if
(
done
)
{
printf
(
"done
\n
"
);
}
}
\ No newline at end of file
examples/2_topo/0_demo_topo/compile_topo.sh
0 → 100644
View file @
d9d23f34
# Build the test_topo example with hipcc.
#
# The SCCL checkout location can be overridden by exporting SCCL_ROOT; when it
# is not set, the original development path is used.
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"
SCCL_SRC="${SCCL_ROOT}/src"

hipcc ./test_topo.cpp \
    "${SCCL_SRC}/hardware/topology/topo/topo.cc" \
    "${SCCL_SRC}/hardware/topology/topo/xml.cc" \
    "${SCCL_SRC}/hardware/topology/topo/rocm_smi_wrap.cc" \
    "${SCCL_SRC}/utils/utils.cc" \
    "${SCCL_SRC}/utils/archinfo.cc" \
    "${SCCL_SRC}/hardware/topology/nvmlwrap.cc" \
    "${SCCL_SRC}/hardware/net/device/ibvsymbols.cpp" \
    "${SCCL_SRC}/hardware/net/device/ibvwrap.cpp" \
    "${SCCL_SRC}/hardware/net/device/net_ib.cpp" \
    "${SCCL_SRC}/hardware/net/host/socket.cpp" \
    "${SCCL_SRC}/hardware/net/host/net_socket.cpp" \
    "${SCCL_SRC}/hardware/net/net_utils.cpp" \
    "${SCCL_SRC}/hardware/net/rocm_wrap.cpp" \
    "${SCCL_SRC}/utils/param.cpp" \
    -o test_topo \
    -std=c++17 -g -O3 -fopenmp -D__HIP_PLATFORM_HCC__ \
    -I ./ -I /usr/include -I /opt/dtk/include \
    -I "${SCCL_SRC}/include" \
    -I "${SCCL_SRC}/hardware/net/" \
    -I "${SCCL_SRC}/hardware/" \
    -I "${SCCL_SRC}/hardware/topology/" \
    -I "${SCCL_SRC}/hardware/topology/topo" \
    -I "${SCCL_SRC}/utils/" \
    -I "${SCCL_SRC}" \
    -I "${SCCL_SRC}/hardware/net/device/" \
    -I "${SCCL_SRC}/hardware/net/host/" \
    -L "${SCCL_ROOT}" \
    -L /usr/lib/x86_64-linux-gnu -L /usr/lib/ \
    -libverbs -lrdmacm -lamdhip64 -lrocm_smi64
examples/2_topo/0_demo_topo/compile_xml.sh
0 → 100644
View file @
d9d23f34
# Build the test_xml example with hipcc.
#
# The SCCL checkout location can be overridden by exporting SCCL_ROOT; when it
# is not set, the original development path is used.
SCCL_ROOT="${SCCL_ROOT:-/public/home/lishen/Code/rocSHMEM/SCCL_v1}"
SCCL_SRC="${SCCL_ROOT}/src"

hipcc ./test_xml.cpp \
    "${SCCL_SRC}/hardware/topology/topo/topo.cc" \
    "${SCCL_SRC}/hardware/topology/topo/xml.cc" \
    "${SCCL_SRC}/hardware/topology/topo/rocm_smi_wrap.cc" \
    "${SCCL_SRC}/utils/utils.cc" \
    "${SCCL_SRC}/utils/archinfo.cc" \
    "${SCCL_SRC}/hardware/topology/nvmlwrap.cc" \
    -o test_xml \
    -std=c++17 -g -O3 -fopenmp -D__HIP_PLATFORM_HCC__ \
    -I ./ -I /usr/include -I /opt/dtk/include \
    -I "${SCCL_SRC}/include" \
    -I "${SCCL_SRC}/hardware/topology/" \
    -I "${SCCL_SRC}/hardware/topology/topo" \
    -I "${SCCL_SRC}/utils/" \
    -L /usr/lib/x86_64-linux-gnu \
    -L /usr/lib/ \
    -lamdhip64 -lrocm_smi64
\ No newline at end of file
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment