Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishen01
Sccl
Commits
a4ac3320
"examples/dreambooth/train_dreambooth_lora_hidream.py" did not exist on "c375903db58826494d858e02b44d21b42669ff5e"
Commit
a4ac3320
authored
Jul 07, 2025
by
lishen
Browse files
通过线程池实现ipcsocket,满足节点内通信
parent
d9d23f34
Changes
132
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
804 additions
and
803 deletions
+804
-803
src/hardware/net/net_ib/CMakeLists.txt
src/hardware/net/net_ib/CMakeLists.txt
+0
-0
src/hardware/net/net_ib/ibvsymbols.cpp
src/hardware/net/net_ib/ibvsymbols.cpp
+2
-2
src/hardware/net/net_ib/ibvsymbols.h
src/hardware/net/net_ib/ibvsymbols.h
+2
-2
src/hardware/net/net_ib/ibvwrap.cpp
src/hardware/net/net_ib/ibvwrap.cpp
+2
-2
src/hardware/net/net_ib/ibvwrap.h
src/hardware/net/net_ib/ibvwrap.h
+2
-2
src/hardware/net/net_ib/net_ib.cpp
src/hardware/net/net_ib/net_ib.cpp
+227
-150
src/hardware/net/net_ib/net_ib.h
src/hardware/net/net_ib/net_ib.h
+137
-0
src/hardware/net/net_socket/CMakeLists.txt
src/hardware/net/net_socket/CMakeLists.txt
+0
-0
src/hardware/net/net_socket/net_socket.cpp
src/hardware/net/net_socket/net_socket.cpp
+150
-174
src/hardware/net/net_socket/net_socket.h
src/hardware/net/net_socket/net_socket.h
+168
-0
src/hardware/net/net_socket/socket.cpp
src/hardware/net/net_socket/socket.cpp
+23
-8
src/hardware/net/net_socket/socket.h
src/hardware/net/net_socket/socket.h
+2
-2
src/hardware/net/net_utils.cpp
src/hardware/net/net_utils.cpp
+16
-0
src/hardware/net/net_utils.h
src/hardware/net/net_utils.h
+65
-38
src/hardware/net/rocm_wrap.cpp
src/hardware/net/rocm_wrap.cpp
+7
-0
src/hardware/net/rocm_wrap.h
src/hardware/net/rocm_wrap.h
+1
-0
src/hardware/topo_bak/cpuset.h
src/hardware/topo_bak/cpuset.h
+0
-143
src/hardware/topo_bak/detect_topo.cc
src/hardware/topo_bak/detect_topo.cc
+0
-0
src/hardware/topo_bak/detect_topo.h
src/hardware/topo_bak/detect_topo.h
+0
-0
src/hardware/topo_bak/nvmlwrap.cc
src/hardware/topo_bak/nvmlwrap.cc
+0
-280
No files found.
src/hardware/net/
device
/CMakeLists.txt
→
src/hardware/net/
net_ib
/CMakeLists.txt
View file @
a4ac3320
File moved
src/hardware/net/
device
/ibvsymbols.cpp
→
src/hardware/net/
net_ib
/ibvsymbols.cpp
View file @
a4ac3320
...
...
@@ -7,7 +7,7 @@
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
device
{
namespace
net_ib
{
#define ASSIGN_SYM(container, symbol, name) container->name = &symbol;
...
...
@@ -102,7 +102,7 @@ scclResult_t buildIbvSymbols(struct scclIbvSymbols* ibvSymbols) {
return
scclSuccess
;
}
}
// namespace
device
}
// namespace
net_ib
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/
device
/ibvsymbols.h
→
src/hardware/net/
net_ib
/ibvsymbols.h
View file @
a4ac3320
...
...
@@ -6,7 +6,7 @@
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
device
{
namespace
net_ib
{
/* IB Verbs Function Pointers*/
struct
scclIbvSymbols
{
...
...
@@ -41,7 +41,7 @@ struct scclIbvSymbols {
/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
scclResult_t
buildIbvSymbols
(
struct
scclIbvSymbols
*
ibvSymbols
);
}
// namespace
device
}
// namespace
net_ib
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/
device
/ibvwrap.cpp
→
src/hardware/net/
net_ib
/ibvwrap.cpp
View file @
a4ac3320
...
...
@@ -13,7 +13,7 @@
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
device
{
namespace
net_ib
{
static
pthread_once_t
initOnceControl
=
PTHREAD_ONCE_INIT
;
static
scclResult_t
initResult
;
...
...
@@ -250,7 +250,7 @@ scclResult_t wrap_ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struc
return
scclSuccess
;
}
}
// namespace
device
}
// namespace
net_ib
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/
device
/ibvwrap.h
→
src/hardware/net/
net_ib
/ibvwrap.h
View file @
a4ac3320
...
...
@@ -10,7 +10,7 @@
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
device
{
namespace
net_ib
{
typedef
enum
ibv_return_enum
:
uint8_t
{
IBV_SUCCESS
=
0
,
//!< The operation was successful
...
...
@@ -112,7 +112,7 @@ scclResult_t wrap_ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struc
// 获取事件类型字符串
scclResult_t
wrap_ibv_event_type_str
(
char
**
ret
,
enum
ibv_event_type
event
);
}
// namespace
device
}
// namespace
net_ib
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/
device
/net_ib.cpp
→
src/hardware/net/
net_ib
/net_ib.cpp
View file @
a4ac3320
...
...
@@ -9,15 +9,12 @@
#include <netdb.h>
#include "net_ib.h"
#include "socket.h"
#include "rocm_wrap.h"
#include "base.h"
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
device
{
namespace
net_ib
{
///////////////////////////////////////// 环境变量读取及设置 /////////////////////////////////////////
...
...
@@ -59,11 +56,8 @@ SCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1);
///////////////////////////////////////// 参数及结构体设置 /////////////////////////////////////////
#define MAXNAMESIZE 64
#define MAX_IF_NAME_SIZE 16
static
char
scclIbIfName
[
MAX_IF_NAME_SIZE
+
1
];
static
union
host
::
scclSocketAddress
scclIbIfAddr
;
// 定义一个静态变量 scclNIbDevs,用于存储 InfiniBand 设备的数量
static
int
scclNIbDevs
=
-
1
;
static
char
scclIbIfName
[
MAX_IF_NAME_SIZE
+
1
];
// 用于存储网络接口名称的字符数组
static
union
net_socket
::
scclSocketAddress
scclIbIfAddr
;
// 定义一个联合体类型的变量,用于存储网络接口地址
struct
scclIbMr
{
uintptr_t
addr
;
// 内存地址
...
...
@@ -117,7 +111,7 @@ pthread_mutex_t scclIbLock = PTHREAD_MUTEX_INITIALIZER;
static
int
scclIbRelaxedOrderingEnabled
=
0
;
// 定义一个线程局部变量,用于存储重用的地址信息
static
thread_local
union
hos
t
::
scclSocketAddress
reusedAddr
;
static
thread_local
union
net_socke
t
::
scclSocketAddress
reusedAddr
;
// 定义一个线程局部变量,用于存储重用的套接字文件描述符
static
thread_local
int
reusedSockfd
=
-
1
;
...
...
@@ -128,7 +122,7 @@ pthread_t scclIbAsyncThread;
// 定义一个常量,表示InfiniBand网络接口的最大接收数量
static
constexpr
int
SCCL_NET_IB_MAX_RECVS
=
8
;
// 定义一个常量,表示最大字符串长度
static
constexpr
int
MAX_STR_LEN
=
8
;
static
constexpr
int
MAX_STR_LEN
=
255
;
// 为每个并发接收支持SCCL_NET_MAX_REQUESTS
static
constexpr
int
MAX_REQUESTS
=
(
SCCL_NET_MAX_REQUESTS
*
SCCL_NET_IB_MAX_RECVS
);
...
...
@@ -146,12 +140,12 @@ scclIbRequest 结构体用于封装 InfiniBand 通信请求的详细信息,包
联合体 union 根据请求类型(发送或接收)存储不同的数据结构,以支持灵活的通信操作。
*/
struct
scclIbRequest
{
struct
scclIbVerbs
*
verbs
;
// 指向 scclIbVerbs 结构体的指针,包含 Infiniband 相关的操作
int
type
;
// 请求的类型,例如发送或接收
int
events
;
// 事件标志, 用于记录请求相关的事件状态
struct
hos
t
::
scclSocket
*
sock
;
// 指向 scclSocket 结构体的指针,表示网络套接字
struct
scclIbGidInfo
*
gidInfo
;
// 指向 scclIbGidInfo 结构体的指针,包含全局标识符信息
int
nreqs
;
// 请求的数量
struct
scclIbVerbs
*
verbs
;
// 指向 scclIbVerbs 结构体的指针,包含 Infiniband 相关的操作
int
type
;
// 请求的类型,例如发送或接收
int
events
;
// 事件标志, 用于记录请求相关的事件状态
struct
net_socke
t
::
scclSocket
*
sock
;
// 指向 scclSocket 结构体的指针,表示网络套接字
struct
scclIbGidInfo
*
gidInfo
;
// 指向 scclIbGidInfo 结构体的指针,包含全局标识符信息
int
nreqs
;
// 请求的数量
// 联合体,用于存储不同类型请求的特定信息
union
{
// send: 发送请求的相关信息
...
...
@@ -195,7 +189,7 @@ struct scclIbSendComm {
struct
scclIbRequest
*
fifoReqs
[
MAX_REQUESTS
][
SCCL_NET_IB_MAX_RECVS
];
// FIFO请求指针数组
struct
ibv_send_wr
wrs
[
SCCL_NET_IB_MAX_RECVS
+
1
];
// 发送工作请求结构体数组
struct
ibv_sge
sges
[
SCCL_NET_IB_MAX_RECVS
];
// 散布-聚集元素结构体数组
struct
hos
t
::
scclSocket
sock
;
// 套接字结构体
struct
net_socke
t
::
scclSocket
sock
;
// 套接字结构体
int
ready
;
// 是否准备好
struct
ibv_qp
*
qps
[
SCCL_IB_MAX_QPS
];
// 队列对指针数组
...
...
@@ -206,33 +200,6 @@ struct scclIbSendComm {
struct
scclIbGidInfo
gidInfo
;
// GID信息结构体
};
/*IB的通信状态*/
enum
scclIbCommState
:
uint8_t
{
scclIbCommStateStart
=
0
,
// 初始状态
scclIbCommStateConnect
=
1
,
// 尝试连接状态
scclIbCommStateAccept
=
3
,
// 接受连接状态
scclIbCommStateSend
=
4
,
// 发送数据状态
scclIbCommStateRecv
=
5
,
// 接收数据状态
scclIbCommStateConnecting
=
6
,
// 正在连接状态
scclIbCommStateConnected
=
7
,
// 已连接状态
scclIbCommStatePendingReady
=
8
,
// 等待准备状态
};
/*通信的阶段*/
struct
scclIbCommStage
{
enum
scclIbCommState
state
;
// 通信阶段的状态
int
offset
;
// 数据偏移量
void
*
buffer
;
// 用于通信的缓冲区指针
void
*
comm
;
// 通信对象指针
};
/*监听通信的上下文*/
struct
scclIbListenComm
{
int
dev
;
// 设备标识符
struct
host
::
scclSocket
sock
;
// 用于网络通信的套接字
struct
scclIbCommStage
stage
;
// 通信阶段的状态
};
struct
scclIbQpInfo
{
uint32_t
lid
;
uint8_t
ib_port
;
...
...
@@ -270,7 +237,7 @@ struct scclIbRemFifo {
struct
scclIbRecvComm
{
struct
scclIbVerbs
verbs
;
struct
scclIbRemFifo
remFifo
;
struct
hos
t
::
scclSocket
sock
;
struct
net_socke
t
::
scclSocket
sock
;
int
ready
;
struct
ibv_qp
*
qps
[
SCCL_IB_MAX_QPS
];
int
nqps
;
...
...
@@ -292,7 +259,7 @@ static_assert((offsetof(struct scclIbRecvComm, remFifo) % 32) == 0, "scclIbSendC
* @param args 传入参数,应转换为ibv_context结构体指针
* @return void* 线程返回值,始终返回NULL
*/
static
void
*
scclIbAsyncThreadMain
(
void
*
args
)
{
void
*
scclNetIb
::
scclIbAsyncThreadMain
(
void
*
args
)
{
// 将传入的参数转换为InfiniBand上下文结构体指针
struct
ibv_context
*
context
=
(
struct
ibv_context
*
)
args
;
...
...
@@ -337,7 +304,7 @@ static void* scclIbAsyncThreadMain(void* args) {
* @param realPort 输出参数,记录实际端口号
* @return scclResult_t 返回操作结果,成功返回scclSuccess
*/
static
scclResult_t
scclIbGetPciPath
(
char
*
devName
,
char
**
path
,
int
*
realPort
)
{
scclResult_t
scclNetIb
::
scclIbGetPciPath
(
char
*
devName
,
char
**
path
,
int
*
realPort
)
{
// 定义一个字符数组用于存储设备路径
char
devicePath
[
PATH_MAX
];
// 构造设备路径字符串,格式为 "/sys/class/infiniband/<devName>/device"
...
...
@@ -396,14 +363,14 @@ static int firstBitSet(int val, int max) {
* @param width 输入的宽度值
* @return 返回ibvWidths数组中对应的宽度索引值
*/
static
int
scclIbWidth
(
int
width
)
{
return
ibvWidths
[
firstBitSet
(
width
,
sizeof
(
ibvWidths
)
/
sizeof
(
int
)
-
1
)];
}
int
scclNetIb
::
scclIbWidth
(
int
width
)
{
return
ibvWidths
[
firstBitSet
(
width
,
sizeof
(
ibvWidths
)
/
sizeof
(
int
)
-
1
)];
}
/**
* 根据给定的速度值查找并返回对应的IB传输速率
* @param speed 输入的速度值
* @return 返回ibvSpeeds数组中第一个匹配的IB传输速率
*/
static
int
scclIbSpeed
(
int
speed
)
{
return
ibvSpeeds
[
firstBitSet
(
speed
,
sizeof
(
ibvSpeeds
)
/
sizeof
(
int
)
-
1
)];
}
int
scclNetIb
::
scclIbSpeed
(
int
speed
)
{
return
ibvSpeeds
[
firstBitSet
(
speed
,
sizeof
(
ibvSpeeds
)
/
sizeof
(
int
)
-
1
)];
}
/**
* 检查当前IB设备是否支持宽松排序(Relaxed Ordering)模式
...
...
@@ -412,7 +379,7 @@ static int scclIbSpeed(int speed) { return ibvSpeeds[firstBitSet(speed, sizeof(i
* @note 通过查询IBVERBS_1.8 API的ibv_reg_mr_iova2函数来检测IBV_ACCESS_RELAXED_ORDERING支持
* @see scclParamIbPciRelaxedOrdering() 获取当前配置的RO模式
*/
static
int
scclIbRelaxedOrderingCapable
(
void
)
{
int
scclNetIb
::
scclIbRelaxedOrderingCapable
(
void
)
{
int
roMode
=
scclParamIbPciRelaxedOrdering
();
scclResult_t
r
=
scclInternalError
;
if
(
roMode
==
1
||
roMode
==
2
)
{
...
...
@@ -432,7 +399,7 @@ static int scclIbRelaxedOrderingCapable(void) {
* @param shownIbHcaEnv 计数器,用于控制日志输出次数
* @return char* 处理后的IB设备环境变量值
*/
static
char
*
scclIbGetIbHca
(
int
&
shownIbHcaEnv
,
bool
*
searchNot
,
bool
*
searchExact
)
{
char
*
scclNetIb
::
scclIbGetIbHca
(
int
&
shownIbHcaEnv
,
bool
*
searchNot
,
bool
*
searchExact
)
{
// 检查用户是否定义了要使用的IB设备:端口
char
*
userIbEnv
=
getenv
(
"SCCL_IB_HCA"
);
if
(
userIbEnv
!=
NULL
&&
shownIbHcaEnv
++
==
0
)
...
...
@@ -463,7 +430,7 @@ static char* scclIbGetIbHca(int& shownIbHcaEnv, bool* searchNot, bool* searchExa
* @note 缓冲区最大长度为MAX_STR_LEN,超出部分会被截断
* 文件内容末尾会自动添加字符串结束符'\0'
*/
scclResult_t
scclGetStrFromSys
(
const
char
*
path
,
const
char
*
fileName
,
char
*
strValue
)
{
scclResult_t
scclNetIb
::
scclGetStrFromSys
(
const
char
*
path
,
const
char
*
fileName
,
char
*
strValue
)
{
char
filePath
[
PATH_MAX
];
sprintf
(
filePath
,
"%s/%s"
,
path
,
fileName
);
int
offset
=
0
;
...
...
@@ -494,7 +461,7 @@ scclResult_t scclGetStrFromSys(const char* path, const char* fileName, char* str
* @param ibDev IB设备号
* @return scclResult_t 返回scclSuccess表示支持,返回scclSystemError表示不支持
*/
scclResult_t
scclIbGdrSupport
(
int
ibDev
)
{
scclResult_t
scclNetIb
::
scclIbGdrSupport
(
int
ibDev
)
{
static
int
moduleLoaded
=
-
1
;
if
(
moduleLoaded
==
-
1
)
{
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
...
...
@@ -527,7 +494,7 @@ scclResult_t scclIbGdrSupport(int ibDev) {
* @param dev 设备索引
* @return scclResult_t 返回scclSuccess表示支持,scclSystemError表示不支持
*/
scclResult_t
scclIbDmaBufSupport
(
int
dev
)
{
scclResult_t
scclNetIb
::
scclIbDmaBufSupport
(
int
dev
)
{
static
int
dmaBufSupported
=
-
1
;
if
(
dmaBufSupported
==
-
1
)
{
scclResult_t
res
;
...
...
@@ -552,9 +519,9 @@ failure:
}
struct
scclIbHandle
{
union
hos
t
::
scclSocketAddress
connectAddr
;
// Filled by the target (目标填充)
uint64_t
magic
;
// random number to help debugging (用于调试的随机数)
struct
scclIbCommStage
stage
;
// Used by the other side when connecting (连接时由另一侧使用)
union
net_socke
t
::
scclSocketAddress
connectAddr
;
// Filled by the target (目标填充)
uint64_t
magic
;
// random number to help debugging (用于调试的随机数)
struct
scclIbCommStage
stage
;
// Used by the other side when connecting (连接时由另一侧使用)
};
/**
...
...
@@ -572,7 +539,7 @@ struct scclIbHandle {
* @note 该函数会递增设备的PD引用计数,并在首次调用时为设备分配PD
* @note 创建的CQ大小为2*MAX_REQUESTS*IB_QPS_PER_CONNECTION,以支持接收请求的双重完成
*/
scclResult_t
scclIbInitVerbs
(
int
dev
,
struct
ibv_context
*
ctx
,
struct
scclIbVerbs
*
verbs
)
{
scclResult_t
scclNetIb
::
scclIbInitVerbs
(
int
dev
,
struct
ibv_context
*
ctx
,
struct
scclIbVerbs
*
verbs
)
{
verbs
->
dev
=
dev
;
pthread_mutex_lock
(
&
scclIbDevs
[
dev
].
lock
);
...
...
@@ -593,7 +560,20 @@ scclResult_t scclIbInitVerbs(int dev, struct ibv_context* ctx, struct scclIbVerb
return
scclSuccess
;
}
scclResult_t
scclIbCreateQp
(
uint8_t
ib_port
,
struct
scclIbVerbs
*
verbs
,
int
access_flags
,
struct
ibv_qp
**
qp
)
{
/**
* 创建并初始化一个InfiniBand队列对(QP)
*
* @param ib_port IB端口号
* @param verbs IB verbs结构体指针
* @param access_flags QP访问权限标志
* @param qp 输出的QP指针
*
* @return 返回scclSuccess表示成功,否则返回错误码
*
* @note QP类型为可靠连接(RC),发送队列大小为2*MAX_REQUESTS,
* 接收队列大小为MAX_REQUESTS,支持内联数据发送(如果配置启用)
*/
scclResult_t
scclNetIb
::
scclIbCreateQp
(
uint8_t
ib_port
,
struct
scclIbVerbs
*
verbs
,
int
access_flags
,
struct
ibv_qp
**
qp
)
{
struct
ibv_qp_init_attr
qpInitAttr
;
memset
(
&
qpInitAttr
,
0
,
sizeof
(
struct
ibv_qp_init_attr
));
qpInitAttr
.
send_cq
=
verbs
->
cq
;
...
...
@@ -616,7 +596,20 @@ scclResult_t scclIbCreateQp(uint8_t ib_port, struct scclIbVerbs* verbs, int acce
return
scclSuccess
;
}
scclResult_t
scclIbRtrQp
(
struct
ibv_qp
*
qp
,
uint32_t
qpn
,
struct
scclIbQpInfo
*
info
)
{
/**
* 将IB QP状态修改为RTR(Ready to Receive)状态
*
* @param qp IB QP指针
* @param qpn 目标QP号
* @param info QP配置信息,包含MTU、链路层类型、端口号等参数
*
* @return 成功返回scclSuccess,失败返回错误码
*
* @note 根据链路层类型(以太网/IB)设置不同的AH属性
* 以太网需要设置全局路由头(GRH)相关参数
* IB链路需要设置目标LID
*/
scclResult_t
scclNetIb
::
scclIbRtrQp
(
struct
ibv_qp
*
qp
,
uint32_t
qpn
,
struct
scclIbQpInfo
*
info
)
{
struct
ibv_qp_attr
qpAttr
;
memset
(
&
qpAttr
,
0
,
sizeof
(
struct
ibv_qp_attr
));
qpAttr
.
qp_state
=
IBV_QPS_RTR
;
...
...
@@ -645,7 +638,16 @@ scclResult_t scclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct scclIbQpInfo* i
return
scclSuccess
;
}
scclResult_t
scclIbRtsQp
(
struct
ibv_qp
*
qp
)
{
/**
* 将IB(InfiniBand)队列对(QP)状态修改为RTS(Ready To Send)状态
*
* @param qp IB队列对指针
* @return 成功返回scclSuccess,失败返回错误码
*
* 该函数配置QP属性并调用ibv_modify_qp将其状态改为RTS状态,
* 设置了超时时间、重试次数、RNR重试次数、SQ PSN和最大RD原子操作数等参数。
*/
scclResult_t
scclNetIb
::
scclIbRtsQp
(
struct
ibv_qp
*
qp
)
{
struct
ibv_qp_attr
qpAttr
;
memset
(
&
qpAttr
,
0
,
sizeof
(
struct
ibv_qp_attr
));
qpAttr
.
qp_state
=
IBV_QPS_RTS
;
...
...
@@ -670,7 +672,17 @@ const char* reqTypeStr[] = {"Unused", "Send", "Recv", "Flush"};
static_assert
((
offsetof
(
struct
scclIbSendComm
,
fifo
)
%
32
)
==
0
,
"scclIbSendComm fifo must be 32-byte aligned"
);
static_assert
((
sizeof
(
struct
scclIbSendFifo
)
%
32
)
==
0
,
"scclIbSendFifo element size must be 32-byte multiples"
);
scclResult_t
scclIbDestroyVerbs
(
struct
scclIbVerbs
*
verbs
)
{
/**
* @brief 销毁IB Verbs资源
*
* 释放指定的IB Verbs资源,包括完成队列(CQ)和保护域(PD)。
* 当PD的引用计数减至0时,会自动释放PD资源。
* 该函数是线程安全的,使用互斥锁保护共享资源。
*
* @param verbs 指向要销毁的IB Verbs结构体
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
scclResult_t
scclNetIb
::
scclIbDestroyVerbs
(
struct
scclIbVerbs
*
verbs
)
{
scclResult_t
res
;
SCCLCHECK
(
wrap_ibv_destroy_cq
(
verbs
->
cq
));
...
...
@@ -684,7 +696,17 @@ returning:
return
res
;
}
scclResult_t
scclIbGetRequest
(
struct
scclIbVerbs
*
verbs
,
struct
scclIbRequest
**
req
)
{
/**
* @brief 从verbs请求池中获取一个未使用的请求结构体
*
* @param verbs 指向scclIbVerbs结构体的指针,包含请求池
* @param req 输出参数,用于返回获取到的请求结构体指针
* @return scclResult_t 成功返回scclSuccess,失败返回scclInternalError
*
* 该函数遍历verbs请求池,查找第一个未使用的请求(SCCL_NET_IB_REQ_UNUSED),
* 初始化其字段后返回。如果所有请求都在使用中,则返回错误。
*/
scclResult_t
scclNetIb
::
scclIbGetRequest
(
struct
scclIbVerbs
*
verbs
,
struct
scclIbRequest
**
req
)
{
for
(
int
i
=
0
;
i
<
MAX_REQUESTS
;
i
++
)
{
struct
scclIbRequest
*
r
=
verbs
->
reqs
+
i
;
if
(
r
->
type
==
SCCL_NET_IB_REQ_UNUSED
)
{
...
...
@@ -700,14 +722,36 @@ scclResult_t scclIbGetRequest(struct scclIbVerbs* verbs, struct scclIbRequest**
*
req
=
NULL
;
return
scclInternalError
;
}
scclResult_t
scclIbFreeRequest
(
struct
scclIbRequest
*
r
)
{
/**
* 释放IB网络请求资源。
*
* 将请求类型标记为未使用状态,但不实际释放内存。
*
* @param r 要释放的IB网络请求指针
* @return 总是返回scclSuccess表示操作成功
*/
scclResult_t
scclNetIb
::
scclIbFreeRequest
(
struct
scclIbRequest
*
r
)
{
r
->
type
=
SCCL_NET_IB_REQ_UNUSED
;
return
scclSuccess
;
}
scclResult_t
scclIbTest
(
void
*
request
,
int
*
done
,
int
*
size
);
scclResult_t
scclIbMultiSend
(
struct
scclIbSendComm
*
comm
,
int
slot
)
{
/**
* @brief 执行IB网络的多发送操作
*
* 该函数处理IB网络的多发送请求,包括设置发送工作请求(WR)和分散/聚集元素(SGE),
* 并处理自适应路由(AR)和QP分割等高级功能。
*
* @param comm 指向scclIbSendComm结构的指针,包含发送通信上下文
* @param slot 要使用的发送槽位索引
* @return scclResult_t 返回操作结果,成功返回scclSuccess,失败返回错误码
*
* @note 1. 支持多QP分割发送,确保128B对齐
* 2. 使用RDMA_WRITE_WITH_IMM发送立即数据
* 3. 当请求数>32时会返回错误
* 4. 自适应路由模式下会发送两次WR
*/
scclResult_t
scclNetIb
::
scclIbMultiSend
(
struct
scclIbSendComm
*
comm
,
int
slot
)
{
struct
scclIbRequest
**
reqs
=
comm
->
fifoReqs
[
slot
];
volatile
struct
scclIbSendFifo
*
slots
=
comm
->
fifo
[
slot
];
int
nreqs
=
slots
[
0
].
nreqs
;
...
...
@@ -792,7 +836,23 @@ scclResult_t scclIbMultiSend(struct scclIbSendComm* comm, int slot) {
return
scclSuccess
;
}
scclResult_t
scclIbPostFifo
(
struct
scclIbRecvComm
*
comm
,
int
n
,
void
**
data
,
int
*
sizes
,
int
*
tags
,
void
**
mhandles
,
struct
scclIbRequest
*
req
)
{
/**
* @brief 通过IB Verbs RDMA写入操作向远程FIFO队列提交数据
*
* @param comm 指向接收通信上下文的指针
* @param n 要发送的数据块数量
* @param data 数据指针数组
* @param sizes 数据大小数组
* @param tags 数据标签数组
* @param mhandles 内存句柄数组
* @param req 请求结构体指针
* @return scclResult_t 返回操作结果(scclSuccess表示成功)
*
* @note 该函数会将数据打包到本地FIFO元素中,并通过RDMA写入到远程FIFO队列。
* 每MAX_REQUESTS次操作会触发一次带信号(SIGNALED)的发送,以避免发送队列堵塞。
* 使用IBV_WR_RDMA_WRITE操作码进行数据传输。
*/
scclResult_t
scclNetIb
::
scclIbPostFifo
(
struct
scclIbRecvComm
*
comm
,
int
n
,
void
**
data
,
int
*
sizes
,
int
*
tags
,
void
**
mhandles
,
struct
scclIbRequest
*
req
)
{
struct
ibv_send_wr
wr
;
memset
(
&
wr
,
0
,
sizeof
(
wr
));
...
...
@@ -852,10 +912,15 @@ scclResult_t scclIbPostFifo(struct scclIbRecvComm* comm, int n, void** data, int
return
scclSuccess
;
}
}
// namespace net_ib
////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// scclNetIb调用的函数 ////////////////////////////////////////
namespace
net_ib
{
scclNetIb
::
scclNetIb
()
:
scclNetBase
(
"IB"
)
{}
scclNetIb
::~
scclNetIb
()
{
if
(
ibComm
!=
nullptr
)
{
free
(
ibComm
);
}
}
/**
* @brief 初始化InfiniBand硬件设备
...
...
@@ -872,7 +937,9 @@ namespace net_ib {
* @note 函数内部会处理环境变量SCCL_IB_HCA来过滤特定设备
* @note 使用互斥锁scclIbLock保证线程安全
*/
scclResult_t
scclIbInit
(
void
)
{
scclResult_t
scclNetIb
::
init
()
{
SCCLCHECK
(
scclCalloc
(
&
ibComm
,
1
));
// 如果IB被禁用,返回内部错误
if
(
scclParamIbDisable
())
return
scclInternalError
;
...
...
@@ -894,7 +961,7 @@ scclResult_t scclIbInit(void) {
if
(
scclNIbDevs
==
-
1
)
{
scclNIbDevs
=
0
;
// 查找网络接口
if
(
hos
t
::
scclFindSocketInterfaces
(
scclIbIfName
,
&
scclIbIfAddr
,
MAX_IF_NAME_SIZE
,
1
)
!=
1
)
{
if
(
net_socke
t
::
scclFindSocketInterfaces
(
scclIbIfName
,
&
scclIbIfAddr
,
MAX_IF_NAME_SIZE
,
1
)
!=
1
)
{
WARN
(
"NET/IB : No IP interface found."
);
return
scclInternalError
;
}
...
...
@@ -1042,14 +1109,14 @@ scclResult_t scclIbInit(void) {
// line 是设备的相关信息字符串
// scclIbRelaxedOrderingEnabled 是一个布尔值,指示是否启用了Relaxed Ordering
// scclIbIfName 是IB接口的名称
//
hos
t::scclSocketToString 是一个函数,用于将socket地址转换为字符串
//
net_socke
t::scclSocketToString 是一个函数,用于将socket地址转换为字符串
// addrline 是存储转换后地址字符串的数组
INFO
(
SCCL_LOG_NET
,
"NET/IB : Using%s %s; OOB %s:%s"
,
line
,
scclIbRelaxedOrderingEnabled
?
"[RO]"
:
""
,
scclIbIfName
,
hos
t
::
scclSocketToString
(
&
scclIbIfAddr
,
addrline
));
net_socke
t
::
scclSocketToString
(
&
scclIbIfAddr
,
addrline
));
}
pthread_mutex_unlock
(
&
scclIbLock
);
}
...
...
@@ -1062,7 +1129,7 @@ scclResult_t scclIbInit(void) {
* @param ndev [out] 用于存储设备数量的指针
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
scclResult_t
sccl
IbGetD
evices
Num
(
int
*
ndev
)
{
scclResult_t
sccl
NetIb
::
d
evices
(
int
*
ndev
)
{
*
ndev
=
scclNIbDevs
;
return
scclSuccess
;
}
...
...
@@ -1077,10 +1144,11 @@ scclResult_t scclIbGetDevicesNum(int* ndev) {
* @param props 用于存储设备属性的结构体指针
* @return scclResult_t 返回操作结果,成功返回scclSuccess
*/
scclResult_t
scclIbGetProperties
(
int
dev
,
scclNetProperties_t
*
props
)
{
props
->
name
=
scclIbDevs
[
dev
].
devName
;
props
->
pciPath
=
scclIbDevs
[
dev
].
pciPath
;
props
->
guid
=
scclIbDevs
[
dev
].
guid
;
scclResult_t
scclNetIb
::
getProperties
(
int
dev
,
scclNetProperties_t
*
props
)
{
props
->
name
=
scclIbDevs
[
dev
].
devName
;
props
->
pciPath
=
scclIbDevs
[
dev
].
pciPath
;
props
->
guid
=
scclIbDevs
[
dev
].
guid
;
props
->
ptrSupport
=
SCCL_PTR_HOST
;
if
(
scclIbGdrSupport
(
dev
)
==
scclSuccess
)
{
props
->
ptrSupport
|=
SCCL_PTR_CUDA
;
// GDR support via nv_peermem
...
...
@@ -1111,41 +1179,60 @@ scclResult_t scclIbGetProperties(int dev, scclNetProperties_t* props) {
* 3. 根据配置决定是否复用套接字
* 4. 启动套接字监听并获取连接地址
*/
scclResult_t
scclIbListen
(
int
dev
,
void
*
opaqueHandle
,
void
**
listenComm
)
{
// 创建并初始化通信结构体
struct
scclIbListenComm
*
comm
;
SCCLCHECK
(
scclCalloc
(
&
comm
,
1
));
scclResult_t
scclNetIb
::
listen
(
int
dev
,
void
*
opaqueHandle
,
void
**
listenComm
)
{
memset
(
ibComm
,
0
,
sizeof
(
struct
scclIbListenComm
));
struct
scclIbHandle
*
handle
=
(
struct
scclIbHandle
*
)
opaqueHandle
;
// 静态断言,确保 scclIbHandle 结构体的大小不超过 SCCL_NET_HANDLE_MAXSIZE
static_assert
(
sizeof
(
struct
scclIbHandle
)
<
SCCL_NET_HANDLE_MAXSIZE
,
"scclIbHandle size too large"
);
// 将 handle 指向的内存区域清零,大小为 scclIbHandle 结构体的大小
memset
(
handle
,
0
,
sizeof
(
struct
scclIbHandle
));
// 设置设备和处理句柄
c
omm
->
dev
=
dev
;
ibC
omm
->
dev
=
dev
;
handle
->
magic
=
SCCL_SOCKET_MAGIC
;
SCCLCHECK
(
hos
t
::
scclSocketInit
(
&
c
omm
->
sock
,
&
scclIbIfAddr
,
handle
->
magic
,
hos
t
::
scclSocketTypeNetIb
,
NULL
,
1
));
SCCLCHECK
(
net_socke
t
::
scclSocketInit
(
&
ibC
omm
->
sock
,
&
scclIbIfAddr
,
handle
->
magic
,
net_socke
t
::
scclSocketTypeNetIb
,
NULL
,
1
));
// 如果启用了端口复用,则复用套接字地址和文件描述符
if
(
scclParamIbSockServerPortReuse
())
{
if
(
reusedSockfd
==
-
1
)
{
SCCLCHECK
(
scclSocketListen
(
&
c
omm
->
sock
));
memcpy
(
&
reusedAddr
,
&
c
omm
->
sock
.
addr
,
sizeof
(
union
hos
t
::
scclSocketAddress
));
reusedSockfd
=
c
omm
->
sock
.
fd
;
SCCLCHECK
(
scclSocketListen
(
&
ibC
omm
->
sock
));
memcpy
(
&
reusedAddr
,
&
ibC
omm
->
sock
.
addr
,
sizeof
(
union
net_socke
t
::
scclSocketAddress
));
reusedSockfd
=
ibC
omm
->
sock
.
fd
;
}
else
{
memcpy
(
&
c
omm
->
sock
.
addr
,
&
reusedAddr
,
sizeof
(
union
hos
t
::
scclSocketAddress
));
c
omm
->
sock
.
fd
=
reusedSockfd
;
memcpy
(
&
ibC
omm
->
sock
.
addr
,
&
reusedAddr
,
sizeof
(
union
net_socke
t
::
scclSocketAddress
));
ibC
omm
->
sock
.
fd
=
reusedSockfd
;
}
}
else
{
SCCLCHECK
(
hos
t
::
scclSocketListen
(
&
c
omm
->
sock
));
SCCLCHECK
(
net_socke
t
::
scclSocketListen
(
&
ibC
omm
->
sock
));
}
// 获取套接字地址并设置监听通信
SCCLCHECK
(
hos
t
::
scclSocketGetAddr
(
&
c
omm
->
sock
,
&
handle
->
connectAddr
));
*
listenComm
=
c
omm
;
SCCLCHECK
(
net_socke
t
::
scclSocketGetAddr
(
&
ibC
omm
->
sock
,
&
handle
->
connectAddr
));
*
listenComm
=
ibC
omm
;
return
scclSuccess
;
}
scclResult_t
scclIbConnect
(
int
dev
,
void
*
opaqueHandle
,
void
**
sendComm
)
{
/**
* @brief 建立IB网络连接并初始化通信资源
*
* 该函数负责完成以下操作:
* 1. 初始化socket连接
* 2. 创建IB QP队列对
* 3. 交换QP信息
* 4. 完成QP状态转换(RTR/RTS)
* 5. 注册内存区域
*
* @param dev 设备索引
* @param opaqueHandle 包含连接信息的句柄
* @param sendComm 输出参数,返回建立的发送通信上下文
* @return scclResult_t 返回操作结果状态码
*
* @note 该函数使用状态机模式处理异步连接过程
* @warning 不能重复连接已建立的sendComm
*/
scclResult_t
scclNetIb
::
connect
(
int
dev
,
void
*
opaqueHandle
,
void
**
sendComm
)
{
struct
scclIbHandle
*
handle
=
(
struct
scclIbHandle
*
)
opaqueHandle
;
struct
scclIbCommStage
*
stage
=
&
handle
->
stage
;
struct
scclIbSendComm
*
comm
=
(
struct
scclIbSendComm
*
)
stage
->
comm
;
...
...
@@ -1166,14 +1253,14 @@ scclResult_t scclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
}
SCCLCHECK
(
scclIbMalloc
((
void
**
)
&
comm
,
sizeof
(
struct
scclIbSendComm
)));
SCCLCHECK
(
hos
t
::
scclSocketInit
(
&
comm
->
sock
,
&
handle
->
connectAddr
,
handle
->
magic
,
hos
t
::
scclSocketTypeNetIb
,
NULL
,
1
));
SCCLCHECK
(
net_socke
t
::
scclSocketInit
(
&
comm
->
sock
,
&
handle
->
connectAddr
,
handle
->
magic
,
net_socke
t
::
scclSocketTypeNetIb
,
NULL
,
1
));
stage
->
comm
=
comm
;
stage
->
state
=
scclIbCommStateConnect
;
SCCLCHECK
(
hos
t
::
scclSocketConnect
(
&
comm
->
sock
,
scclParamIbSockClientPortReuse
()));
SCCLCHECK
(
net_socke
t
::
scclSocketConnect
(
&
comm
->
sock
,
scclParamIbSockClientPortReuse
()));
ib_connect_check:
/* since scclSocketConnect is async, we must check if connection is complete */
SCCLCHECK
(
hos
t
::
scclSocketReady
(
&
comm
->
sock
,
&
ready
));
SCCLCHECK
(
net_socke
t
::
scclSocketReady
(
&
comm
->
sock
,
&
ready
));
if
(
!
ready
)
return
scclSuccess
;
...
...
@@ -1292,7 +1379,7 @@ ib_send_ready:
* @param recvComm 输出参数,接收通信句柄
* @return scclResult_t 返回操作结果,成功返回scclSuccess
*/
scclResult_t
sccl
IbA
ccept
(
void
*
listenComm
,
void
**
recvComm
)
{
scclResult_t
sccl
NetIb
::
a
ccept
(
void
*
listenComm
,
void
**
recvComm
)
{
struct
scclIbListenComm
*
lComm
=
(
struct
scclIbListenComm
*
)
listenComm
;
struct
scclIbCommStage
*
stage
=
&
lComm
->
stage
;
struct
scclIbRecvComm
*
rComm
=
(
struct
scclIbRecvComm
*
)
stage
->
comm
;
...
...
@@ -1315,11 +1402,11 @@ scclResult_t scclIbAccept(void* listenComm, void** recvComm) {
SCCLCHECK
(
scclIbMalloc
((
void
**
)
&
rComm
,
sizeof
(
struct
scclIbRecvComm
)));
stage
->
comm
=
rComm
;
stage
->
state
=
scclIbCommStateAccept
;
SCCLCHECK
(
hos
t
::
scclSocketInit
(
&
rComm
->
sock
));
SCCLCHECK
(
hos
t
::
scclSocketAccept
(
&
rComm
->
sock
,
&
lComm
->
sock
));
SCCLCHECK
(
net_socke
t
::
scclSocketInit
(
&
rComm
->
sock
));
SCCLCHECK
(
net_socke
t
::
scclSocketAccept
(
&
rComm
->
sock
,
&
lComm
->
sock
));
ib_accept_check:
SCCLCHECK
(
hos
t
::
scclSocketReady
(
&
rComm
->
sock
,
&
ready
));
SCCLCHECK
(
net_socke
t
::
scclSocketReady
(
&
rComm
->
sock
,
&
ready
));
if
(
!
ready
)
return
scclSuccess
;
...
...
@@ -1329,7 +1416,7 @@ ib_accept_check:
SCCLCHECK
(
scclIbMalloc
((
void
**
)
&
stage
->
buffer
,
sizeof
(
remQpInfo
)));
ib_recv:
SCCLCHECK
(
hos
t
::
scclSocketProgress
(
SCCL_SOCKET_RECV
,
&
rComm
->
sock
,
stage
->
buffer
,
sizeof
(
remQpInfo
),
&
stage
->
offset
));
SCCLCHECK
(
net_socke
t
::
scclSocketProgress
(
SCCL_SOCKET_RECV
,
&
rComm
->
sock
,
stage
->
buffer
,
sizeof
(
remQpInfo
),
&
stage
->
offset
));
if
(
stage
->
offset
!=
sizeof
(
remQpInfo
))
return
scclSuccess
;
...
...
@@ -1416,7 +1503,7 @@ ib_recv:
memcpy
(
stage
->
buffer
,
&
qpInfo
,
sizeof
(
struct
scclIbQpInfo
));
ib_send:
SCCLCHECK
(
hos
t
::
scclSocketProgress
(
SCCL_SOCKET_SEND
,
&
rComm
->
sock
,
stage
->
buffer
,
sizeof
(
struct
scclIbQpInfo
),
&
stage
->
offset
));
SCCLCHECK
(
net_socke
t
::
scclSocketProgress
(
SCCL_SOCKET_SEND
,
&
rComm
->
sock
,
stage
->
buffer
,
sizeof
(
struct
scclIbQpInfo
),
&
stage
->
offset
));
if
(
stage
->
offset
<
sizeof
(
struct
scclIbQpInfo
))
return
scclSuccess
;
...
...
@@ -1424,7 +1511,7 @@ ib_send:
stage
->
state
=
scclIbCommStatePendingReady
;
ib_recv_ready:
SCCLCHECK
(
hos
t
::
scclSocketProgress
(
SCCL_SOCKET_RECV
,
&
rComm
->
sock
,
&
rComm
->
ready
,
sizeof
(
int
),
&
stage
->
offset
));
SCCLCHECK
(
net_socke
t
::
scclSocketProgress
(
SCCL_SOCKET_RECV
,
&
rComm
->
sock
,
&
rComm
->
ready
,
sizeof
(
int
),
&
stage
->
offset
));
if
(
stage
->
offset
!=
sizeof
(
int
))
return
scclSuccess
;
...
...
@@ -1440,7 +1527,7 @@ ib_recv_ready:
}
/* DMA-BUF support */
scclResult_t
sccl
IbR
egMrDmaBuf
(
void
*
comm
,
void
*
data
,
size_t
size
,
int
type
,
uint64_t
offset
,
int
fd
,
void
**
mhandle
)
{
scclResult_t
sccl
NetIb
::
r
egMrDmaBuf
(
void
*
comm
,
void
*
data
,
size_t
size
,
int
type
,
uint64_t
offset
,
int
fd
,
void
**
mhandle
)
{
static_assert
(
offsetof
(
struct
scclIbSendComm
,
verbs
)
==
offsetof
(
struct
scclIbRecvComm
,
verbs
),
"Send and recv comms must have verbs at the same offset"
);
assert
(
size
>
0
);
...
...
@@ -1498,11 +1585,21 @@ returning:
return
res
;
}
scclResult_t
sccl
IbR
egMr
(
void
*
comm
,
void
*
data
,
int
size
,
int
type
,
void
**
mhandle
)
{
return
scclIbR
egMrDmaBuf
(
comm
,
data
,
(
size_t
)
size
,
type
,
0ULL
,
-
1
,
mhandle
);
scclResult_t
sccl
NetIb
::
r
egMr
(
void
*
comm
,
void
*
data
,
int
size
,
int
type
,
void
**
mhandle
)
{
return
r
egMrDmaBuf
(
comm
,
data
,
(
size_t
)
size
,
type
,
0ULL
,
-
1
,
mhandle
);
}
scclResult_t
scclIbDeregMr
(
void
*
comm
,
void
*
mhandle
)
{
/**
* @brief 注销IB内存区域(MR)
*
* 该函数用于注销指定的IB内存区域(MR),并更新MR缓存。如果MR的引用计数减至0,
* 则从缓存中移除并调用ibv_dereg_mr释放资源。
*
* @param comm 通信上下文指针
* @param mhandle 要注销的内存区域句柄
* @return scclResult_t 返回操作结果(scclSuccess表示成功)
*/
scclResult_t
scclNetIb
::
deregMr
(
void
*
comm
,
void
*
mhandle
)
{
struct
scclIbVerbs
*
verbs
=
(
struct
scclIbVerbs
*
)
comm
;
struct
scclIbMrCache
*
cache
=
&
scclIbDevs
[
verbs
->
dev
].
mrCache
;
scclResult_t
res
;
...
...
@@ -1529,10 +1626,10 @@ returning:
return
res
;
}
scclResult_t
sccl
IbI
send
(
void
*
sendComm
,
void
*
data
,
int
size
,
int
tag
,
void
*
mhandle
,
void
**
request
)
{
scclResult_t
sccl
NetIb
::
i
send
(
void
*
sendComm
,
void
*
data
,
int
size
,
int
tag
,
void
*
mhandle
,
void
**
request
)
{
struct
scclIbSendComm
*
comm
=
(
struct
scclIbSendComm
*
)
sendComm
;
if
(
comm
->
ready
==
0
)
{
WARN
(
"NET/IB:
scclIbI
send() called when comm->ready == 0"
);
WARN
(
"NET/IB:
i
send() called when comm->ready == 0"
);
return
scclInternalError
;
}
if
(
comm
->
ready
==
0
)
{
...
...
@@ -1567,26 +1664,26 @@ scclResult_t scclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
// Sanity checks to catch user collective call count/size mismatches
if
(
size
>
slots
[
r
].
size
)
{
char
line
[
SOCKET_NAME_MAXLEN
+
1
];
union
hos
t
::
scclSocketAddress
addr
;
hos
t
::
scclSocketGetAddr
(
&
comm
->
sock
,
&
addr
);
union
net_socke
t
::
scclSocketAddress
addr
;
net_socke
t
::
scclSocketGetAddr
(
&
comm
->
sock
,
&
addr
);
WARN
(
"NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d"
,
r
,
nreqs
,
tag
,
hos
t
::
scclSocketToString
(
&
addr
,
line
),
net_socke
t
::
scclSocketToString
(
&
addr
,
line
),
size
,
slots
[
r
].
size
);
return
scclInvalidUsage
;
}
// plus any potential programming errors
else
if
(
slots
[
r
].
size
<
0
||
slots
[
r
].
addr
==
0
||
slots
[
r
].
rkey
==
0
)
{
char
line
[
SOCKET_NAME_MAXLEN
+
1
];
union
hos
t
::
scclSocketAddress
addr
;
hos
t
::
scclSocketGetAddr
(
&
comm
->
sock
,
&
addr
);
union
net_socke
t
::
scclSocketAddress
addr
;
net_socke
t
::
scclSocketGetAddr
(
&
comm
->
sock
,
&
addr
);
WARN
(
"NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x"
,
r
,
nreqs
,
tag
,
hos
t
::
scclSocketToString
(
&
addr
,
line
),
net_socke
t
::
scclSocketToString
(
&
addr
,
line
),
slots
[
r
].
size
,
slots
[
r
].
addr
,
slots
[
r
].
rkey
);
...
...
@@ -1626,10 +1723,10 @@ scclResult_t scclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
return
scclSuccess
;
}
scclResult_t
sccl
IbI
recv
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
int
*
tags
,
void
**
mhandles
,
void
**
request
)
{
scclResult_t
sccl
NetIb
::
i
recv
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
int
*
tags
,
void
**
mhandles
,
void
**
request
)
{
struct
scclIbRecvComm
*
comm
=
(
struct
scclIbRecvComm
*
)
recvComm
;
if
(
comm
->
ready
==
0
)
{
WARN
(
"NET/IB:
scclIbI
recv() called when comm->ready == 0"
);
WARN
(
"NET/IB:
i
recv() called when comm->ready == 0"
);
return
scclInternalError
;
}
if
(
comm
->
ready
==
0
)
{
...
...
@@ -1672,7 +1769,7 @@ scclResult_t scclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
return
scclSuccess
;
}
scclResult_t
sccl
IbI
flush
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
void
**
mhandles
,
void
**
request
)
{
scclResult_t
sccl
NetIb
::
i
flush
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
void
**
mhandles
,
void
**
request
)
{
struct
scclIbRecvComm
*
comm
=
(
struct
scclIbRecvComm
*
)
recvComm
;
int
last
=
-
1
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
...
...
@@ -1706,7 +1803,7 @@ scclResult_t scclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
return
scclSuccess
;
}
scclResult_t
sccl
IbT
est
(
void
*
request
,
int
*
done
,
int
*
sizes
)
{
scclResult_t
sccl
NetIb
::
t
est
(
void
*
request
,
int
*
done
,
int
*
sizes
)
{
struct
scclIbRequest
*
r
=
(
struct
scclIbRequest
*
)
request
;
*
done
=
0
;
...
...
@@ -1732,8 +1829,8 @@ scclResult_t scclIbTest(void* request, int* done, int* sizes) {
struct
ibv_wc
*
wc
=
wcs
+
w
;
if
(
wc
->
status
!=
IBV_WC_SUCCESS
)
{
char
line
[
SOCKET_NAME_MAXLEN
+
1
];
union
hos
t
::
scclSocketAddress
addr
;
hos
t
::
scclSocketGetAddr
(
r
->
sock
,
&
addr
);
union
net_socke
t
::
scclSocketAddress
addr
;
net_socke
t
::
scclSocketGetAddr
(
r
->
sock
,
&
addr
);
char
localGidString
[
INET6_ADDRSTRLEN
]
=
""
;
char
remoteGidString
[
INET6_ADDRSTRLEN
]
=
""
;
const
char
*
localGidStr
=
NULL
,
*
remoteGidStr
=
NULL
;
...
...
@@ -1742,7 +1839,7 @@ scclResult_t scclIbTest(void* request, int* done, int* sizes) {
remoteGidStr
=
inet_ntop
(
AF_INET6
,
&
r
->
gidInfo
->
remoteGid
,
remoteGidString
,
sizeof
(
remoteGidString
));
}
WARN
(
"NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d (%s)%s%s%s%s"
,
hos
t
::
scclSocketToString
(
&
addr
,
line
),
net_socke
t
::
scclSocketToString
(
&
addr
,
line
),
wc
->
status
,
wc
->
opcode
,
wc
->
byte_len
,
...
...
@@ -1782,10 +1879,10 @@ scclResult_t scclIbTest(void* request, int* done, int* sizes) {
}
}
scclResult_t
sccl
IbC
loseSend
(
void
*
sendComm
)
{
scclResult_t
sccl
NetIb
::
c
loseSend
(
void
*
sendComm
)
{
struct
scclIbSendComm
*
comm
=
(
struct
scclIbSendComm
*
)
sendComm
;
if
(
comm
)
{
SCCLCHECK
(
hos
t
::
scclSocketClose
(
&
comm
->
sock
));
SCCLCHECK
(
net_socke
t
::
scclSocketClose
(
&
comm
->
sock
));
for
(
int
q
=
0
;
q
<
comm
->
nqps
;
q
++
)
if
(
comm
->
qps
[
q
]
!=
NULL
)
SCCLCHECK
(
wrap_ibv_destroy_qp
(
comm
->
qps
[
q
]));
...
...
@@ -1797,11 +1894,11 @@ scclResult_t scclIbCloseSend(void* sendComm) {
return
scclSuccess
;
}
scclResult_t
sccl
IbC
loseRecv
(
void
*
recvComm
)
{
scclResult_t
sccl
NetIb
::
c
loseRecv
(
void
*
recvComm
)
{
struct
scclIbRecvComm
*
comm
=
(
struct
scclIbRecvComm
*
)
recvComm
;
if
(
comm
)
{
if
(
!
scclParamIbSockServerPortReuse
()
||
reusedSockfd
!=
comm
->
sock
.
fd
)
SCCLCHECK
(
hos
t
::
scclSocketClose
(
&
comm
->
sock
));
SCCLCHECK
(
net_socke
t
::
scclSocketClose
(
&
comm
->
sock
));
for
(
int
q
=
0
;
q
<
comm
->
nqps
;
q
++
)
if
(
comm
->
qps
[
q
]
!=
NULL
)
SCCLCHECK
(
wrap_ibv_destroy_qp
(
comm
->
qps
[
q
]));
...
...
@@ -1819,36 +1916,16 @@ scclResult_t scclIbCloseRecv(void* recvComm) {
return
scclSuccess
;
}
scclResult_t
sccl
IbC
loseListen
(
void
*
listenComm
)
{
scclResult_t
sccl
NetIb
::
c
loseListen
(
void
*
listenComm
)
{
struct
scclIbListenComm
*
comm
=
(
struct
scclIbListenComm
*
)
listenComm
;
if
(
comm
)
{
SCCLCHECK
(
hos
t
::
scclSocketClose
(
&
comm
->
sock
));
SCCLCHECK
(
net_socke
t
::
scclSocketClose
(
&
comm
->
sock
));
free
(
comm
);
}
return
scclSuccess
;
}
}
// namespace net_ib
scclNet_t
scclNetIb
=
{
"IB"
,
net_ib
::
scclIbInit
,
net_ib
::
scclIbGetDevicesNum
,
net_ib
::
scclIbGetProperties
,
net_ib
::
scclIbListen
,
net_ib
::
scclIbConnect
,
net_ib
::
scclIbAccept
,
net_ib
::
scclIbRegMr
,
net_ib
::
scclIbRegMrDmaBuf
,
net_ib
::
scclIbDeregMr
,
net_ib
::
scclIbIsend
,
net_ib
::
scclIbIrecv
,
net_ib
::
scclIbIflush
,
net_ib
::
scclIbTest
,
net_ib
::
scclIbCloseSend
,
net_ib
::
scclIbCloseRecv
,
net_ib
::
scclIbCloseListen
};
}
// namespace device
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/net_ib/net_ib.h
0 → 100644
View file @
a4ac3320
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "ibvwrap.h"
#include "socket.h"
#include "net_utils.h"
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
net_ib
{
/* Connection-establishment states of an IB communicator. */
enum scclIbCommState : uint8_t {
    scclIbCommStateStart = 0,        // initial state
    scclIbCommStateConnect = 1,      // attempting to connect
    scclIbCommStateAccept = 3,       // accepting a connection
    scclIbCommStateSend = 4,         // sending data
    scclIbCommStateRecv = 5,         // receiving data
    scclIbCommStateConnecting = 6,   // connection in progress
    scclIbCommStateConnected = 7,    // connected
    scclIbCommStatePendingReady = 8, // waiting to become ready
};
/* Progress record of a communication stage, kept so a multi-step
 * operation can be resumed where it left off. */
struct scclIbCommStage {
    enum scclIbCommState state; // state of this communication stage
    int offset;                 // data offset
    void* buffer;               // buffer used for the communication
    void* comm;                 // communication object
};
/* Context of a listening communicator. */
struct scclIbListenComm {
    int dev;                            // device identifier
    struct net_socket::scclSocket sock; // socket used for network communication
    struct scclIbCommStage stage;       // state of the communication stage
};
//////////////////////////////////
// InfiniBand network transport, implementing the scclNetBase interface.
class scclNetIb : public scclNetBase {
public:
    // Constructor and destructor.
    scclNetIb();
    virtual ~scclNetIb();

    // Initialize the network.
    scclResult_t init() override;

    // Return the number of adapters.
    scclResult_t devices(int* ndev) override;

    // Get various device properties.
    scclResult_t getProperties(int dev, scclNetProperties_t* props) override;

    // Create a receiving object and provide a handle to connect to it. The handle can be up to
    // SCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged between ranks to create a connection.
    scclResult_t listen(int dev, void* handle, void** listenComm) override;

    // Connect to a handle and return a sending comm object for that peer.
    // This call must not block for the connection to be established; it should instead return
    // successfully with sendComm == NULL, expecting to be called again until sendComm != NULL.
    scclResult_t connect(int dev, void* handle, void** sendComm) override;

    // Finalize connection establishment after the remote peer has called connect.
    // This call must not block for the connection to be established; it should instead return
    // successfully with recvComm == NULL, expecting to be called again until recvComm != NULL.
    scclResult_t accept(void* listenComm, void** recvComm) override;

    // Register/deregister memory. Comm can be either a sendComm or a recvComm.
    // Type is either SCCL_PTR_HOST or SCCL_PTR_CUDA.
    scclResult_t regMr(void* comm, void* data, int size, int type, void** mhandle) override;

    /* DMA-BUF support */
    scclResult_t regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) override;

    // Deregister an IB memory region (MR).
    scclResult_t deregMr(void* comm, void* mhandle) override;

    // Asynchronous send to a peer.
    // May return request == NULL if the call cannot be performed (or would block).
    scclResult_t isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) override;

    // Asynchronous receive from a peer. May return request == NULL if the call cannot be performed (or would block).
    scclResult_t irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) override;

    // Perform a flush/fence to make sure all data received with SCCL_PTR_CUDA is visible to the GPU.
    scclResult_t iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) override;

    // Test whether a request is complete. If size is not NULL, it returns the number of bytes sent/received.
    scclResult_t test(void* request, int* done, int* sizes) override;

    // Close and free send/recv comm objects.
    scclResult_t closeSend(void* sendComm) override;
    scclResult_t closeRecv(void* recvComm) override;
    scclResult_t closeListen(void* listenComm) override;

private:
    struct scclIbListenComm* ibComm = nullptr;
    // Number of InfiniBand devices. This is a per-instance member (not a static), starting at -1.
    int scclNIbDevs = -1;

private:
    // Main function of the IB asynchronous-event handling thread.
    static void* scclIbAsyncThreadMain(void* args);

    // Get the PCI path of an IB device, handling the merging of multi-port and virtual functions.
    scclResult_t scclIbGetPciPath(char* devName, char** path, int* realPort);

    // Return the IB (InfiniBand) link-width index that corresponds to the given width value.
    int scclIbWidth(int width);

    // Look up and return the IB transfer rate that corresponds to the given speed value.
    int scclIbSpeed(int speed);

    // Check whether the current IB device supports Relaxed Ordering mode.
    int scclIbRelaxedOrderingCapable(void);

    // Fetch and process the environment variable with the user-specified IB devices.
    char* scclIbGetIbHca(int& shownIbHcaEnv, bool* searchNot, bool* searchExact);

    // Read a string from a system file.
    scclResult_t scclGetStrFromSys(const char* path, const char* fileName, char* strValue);

    // Check whether an IB device supports GPU Direct RDMA (GDR).
    scclResult_t scclIbGdrSupport(int ibDev);

    // Check whether a device supports DMA-BUF.
    scclResult_t scclIbDmaBufSupport(int dev);

    // Initialize InfiniBand Verbs resources.
    scclResult_t scclIbInitVerbs(int dev, struct ibv_context* ctx, struct scclIbVerbs* verbs);

    // Create and initialize an InfiniBand queue pair (QP).
    scclResult_t scclIbCreateQp(uint8_t ib_port, struct scclIbVerbs* verbs, int access_flags, struct ibv_qp** qp);

    // Move an IB QP to the RTR (Ready to Receive) state.
    scclResult_t scclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct scclIbQpInfo* info);

    // Move an IB QP to the RTS (Ready To Send) state.
    scclResult_t scclIbRtsQp(struct ibv_qp* qp);

    // Destroy IB Verbs resources.
    scclResult_t scclIbDestroyVerbs(struct scclIbVerbs* verbs);

    // Get an unused request structure from the verbs request pool.
    scclResult_t scclIbGetRequest(struct scclIbVerbs* verbs, struct scclIbRequest** req);

    // Release an IB network request.
    scclResult_t scclIbFreeRequest(struct scclIbRequest* r);

    // Perform an IB multi-send operation.
    scclResult_t scclIbMultiSend(struct scclIbSendComm* comm, int slot);

    // Post data to the remote FIFO queue through an IB Verbs RDMA write.
    scclResult_t scclIbPostFifo(struct scclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct scclIbRequest* req);
};
}
// namespace net_ib
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/
hos
t/CMakeLists.txt
→
src/hardware/net/
net_socke
t/CMakeLists.txt
View file @
a4ac3320
File moved
src/hardware/net/
hos
t/net_socket.cpp
→
src/hardware/net/
net_socke
t/net_socket.cpp
View file @
a4ac3320
...
...
@@ -9,8 +9,6 @@
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
host
{
namespace
net_socket
{
#define MAX_LINE_LEN (2047)
...
...
@@ -26,7 +24,28 @@ static struct scclNetSocketDev scclNetSocketDevs[MAX_IFS];
pthread_mutex_t
scclNetSocketLock
=
PTHREAD_MUTEX_INITIALIZER
;
static
scclResult_t
scclNetSocketGetPciPath
(
char
*
devName
,
char
**
pciPath
)
{
SCCL_PARAM
(
SocketNsocksPerThread
,
"NSOCKS_PERTHREAD"
,
-
2
);
SCCL_PARAM
(
SocketNthreads
,
"SOCKET_NTHREADS"
,
-
2
);
////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// scclNetSocket调用的函数 ////////////////////////////////////////
// Construct the socket transport; "Socket" is the name passed to the scclNetBase constructor.
scclNetSocket::scclNetSocket() : scclNetBase("Socket") {}
// Release the listen-comm storage owned by this object.
scclNetSocket::~scclNetSocket() {
    // free() accepts a null pointer and does nothing in that case,
    // so no explicit null test is required.
    free(socketComm);
}
/**
* 获取网络设备的PCI路径
*
* @param devName 网络设备名称
* @param pciPath 输出参数,用于存储PCI路径的指针
* @return 返回操作结果(scclSuccess表示成功)
*
* @note 如果设备不存在,pciPath可能返回NULL
*/
scclResult_t
scclNetSocket
::
scclNetSocketGetPciPath
(
char
*
devName
,
char
**
pciPath
)
{
char
devicePath
[
PATH_MAX
];
snprintf
(
devicePath
,
PATH_MAX
,
"/sys/class/net/%s/device"
,
devName
);
// May return NULL if the file doesn't exist.
...
...
@@ -34,7 +53,9 @@ static scclResult_t scclNetSocketGetPciPath(char* devName, char** pciPath) {
return
scclSuccess
;
}
scclResult_t
scclNetSocketInit
(
void
)
{
scclResult_t
scclNetSocket
::
init
()
{
SCCLCHECK
(
scclMalloc
(
&
socketComm
,
1
));
if
(
scclNetIfs
==
-
1
)
{
pthread_mutex_lock
(
&
scclNetSocketLock
);
if
(
scclNetIfs
==
-
1
)
{
...
...
@@ -69,12 +90,22 @@ scclResult_t scclNetSocketInit(void) {
return
scclSuccess
;
}
scclResult_t
scclNetSocket
D
evices
(
int
*
ndev
)
{
scclResult_t
scclNetSocket
::
d
evices
(
int
*
ndev
)
{
*
ndev
=
scclNetIfs
;
return
scclSuccess
;
}
static
scclResult_t
scclNetSocketGetSpeed
(
char
*
devName
,
int
*
speed
)
{
/**
* @brief 获取指定网络设备的速度(单位:Mbps)
*
* 该函数通过读取/sys/class/net/<设备名>/speed文件来获取网络设备的速度。
* 如果读取失败或速度为0,则默认返回10Gbps(10000Mbps)。
*
* @param devName 网络设备名称
* @param speed 输出参数,用于存储获取到的速度值
* @return scclResult_t 始终返回scclSuccess表示成功
*/
scclResult_t
scclNetSocket
::
scclNetSocketGetSpeed
(
char
*
devName
,
int
*
speed
)
{
*
speed
=
0
;
char
speedPath
[
PATH_MAX
];
sprintf
(
speedPath
,
"/sys/class/net/%s/speed"
,
devName
);
...
...
@@ -93,7 +124,17 @@ static scclResult_t scclNetSocketGetSpeed(char* devName, int* speed) {
return
scclSuccess
;
}
scclResult_t
scclNetSocketGetProperties
(
int
dev
,
scclNetProperties_t
*
props
)
{
/**
* @brief 获取网络套接字设备的属性
*
* @param dev 设备索引
* @param props 用于存储设备属性的结构体指针
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*
* 该函数用于填充指定网络设备的属性信息,包括设备名称、PCI路径、速度等。
* 注意:延迟(latency)和端口(port)属性当前未设置。
*/
scclResult_t
scclNetSocket
::
getProperties
(
int
dev
,
scclNetProperties_t
*
props
)
{
props
->
name
=
scclNetSocketDevs
[
dev
].
devName
;
props
->
pciPath
=
scclNetSocketDevs
[
dev
].
pciPath
;
props
->
guid
=
dev
;
...
...
@@ -106,97 +147,19 @@ scclResult_t scclNetSocketGetProperties(int dev, scclNetProperties_t* props) {
return
scclSuccess
;
}
/* Communication functions */
#define MAX_SOCKETS 64
#define MAX_THREADS 16
#define MAX_REQUESTS SCCL_NET_MAX_REQUESTS
#define MIN_CHUNKSIZE (64 * 1024)
SCCL_PARAM
(
SocketNsocksPerThread
,
"NSOCKS_PERTHREAD"
,
-
2
);
SCCL_PARAM
(
SocketNthreads
,
"SOCKET_NTHREADS"
,
-
2
);
enum
scclNetSocketCommState
:
uint8_t
{
scclNetSocketCommStateStart
=
0
,
scclNetSocketCommStateConnect
=
1
,
scclNetSocketCommStateAccept
=
3
,
scclNetSocketCommStateSend
=
4
,
scclNetSocketCommStateRecv
=
5
,
};
struct
scclNetSocketCommStage
{
enum
scclNetSocketCommState
state
;
uint8_t
iteration
;
struct
scclSocket
*
sock
;
struct
scclNetSocketComm
*
comm
;
};
struct
scclNetSocketHandle
{
union
scclSocketAddress
connectAddr
;
uint64_t
magic
;
// random number to help debugging
int
nSocks
;
int
nThreads
;
struct
scclNetSocketCommStage
stage
;
};
struct
scclNetSocketTask
{
int
op
;
void
*
data
;
int
size
;
struct
scclSocket
*
sock
;
int
offset
;
int
used
;
scclResult_t
result
;
};
struct
scclNetSocketRequest
{
int
op
;
void
*
data
;
int
size
;
struct
scclSocket
*
ctrlSock
;
int
offset
;
int
used
;
struct
scclNetSocketComm
*
comm
;
struct
scclNetSocketTask
*
tasks
[
MAX_SOCKETS
];
int
nSubs
;
};
struct
scclNetSocketTaskQueue
{
int
next
;
int
len
;
struct
scclNetSocketTask
*
tasks
;
};
struct
scclNetSocketThreadResources
{
struct
scclNetSocketTaskQueue
threadTaskQueue
;
int
stop
;
struct
scclNetSocketComm
*
comm
;
pthread_mutex_t
threadLock
;
pthread_cond_t
threadCond
;
};
struct
scclNetSocketListenComm
{
struct
scclSocket
sock
;
struct
scclNetSocketCommStage
stage
;
int
nSocks
;
int
nThreads
;
int
dev
;
};
struct
scclNetSocketComm
{
struct
scclSocket
ctrlSock
;
struct
scclSocket
socks
[
MAX_SOCKETS
];
int
dev
;
int
cudaDev
;
int
nSocks
;
int
nThreads
;
int
nextSock
;
struct
scclNetSocketRequest
requests
[
MAX_REQUESTS
];
pthread_t
helperThread
[
MAX_THREADS
];
struct
scclNetSocketThreadResources
threadResources
[
MAX_THREADS
];
};
void
*
persistentSocketThread
(
void
*
args_
)
{
/**
* @brief 持久化socket线程处理函数
*
* 该线程持续处理socket任务队列中的任务,每个线程负责处理nSocksPerThread个socket。
* 当任务队列为空时,线程会等待条件变量通知;当收到停止信号时,线程退出。
*
* @param args_ 线程参数,包含通信结构、任务队列和同步原语
* @return void* 总是返回NULL
*
* @note 线程会循环处理任务直到收到停止信号
* @warning 如果socket处理出错,线程会直接退出并打印警告信息
*/
void
*
scclNetSocket
::
persistentSocketThread
(
void
*
args_
)
{
struct
scclNetSocketThreadResources
*
resource
=
(
struct
scclNetSocketThreadResources
*
)
args_
;
struct
scclNetSocketComm
*
comm
=
resource
->
comm
;
struct
scclNetSocketTaskQueue
*
myQueue
=
&
resource
->
threadTaskQueue
;
...
...
@@ -235,7 +198,18 @@ void* persistentSocketThread(void* args_) {
}
}
scclResult_t
scclNetSocketGetNsockNthread
(
int
dev
,
int
*
ns
,
int
*
nt
)
{
/**
* @brief 获取指定设备的socket和线程数量配置
*
* 根据设备类型和参数配置,自动检测或设置每个线程的socket数量和线程数量。
* 支持AWS和GCP设备的自动检测,并确保配置不超过最大限制。
*
* @param dev 设备索引
* @param ns 输出参数,返回总socket数量
* @param nt 输出参数,返回线程数量
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
scclResult_t
scclNetSocket
::
scclNetSocketGetNsockNthread
(
int
dev
,
int
*
ns
,
int
*
nt
)
{
int
nSocksPerThread
=
scclParamSocketNsocksPerThread
();
int
nThreads
=
scclParamSocketNthreads
();
if
(
nThreads
>
MAX_THREADS
)
{
...
...
@@ -287,28 +261,28 @@ scclResult_t scclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
return
scclSuccess
;
}
scclResult_t
scclNetSocket
L
isten
(
int
dev
,
void
*
opaqueHandle
,
void
**
listenComm
)
{
scclResult_t
scclNetSocket
::
l
isten
(
int
dev
,
void
*
opaqueHandle
,
void
**
listenComm
)
{
if
(
dev
<
0
||
dev
>=
scclNetIfs
)
{
// data transfer socket is based on specified dev
return
scclInternalError
;
}
struct
scclNetSocketHandle
*
handle
=
(
struct
scclNetSocketHandle
*
)
opaqueHandle
;
memset
(
handle
,
0
,
sizeof
(
struct
scclNetSocketHandle
));
static_assert
(
sizeof
(
struct
scclNetSocketHandle
)
<=
SCCL_NET_HANDLE_MAXSIZE
,
"scclNetSocketHandle size too large"
);
struct
scclNetSocketListenComm
*
comm
;
SCCLCHECK
(
scclCalloc
(
&
comm
,
1
));
memset
(
socketComm
,
0
,
sizeof
(
struct
scclNetSocketListenComm
))
;
handle
->
magic
=
SCCL_SOCKET_MAGIC
;
SCCLCHECK
(
scclSocketInit
(
&
c
omm
->
sock
,
&
scclNetSocketDevs
[
dev
].
addr
,
handle
->
magic
,
scclSocketTypeNetSocket
,
NULL
,
1
));
SCCLCHECK
(
scclSocketListen
(
&
c
omm
->
sock
));
SCCLCHECK
(
scclSocketGetAddr
(
&
c
omm
->
sock
,
&
handle
->
connectAddr
));
SCCLCHECK
(
scclNetSocketGetNsockNthread
(
dev
,
&
c
omm
->
nSocks
,
&
c
omm
->
nThreads
));
handle
->
nSocks
=
c
omm
->
nSocks
;
handle
->
nThreads
=
c
omm
->
nThreads
;
c
omm
->
dev
=
dev
;
*
listenComm
=
c
omm
;
SCCLCHECK
(
scclSocketInit
(
&
socketC
omm
->
sock
,
&
scclNetSocketDevs
[
dev
].
addr
,
handle
->
magic
,
scclSocketTypeNetSocket
,
NULL
,
1
));
SCCLCHECK
(
scclSocketListen
(
&
socketC
omm
->
sock
));
SCCLCHECK
(
scclSocketGetAddr
(
&
socketC
omm
->
sock
,
&
handle
->
connectAddr
));
SCCLCHECK
(
scclNetSocketGetNsockNthread
(
dev
,
&
socketC
omm
->
nSocks
,
&
socketC
omm
->
nThreads
));
handle
->
nSocks
=
socketC
omm
->
nSocks
;
handle
->
nThreads
=
socketC
omm
->
nThreads
;
socketC
omm
->
dev
=
dev
;
*
listenComm
=
socketC
omm
;
return
scclSuccess
;
}
scclResult_t
scclNetSocket
C
onnect
(
int
dev
,
void
*
opaqueHandle
,
void
**
sendComm
)
{
scclResult_t
scclNetSocket
::
c
onnect
(
int
dev
,
void
*
opaqueHandle
,
void
**
sendComm
)
{
if
(
dev
<
0
||
dev
>=
scclNetIfs
)
{
// data transfer socket is based on specified dev
return
scclInternalError
;
}
...
...
@@ -331,7 +305,7 @@ scclResult_t scclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm)
comm
->
nSocks
=
handle
->
nSocks
;
comm
->
nThreads
=
handle
->
nThreads
;
comm
->
dev
=
dev
;
HIPCHECK
(
hipGetDevice
(
&
comm
->
cuda
Dev
));
HIPCHECK
(
hipGetDevice
(
&
comm
->
hip
Dev
));
for
(;
i
<
comm
->
nSocks
+
1
;
i
++
)
{
sock
=
(
i
==
comm
->
nSocks
)
?
&
comm
->
ctrlSock
:
comm
->
socks
+
i
;
SCCLCHECK
(
scclSocketInit
(
sock
,
&
handle
->
connectAddr
,
handle
->
magic
,
scclSocketTypeNetSocket
,
NULL
,
1
));
...
...
@@ -357,7 +331,7 @@ scclResult_t scclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm)
return
scclSuccess
;
}
scclResult_t
scclNetSocket
A
ccept
(
void
*
listenComm
,
void
**
recvComm
)
{
scclResult_t
scclNetSocket
::
a
ccept
(
void
*
listenComm
,
void
**
recvComm
)
{
struct
scclNetSocketListenComm
*
lComm
=
(
struct
scclNetSocketListenComm
*
)
listenComm
;
struct
scclNetSocketCommStage
*
stage
=
&
lComm
->
stage
;
struct
scclNetSocketComm
*
rComm
=
stage
->
comm
;
...
...
@@ -376,7 +350,7 @@ scclResult_t scclNetSocketAccept(void* listenComm, void** recvComm) {
rComm
->
nSocks
=
lComm
->
nSocks
;
rComm
->
nThreads
=
lComm
->
nThreads
;
rComm
->
dev
=
lComm
->
dev
;
HIPCHECK
(
hipGetDevice
(
&
rComm
->
cuda
Dev
));
HIPCHECK
(
hipGetDevice
(
&
rComm
->
hip
Dev
));
for
(;
i
<
rComm
->
nSocks
+
1
;
i
++
)
{
uint8_t
sendSockIdx
;
...
...
@@ -434,7 +408,51 @@ scclResult_t scclNetSocketGetRequest(struct scclNetSocketComm* comm, int op, voi
return
scclInternalError
;
}
scclResult_t
scclNetSocketGetTask
(
struct
scclNetSocketComm
*
comm
,
int
op
,
void
*
data
,
int
size
,
struct
scclNetSocketTask
**
req
)
{
/**
 * @brief Register a memory region with the socket transport.
 *
 * No registration work is performed; the call only validates the pointer type.
 *
 * @param comm    unused
 * @param data    unused
 * @param size    unused
 * @param type    pointer type; only SCCL_PTR_HOST is accepted
 * @param mhandle unused (left untouched)
 * @return scclSuccess for SCCL_PTR_HOST, scclInternalError for any other type
 */
scclResult_t scclNetSocket::regMr(void* comm, void* data, int size, int type, void** mhandle) {
    if(type == SCCL_PTR_HOST) {
        return scclSuccess;
    }
    return scclInternalError;
}
/**
 * @brief DMA-BUF registration entry point for the socket transport.
 *
 * Performs no registration: it only emits a warning and reports success.
 * None of the arguments are read and mhandle is left untouched.
 *
 * @return always scclSuccess
 */
scclResult_t scclNetSocket::regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
    WARN("NET/Socket : unable to check DMA-BUF support");
    return scclSuccess;
}
/**
 * @brief Deregister a memory handle.
 *
 * Nothing to release for the socket transport; both arguments are ignored.
 *
 * @return always scclSuccess
 */
scclResult_t scclNetSocket::deregMr(void* comm, void* mhandle) {
    return scclSuccess;
}
/**
 * @brief Start an asynchronous send on a socket comm.
 *
 * Obtains a SCCL_SOCKET_SEND request for the buffer via scclNetSocketGetRequest
 * and stores it in *request.
 *
 * @param sendComm comm object, cast to scclNetSocketComm
 * @param data     buffer to send
 * @param size     number of bytes to send
 * @param tag      unused by this transport
 * @param mhandle  unused by this transport
 * @param request  [out] receives the request object
 * @return scclSuccess once the request has been obtained
 */
scclResult_t scclNetSocket::isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
    struct scclNetSocketComm* comm = (struct scclNetSocketComm*)sendComm;
    // NOTE(review): SCCLCHECK presumably returns early on a non-success result - confirm against its definition.
    SCCLCHECK(scclNetSocketGetRequest(comm, SCCL_SOCKET_SEND, data, size, (struct scclNetSocketRequest**)request));
    return scclSuccess;
}
/**
 * @brief Start an asynchronous receive on a socket comm.
 *
 * Only a single receive per call is supported: any n other than 1 is rejected.
 * A SCCL_SOCKET_RECV request for data[0]/sizes[0] is obtained via
 * scclNetSocketGetRequest and stored in *request.
 *
 * @param recvComm comm object, cast to scclNetSocketComm
 * @param n        number of receives; must be 1
 * @param data     array of destination buffers (only element 0 is used)
 * @param sizes    array of buffer sizes (only element 0 is used)
 * @param tags     unused by this transport
 * @param mhandles unused by this transport
 * @param request  [out] receives the request object
 * @return scclInternalError if n != 1, otherwise scclSuccess once the request has been obtained
 */
scclResult_t scclNetSocket::irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
    struct scclNetSocketComm* comm = (struct scclNetSocketComm*)recvComm;
    if(n != 1)
        return scclInternalError;
    // NOTE(review): SCCLCHECK presumably returns early on a non-success result - confirm against its definition.
    SCCLCHECK(scclNetSocketGetRequest(comm, SCCL_SOCKET_RECV, data[0], sizes[0], (struct scclNetSocketRequest**)request));
    return scclSuccess;
}
/**
 * @brief Flush entry point for the socket transport.
 *
 * Not supported here: every call fails and no argument is read.
 *
 * @return always scclInternalError
 */
scclResult_t scclNetSocket::iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
    // We don't support HIP pointers, so we don't need a flush operation
    return scclInternalError;
}
/**
* 为指定通信对象创建并获取一个网络套接字任务
*
* @param comm 网络套接字通信对象指针
* @param op 操作类型(SCCL_SOCKET_SEND/SCCL_SOCKET_RECV)
* @param data 任务数据缓冲区指针
* @param size 数据大小
* @param req [out] 返回创建的任务指针
*
* @return 成功返回scclSuccess,失败返回scclInternalError
*
* @note 该函数会初始化线程资源(首次调用时),创建持久化线程处理任务队列
* @warning 当任务队列已满时会返回错误并打印警告
*/
scclResult_t
scclNetSocket
::
scclNetSocketGetTask
(
struct
scclNetSocketComm
*
comm
,
int
op
,
void
*
data
,
int
size
,
struct
scclNetSocketTask
**
req
)
{
int
tid
=
comm
->
nextSock
%
comm
->
nThreads
;
struct
scclNetSocketThreadResources
*
res
=
comm
->
threadResources
+
tid
;
struct
scclNetSocketTaskQueue
*
queue
=
&
res
->
threadTaskQueue
;
...
...
@@ -450,7 +468,7 @@ scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void*
pthread_mutex_init
(
&
res
->
threadLock
,
NULL
);
pthread_cond_init
(
&
res
->
threadCond
,
NULL
);
pthread_create
(
comm
->
helperThread
+
tid
,
NULL
,
persistentSocketThread
,
res
);
scclSetThreadName
(
comm
->
helperThread
[
tid
],
"SCCL Sock%c%1u%2u%2u"
,
op
==
SCCL_SOCKET_SEND
?
'S'
:
'R'
,
comm
->
dev
,
tid
,
comm
->
cuda
Dev
);
scclSetThreadName
(
comm
->
helperThread
[
tid
],
"SCCL Sock%c%1u%2u%2u"
,
op
==
SCCL_SOCKET_SEND
?
'S'
:
'R'
,
comm
->
dev
,
tid
,
comm
->
hip
Dev
);
}
struct
scclNetSocketTask
*
r
=
queue
->
tasks
+
queue
->
next
;
if
(
r
->
used
==
0
)
{
...
...
@@ -473,7 +491,7 @@ scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void*
return
scclInternalError
;
}
scclResult_t
scclNetSocket
T
est
(
void
*
request
,
int
*
done
,
int
*
size
)
{
scclResult_t
scclNetSocket
::
t
est
(
void
*
request
,
int
*
done
,
int
*
size
)
{
*
done
=
0
;
struct
scclNetSocketRequest
*
r
=
(
struct
scclNetSocketRequest
*
)
request
;
if
(
r
==
NULL
)
{
...
...
@@ -555,43 +573,7 @@ scclResult_t scclNetSocketTest(void* request, int* done, int* size) {
return
scclSuccess
;
}
scclResult_t
scclNetSocketRegMr
(
void
*
comm
,
void
*
data
,
int
size
,
int
type
,
void
**
mhandle
)
{
return
(
type
!=
SCCL_PTR_HOST
)
?
scclInternalError
:
scclSuccess
;
}
scclResult_t
scclNetSocketDeregMr
(
void
*
comm
,
void
*
mhandle
)
{
return
scclSuccess
;
}
scclResult_t
scclNetSocketIsend
(
void
*
sendComm
,
void
*
data
,
int
size
,
int
tag
,
void
*
mhandle
,
void
**
request
)
{
struct
scclNetSocketComm
*
comm
=
(
struct
scclNetSocketComm
*
)
sendComm
;
SCCLCHECK
(
scclNetSocketGetRequest
(
comm
,
SCCL_SOCKET_SEND
,
data
,
size
,
(
struct
scclNetSocketRequest
**
)
request
));
return
scclSuccess
;
}
scclResult_t
scclNetSocketIrecv
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
int
*
tags
,
void
**
mhandles
,
void
**
request
)
{
struct
scclNetSocketComm
*
comm
=
(
struct
scclNetSocketComm
*
)
recvComm
;
if
(
n
!=
1
)
return
scclInternalError
;
SCCLCHECK
(
scclNetSocketGetRequest
(
comm
,
SCCL_SOCKET_RECV
,
data
[
0
],
sizes
[
0
],
(
struct
scclNetSocketRequest
**
)
request
));
return
scclSuccess
;
}
scclResult_t
scclNetSocketIflush
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
void
**
mhandles
,
void
**
request
)
{
// We don't support HIP pointers, so we don't need a flush operation
return
scclInternalError
;
}
scclResult_t
scclNetSocketCloseListen
(
void
*
opaqueComm
)
{
struct
scclNetSocketListenComm
*
comm
=
(
struct
scclNetSocketListenComm
*
)
opaqueComm
;
if
(
comm
)
{
int
ready
;
SCCLCHECK
(
scclSocketReady
(
&
comm
->
sock
,
&
ready
));
if
(
ready
)
SCCLCHECK
(
scclSocketClose
(
&
comm
->
sock
));
free
(
comm
);
}
return
scclSuccess
;
}
scclResult_t
scclNetSocketClose
(
void
*
opaqueComm
)
{
scclResult_t
scclNetSocket
::
closeSend
(
void
*
opaqueComm
)
{
struct
scclNetSocketComm
*
comm
=
(
struct
scclNetSocketComm
*
)
opaqueComm
;
if
(
comm
)
{
for
(
int
i
=
0
;
i
<
comm
->
nThreads
;
i
++
)
{
...
...
@@ -619,27 +601,21 @@ scclResult_t scclNetSocketClose(void* opaqueComm) {
return
scclSuccess
;
}
}
// namespace net_socket
/**
 * @brief Close a receive comm.
 *
 * Simply forwards to closeSend and returns its result.
 *
 * @param opaqueComm comm object to close
 */
scclResult_t scclNetSocket::closeRecv(void* opaqueComm) {
    return closeSend(opaqueComm);
}
scclNet_t
scclNetSocket
=
{
"Socket"
,
net_socket
::
scclNetSocketInit
,
net_socket
::
scclNetSocketDevices
,
net_socket
::
scclNetSocketGetProperties
,
net_socket
::
scclNetSocketListen
,
net_socket
::
scclNetSocketConnect
,
net_socket
::
scclNetSocketAccept
,
net_socket
::
scclNetSocketRegMr
,
NULL
,
// No DMA-BUF support
net_socket
::
scclNetSocketDeregMr
,
net_socket
::
scclNetSocketIsend
,
net_socket
::
scclNetSocketIrecv
,
net_socket
::
scclNetSocketIflush
,
net_socket
::
scclNetSocketTest
,
net_socket
::
scclNetSocketClose
,
net_socket
::
scclNetSocketClose
,
net_socket
::
scclNetSocketCloseListen
};
}
// namespace host
/**
 * @brief Close a listen comm and release its storage.
 *
 * If the listening socket reports ready it is closed first, then the comm
 * is freed. A null comm is accepted and is a no-op.
 *
 * listen() hands out the member socketComm as the listen comm, and the
 * destructor frees socketComm as well. The member is therefore cleared here
 * whenever the comm being released is that member, so the destructor does
 * not free the same block a second time.
 *
 * @param opaqueComm listen comm returned by listen(), or NULL
 * @return scclSuccess once the comm has been released (or was NULL)
 */
scclResult_t scclNetSocket::closeListen(void* opaqueComm) {
    struct scclNetSocketListenComm* comm = static_cast<struct scclNetSocketListenComm*>(opaqueComm);
    if(comm) {
        int ready;
        // NOTE(review): SCCLCHECK presumably returns early on a non-success result - confirm against its definition.
        SCCLCHECK(scclSocketReady(&comm->sock, &ready));
        if(ready)
            SCCLCHECK(scclSocketClose(&comm->sock));
        // Drop the member alias before freeing so no dangling pointer is left behind.
        if(comm == socketComm)
            socketComm = nullptr;
        free(comm);
    }
    return scclSuccess;
}
}
// namespace net_socket
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/net_socket/net_socket.h
0 → 100644
View file @
a4ac3320
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "base.h"
#include "net_utils.h"
#include "socket.h"
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
net_socket
{
/* Communication functions */
// Capacity of the per-comm socket array and of the per-request task array.
static constexpr int MAX_SOCKETS = 64;
// Capacity of the per-comm helper-thread and thread-resource arrays.
static constexpr int MAX_THREADS = 16;
// Capacity of the per-comm request array.
static constexpr int MAX_REQUESTS = SCCL_NET_MAX_REQUESTS;
// 64 KiB. NOTE(review): presumably the smallest chunk a transfer is split into - confirm at the use site.
static constexpr int MIN_CHUNKSIZE = (64 * 1024);
// Connection-establishment states of a socket communicator (value 2 is not used).
enum scclNetSocketCommState : uint8_t {
    scclNetSocketCommStateStart = 0,
    scclNetSocketCommStateConnect = 1,
    scclNetSocketCommStateAccept = 3,
    scclNetSocketCommStateSend = 4,
    scclNetSocketCommStateRecv = 5,
};
// Progress record of a connect/accept sequence.
// Every member has a default initializer, so a default-constructed stage is
// fully defined rather than having only `comm` set.
struct scclNetSocketCommStage {
    enum scclNetSocketCommState state{};       // current state; zero is scclNetSocketCommStateStart
    uint8_t iteration = 0;                     // NOTE(review): presumably the index of the socket being set up - confirm
    struct scclSocket* sock = nullptr;         // socket currently being worked on
    struct scclNetSocketComm* comm = nullptr;  // comm under construction
};
// Connection handle filled in by listen() and passed to connect().
// listen() statically asserts that it fits in SCCL_NET_HANDLE_MAXSIZE bytes.
struct scclNetSocketHandle {
    union scclSocketAddress connectAddr; // address of the listening socket
    uint64_t magic;                      // random number to help debugging
    int nSocks;                          // socket count copied from the listen comm
    int nThreads;                        // thread count copied from the listen comm
    struct scclNetSocketCommStage stage; // connection progress state
};
// One unit of socket work held in a thread's task queue.
// Every member has a default initializer, so a default-constructed task is
// fully defined; in particular `used` starts at 0, the value tested when
// looking for a free slot.
struct scclNetSocketTask {
    int op = 0;                        // operation type (SCCL_SOCKET_SEND / SCCL_SOCKET_RECV)
    void* data = nullptr;              // data buffer
    int size = 0;                      // size of the buffer in bytes
    struct scclSocket* sock = nullptr; // socket the task runs on
    int offset = 0;                    // NOTE(review): presumably bytes already transferred - confirm
    int used = 0;                      // 0 while the slot is free
    scclResult_t result{};             // result code, value-initialized to zero
};
// An in-flight send or receive request, optionally split into sub-tasks.
// Every member has a default initializer, so a default-constructed request is
// fully defined rather than having only its pointers set.
struct scclNetSocketRequest {
    int op = 0;                                               // operation type (SCCL_SOCKET_SEND / SCCL_SOCKET_RECV)
    void* data = nullptr;                                     // data buffer
    int size = 0;                                             // size of the buffer in bytes
    struct scclSocket* ctrlSock = nullptr;                    // control socket
    int offset = 0;                                           // NOTE(review): presumably progress of the size exchange - confirm
    int used = 0;                                             // NOTE(review): presumably 0 while the slot is free - confirm
    struct scclNetSocketComm* comm = nullptr;                 // owning comm
    struct scclNetSocketTask* tasks[MAX_SOCKETS] = {nullptr}; // sub-tasks of this request
    int nSubs = 0;                                            // NOTE(review): presumably the number of valid entries in tasks - confirm
};
// Task slots of one helper thread.
// Every member has a default initializer, so a default-constructed queue is
// fully defined rather than having only `tasks` set.
struct scclNetSocketTaskQueue {
    int next = 0;                              // index of the next slot to hand out
    int len = 0;                               // number of slots in tasks
    struct scclNetSocketTask* tasks = nullptr; // slot array
};
// State of one persistent helper thread.
// `stop` has a default initializer so a default-constructed instance never
// starts with an indeterminate stop flag.
struct scclNetSocketThreadResources {
    struct scclNetSocketTaskQueue threadTaskQueue; // tasks assigned to this thread
    int stop = 0;                                  // stop signal for the thread
    struct scclNetSocketComm* comm = nullptr;      // comm the thread works for
    // Deliberately left without initializers: both are set up with
    // pthread_mutex_init / pthread_cond_init before the thread is created.
    pthread_mutex_t threadLock;
    pthread_cond_t threadCond;
};
// Listening side of a connection, filled in by listen().
struct scclNetSocketListenComm {
    struct scclSocket sock;              // listening socket
    struct scclNetSocketCommStage stage; // progress of the accept in flight
    int nSocks;                          // socket count from scclNetSocketGetNsockNthread
    int nThreads;                        // thread count from scclNetSocketGetNsockNthread
    int dev;                             // device index passed to listen()
};
// Send or receive communicator: one control socket plus the data sockets,
// with the requests and helper threads that serve them.
struct scclNetSocketComm {
    struct scclSocket ctrlSock;                                        // control socket
    struct scclSocket socks[MAX_SOCKETS];                              // data sockets
    int dev;                                                           // network device index
    int hipDev;                                                        // HIP device, filled in with hipGetDevice
    int nSocks;                                                        // number of data sockets in use
    int nThreads;                                                      // number of helper threads in use
    int nextSock;                                                      // rotating index; nextSock % nThreads selects the thread for a task
    struct scclNetSocketRequest requests[MAX_REQUESTS];                // request slots
    pthread_t helperThread[MAX_THREADS];                               // helper thread ids
    struct scclNetSocketThreadResources threadResources[MAX_THREADS];  // per-thread state
};
//////////////////////////////////
// TCP-socket network transport, implementing the scclNetBase interface.
class scclNetSocket : public scclNetBase {
public:
    // Constructor and destructor.
    scclNetSocket();
    virtual ~scclNetSocket();

    // The destructor frees the raw socketComm pointer, so a copy would share
    // that pointer and free it twice. Copying is therefore disabled.
    scclNetSocket(const scclNetSocket&) = delete;
    scclNetSocket& operator=(const scclNetSocket&) = delete;

    // Initialize the network.
    scclResult_t init() override;

    // Return the number of adapters.
    scclResult_t devices(int* ndev) override;

    // Get various device properties.
    scclResult_t getProperties(int dev, scclNetProperties_t* props) override;

    // Create a receiving object and provide a handle to connect to it. The handle can be up to
    // SCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged between ranks to create a connection.
    scclResult_t listen(int dev, void* handle, void** listenComm) override;

    // Connect to a handle and return a sending comm object for that peer.
    // This call must not block for the connection to be established; it should instead return
    // successfully with sendComm == NULL, expecting to be called again until sendComm != NULL.
    scclResult_t connect(int dev, void* handle, void** sendComm) override;

    // Finalize connection establishment after the remote peer has called connect.
    // This call must not block for the connection to be established; it should instead return
    // successfully with recvComm == NULL, expecting to be called again until recvComm != NULL.
    scclResult_t accept(void* listenComm, void** recvComm) override;

    // Register memory. Comm can be either a sendComm or a recvComm.
    // This transport accepts SCCL_PTR_HOST only; any other type is rejected.
    scclResult_t regMr(void* comm, void* data, int size, int type, void** mhandle) override;

    /* DMA-BUF entry point: this transport only logs a warning and reports success. */
    scclResult_t regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) override;

    // Deregister memory. This transport has nothing to release.
    scclResult_t deregMr(void* comm, void* mhandle) override;

    // Asynchronous send to a peer.
    // May return request == NULL if the call cannot be performed (or would block).
    scclResult_t isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) override;

    // Asynchronous receive from a peer. May return request == NULL if the call cannot be performed (or would block).
    // This transport accepts n == 1 only.
    scclResult_t irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) override;

    // Flush/fence for data received into device memory. Not supported by this transport: always fails.
    scclResult_t iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) override;

    // Test whether a request is complete. If size is not NULL, it returns the number of bytes sent/received.
    scclResult_t test(void* request, int* done, int* sizes) override;

    // Close and free send/recv comm objects.
    scclResult_t closeSend(void* sendComm) override;
    scclResult_t closeRecv(void* recvComm) override;
    scclResult_t closeListen(void* listenComm) override;

private:
    // Listen-comm storage allocated in init() and released by the destructor.
    struct scclNetSocketListenComm* socketComm = nullptr;

private:
    // Get the PCI path of a network device.
    static scclResult_t scclNetSocketGetPciPath(char* devName, char** pciPath);

    // Get the speed of the given network device (in Mbps).
    scclResult_t scclNetSocketGetSpeed(char* devName, int* speed);

    // Entry point of the persistent socket helper thread.
    static void* persistentSocketThread(void* args_);

    // Create and return a socket task for the given comm.
    scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void* data, int size, struct scclNetSocketTask** req);

    // Get the socket-count and thread-count configuration for the given device.
    scclResult_t scclNetSocketGetNsockNthread(int dev, int* ns, int* nt);
};
}
// namespace net_socket
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/
hos
t/socket.cpp
→
src/hardware/net/
net_socke
t/socket.cpp
View file @
a4ac3320
...
...
@@ -15,7 +15,7 @@
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
hos
t
{
namespace
net_socke
t
{
namespace
socket_base
{
/**
...
...
@@ -383,7 +383,7 @@ static scclResult_t socketFinalizeConnect(struct scclSocket* sock) {
return
scclSuccess
;
}
static
scclResult_t
socketProgressState
(
struct
host
::
scclSocket
*
sock
)
{
static
scclResult_t
socketProgressState
(
struct
scclSocket
*
sock
)
{
if
(
sock
->
state
==
scclSocketStateAccepting
)
{
SCCLCHECK
(
socketTryAccept
(
sock
));
}
...
...
@@ -588,8 +588,13 @@ int scclFindInterfaceMatchSubnet(char* ifNames, union scclSocketAddress* localAd
* @brief 查找可用的socket网络接口
*
* 该函数用于查找系统中可用的网络接口,支持通过环境变量指定接口或自动探测。
* 查找顺序:1) 用户指定的接口(SCCL_SOCKET_IFNAME) 2) IB接口 3) 与SCCL_COMM_ID同子网的接口
* 4) 排除docker和lo的其他接口 5) docker接口 6) lo接口
* 查找顺序:
* 1) 用户指定的接口(SCCL_SOCKET_IFNAME)
* 2) IB接口
* 3) 与SCCL_COMM_ID同子网的接口
* 4) 排除docker和lo的其他接口
* 5) docker接口
* 6) lo接口
*
* @param ifNames 输出参数,存储找到的接口名称
* @param ifAddrs 输出参数,存储找到的接口地址
...
...
@@ -630,9 +635,9 @@ int scclFindSocketInterfaces(char* ifNames, union scclSocketAddress* ifAddrs, in
WARN
(
"No socket network interface found. "
);
}
//
//
Then look for anything else (but not docker or lo)
//
if(nIfs == 0)
//
nIfs = socket_base::findSocketInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// Then look for anything else (but not docker or lo)
if
(
nIfs
==
0
)
nIfs
=
socket_base
::
findSocketInterfaces
(
"^docker,lo"
,
ifNames
,
ifAddrs
,
sock_family
,
ifNameMaxSize
,
maxIfs
);
// // Finally look for docker, then lo.
// if(nIfs == 0)
// nIfs = socket_base::findSocketInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
...
...
@@ -868,6 +873,16 @@ scclResult_t scclSocketListen(struct scclSocket* sock) {
return
scclSuccess
;
}
/**
* 获取socket地址信息
*
* @param sock 要获取地址的socket指针,不能为NULL
* @param addr 用于存储获取到的地址信息的缓冲区
* @return scclResult_t 返回操作结果:
* - scclInvalidArgument: 参数无效(sock为NULL)
* - scclInternalError: socket未就绪
* - scclSuccess: 操作成功
*/
scclResult_t
scclSocketGetAddr
(
struct
scclSocket
*
sock
,
union
scclSocketAddress
*
addr
)
{
if
(
sock
==
NULL
)
{
WARN
(
"scclSocketGetAddr: pass NULL socket"
);
...
...
@@ -1101,7 +1116,7 @@ scclResult_t scclSocketSetFd(int fd, struct scclSocket* sock) {
return
scclSuccess
;
}
}
// namespace
hos
t
}
// namespace
net_socke
t
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/
hos
t/socket.h
→
src/hardware/net/
net_socke
t/socket.h
View file @
a4ac3320
...
...
@@ -11,7 +11,7 @@
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
hos
t
{
namespace
net_socke
t
{
#define MAX_IFS 16 // 最大接口数量
#define MAX_IF_NAME_SIZE 16 // 每个接口名称的最大长度
...
...
@@ -114,7 +114,7 @@ scclResult_t scclSocketGetFd(struct scclSocket* sock, int* fd);
// 设置socket文件描述符
scclResult_t
scclSocketSetFd
(
int
fd
,
struct
scclSocket
*
sock
);
}
// namespace
hos
t
}
// namespace
net_socke
t
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/net_utils.cpp
View file @
a4ac3320
...
...
@@ -97,6 +97,22 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
return
false
;
}
/**
 * @brief Print the properties of one network device to stdout on a single line.
 *
 * @param props     Device properties to print (dereferenced unconditionally).
 * @param rank      Rank printed in the "rank=" field.
 * @param localRank Local rank printed in the "localRank=" field.
 * @return Always scclSuccess.
 */
scclResult_t printNetProps(const scclNetProperties_t* props, int rank, int localRank) {
    const scclNetProperties_t& p = *props;
    // Adjacent literals are concatenated by the compiler into one format string.
    printf("rank=%d, localRank=%d, "
           "device name=%s, pciPath=%s, guid=%lu, "
           "ptrSupport=%d, speed=%d, port=%d, "
           "latency=%f, maxComms=%d, maxRecvs=%d\n",
           rank,
           localRank,
           p.name,
           p.pciPath,
           p.guid,
           p.ptrSupport,
           p.speed,
           p.port,
           p.latency,
           p.maxComms,
           p.maxRecvs);
    return scclSuccess;
}
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/net_utils.h
View file @
a4ac3320
...
...
@@ -7,18 +7,11 @@ namespace sccl {
namespace
hardware
{
namespace
net
{
struct
netIf
{
// 网络接口结构体
char
prefix
[
64
];
// 网络前缀
int
port
;
// 端口号
};
// 解析字符串列表,将结果存储在网络接口列表中
int
parseStringList
(
const
char
*
string
,
struct
netIf
*
ifList
,
int
maxList
);
// 根据给定的字符串和端口,匹配网络接口列表中的接口
bool
matchIfList
(
const
char
*
string
,
int
port
,
struct
netIf
*
ifList
,
int
listSize
,
bool
matchExact
);
scclResult_t
rocmLibraryInit
(
void
);
typedef
enum
{
SCCL_PTR_HOST
=
0x1
,
SCCL_PTR_CUDA
=
0x2
,
SCCL_PTR_DMABUF
=
0x4
}
sccl_ptr_t
;
////////////////////////////////// 用于定义网络设备 //////////////////////////////////
typedef
struct
{
...
...
@@ -33,53 +26,87 @@ typedef struct {
int
maxRecvs
;
// 最大分组接收数量。
}
scclNetProperties_t
;
typedef
struct
{
// 网络的名称(主要用于日志)
const
char
*
name
;
/**
* @brief scclNetBase 类定义了网络通信的基础接口
*
* 该类是一个抽象基类,提供了网络初始化、设备管理、连接建立、内存注册、
* 数据传输等核心功能的纯虚函数接口。具体实现应由派生类完成。
*
* 主要功能包括:
* - 网络初始化和设备属性查询
* - 监听/连接建立和管理
* - 内存注册和注销
* - 异步发送/接收操作
* - 请求状态测试
* - 连接关闭
*
* 接口设计为非阻塞式,支持异步操作。
*/
typedef
class
scclNetBase
{
public:
// 构造函数和析构函数
scclNetBase
(
const
char
*
net_name
)
:
name
(
net_name
)
{};
virtual
~
scclNetBase
()
{};
// 初始化网络。
scclResult_t
(
*
init
)
();
virtual
scclResult_t
init
()
=
0
;
// 返回适配器的数量。
scclResult_t
(
*
devices
)
(
int
*
ndev
);
virtual
scclResult_t
devices
(
int
*
ndev
)
=
0
;
// 获取各种设备属性。
scclResult_t
(
*
getProperties
)
(
int
dev
,
scclNetProperties_t
*
props
);
virtual
scclResult_t
getProperties
(
int
dev
,
scclNetProperties_t
*
props
)
=
0
;
// 创建一个接收对象并提供一个句柄以连接到它。该句柄最多可以是 SCCL_NET_HANDLE_MAXSIZE 字节,并将在排名之间交换以创建连接。
scclResult_t
(
*
listen
)
(
int
dev
,
void
*
handle
,
void
**
listenComm
);
virtual
scclResult_t
listen
(
int
dev
,
void
*
handle
,
void
**
listenComm
)
=
0
;
// 连接到一个句柄并返回一个发送 comm 对象给该对等体。
// 此调用不应阻塞以建立连接,而应成功返回 sendComm == NULL,并期望再次调用直到 sendComm != NULL。
scclResult_t
(
*
connect
)
(
int
dev
,
void
*
handle
,
void
**
sendComm
);
virtual
scclResult_t
connect
(
int
dev
,
void
*
handle
,
void
**
sendComm
)
=
0
;
// 在远程对等体调用 connect 后最终确定连接建立。
// 此调用不应阻塞以建立连接,而应成功返回 recvComm == NULL,并期望再次调用直到 recvComm != NULL。
scclResult_t
(
*
accept
)
(
void
*
listenComm
,
void
**
recvComm
);
virtual
scclResult_t
accept
(
void
*
listenComm
,
void
**
recvComm
)
=
0
;
// 注册/注销内存。Comm 可以是 sendComm 或 recvComm。
// 类型是 SCCL_PTR_HOST 或 SCCL_PTR_CUDA。
scclResult_t
(
*
regMr
)
(
void
*
comm
,
void
*
data
,
int
size
,
int
type
,
void
**
mhandle
);
virtual
scclResult_t
regMr
(
void
*
comm
,
void
*
data
,
int
size
,
int
type
,
void
**
mhandle
)
=
0
;
/* DMA-BUF 支持 */
scclResult_t
(
*
regMrDmaBuf
)(
void
*
comm
,
void
*
data
,
size_t
size
,
int
type
,
uint64_t
offset
,
int
fd
,
void
**
mhandle
);
scclResult_t
(
*
deregMr
)(
void
*
comm
,
void
*
mhandle
);
virtual
scclResult_t
regMrDmaBuf
(
void
*
comm
,
void
*
data
,
size_t
size
,
int
type
,
uint64_t
offset
,
int
fd
,
void
**
mhandle
)
=
0
;
// 注销IB内存区域(MR)
virtual
scclResult_t
deregMr
(
void
*
comm
,
void
*
mhandle
)
=
0
;
// 异步发送到对等体。
// 如果调用不能执行(或会阻塞),则可能返回 request == NULL
scclResult_t
(
*
isend
)
(
void
*
sendComm
,
void
*
data
,
int
size
,
int
tag
,
void
*
mhandle
,
void
**
request
);
virtual
scclResult_t
isend
(
void
*
sendComm
,
void
*
data
,
int
size
,
int
tag
,
void
*
mhandle
,
void
**
request
)
=
0
;
// 异步从对等体接收。 如果调用不能执行(或会阻塞),则可能返回 request == NULL
scclResult_t
(
*
irecv
)
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
int
*
tags
,
void
**
mhandles
,
void
**
request
);
virtual
scclResult_t
irecv
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
int
*
tags
,
void
**
mhandles
,
void
**
request
)
=
0
;
// 执行刷新/栅栏操作,以确保所有使用 SCCL_PTR_CUDA 接收到的数据对 GPU 可见
scclResult_t
(
*
iflush
)
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
void
**
mhandles
,
void
**
request
);
virtual
scclResult_t
iflush
(
void
*
recvComm
,
int
n
,
void
**
data
,
int
*
sizes
,
void
**
mhandles
,
void
**
request
)
=
0
;
// 测试请求是否完成。如果 size 不为 NULL,则返回发送/接收的字节数。
scclResult_t
(
*
test
)
(
void
*
request
,
int
*
done
,
int
*
sizes
);
virtual
scclResult_t
test
(
void
*
request
,
int
*
done
,
int
*
sizes
)
=
0
;
// 关闭并释放 send/recv comm 对象
scclResult_t
(
*
closeSend
)(
void
*
sendComm
);
scclResult_t
(
*
closeRecv
)(
void
*
recvComm
);
scclResult_t
(
*
closeListen
)(
void
*
listenComm
);
}
scclNet_t
;
virtual
scclResult_t
closeSend
(
void
*
sendComm
)
=
0
;
virtual
scclResult_t
closeRecv
(
void
*
recvComm
)
=
0
;
virtual
scclResult_t
closeListen
(
void
*
listenComm
)
=
0
;
////////////////////////////////// 其他定义 //////////////////////////////////
public:
// 网络的名称(主要用于日志)
const
char
*
name
;
typedef
enum
sccl_ptr
{
SCCL_PTR_HOST
=
0x1
,
SCCL_PTR_CUDA
=
0x2
,
SCCL_PTR_DMABUF
=
0x4
}
sccl
_ptr_t
;
}
scclNet_t
;
////////////////////////////////// 功能函数 //////////////////////////////////
// 初始化 ROCm 库
sccl
Result_t
rocmLibraryInit
(
void
)
;
#define SCCL_NET_HANDLE_MAXSIZE 128
struct
netIf
{
// 网络接口结构体
char
prefix
[
64
];
// 网络前缀
int
port
;
// 端口号
};
// 解析字符串列表,将结果存储在网络接口列表中
int
parseStringList
(
const
char
*
string
,
struct
netIf
*
ifList
,
int
maxList
);
// 根据给定的字符串和端口,匹配网络接口列表中的接口
bool
matchIfList
(
const
char
*
string
,
int
port
,
struct
netIf
*
ifList
,
int
listSize
,
bool
matchExact
);
// 打印网络属性信息
scclResult_t
printNetProps
(
const
scclNetProperties_t
*
props
,
int
rank
,
int
localRank
);
}
// namespace net
}
// namespace hardware
...
...
src/hardware/net/rocm_wrap.cpp
View file @
a4ac3320
...
...
@@ -170,6 +170,13 @@ error:
}
// namespace rocm_wrap
/**
* 初始化 ROCm 库
*
* 该函数使用 pthread_once 确保 ROCm 库只被初始化一次。
*
* @return 返回初始化结果,scclResult_t 类型。
*/
scclResult_t
rocmLibraryInit
()
{
pthread_once
(
&
rocm_wrap
::
initOnceControl
,
rocm_wrap
::
initOnceFunc
);
return
rocm_wrap
::
initResult
;
...
...
src/hardware/net/rocm_wrap.h
View file @
a4ac3320
...
...
@@ -23,6 +23,7 @@ DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
}
// namespace rocm_wrap
// 初始化 ROCm 库
scclResult_t
rocmLibraryInit
(
void
);
}
// namespace net
...
...
src/hardware/topo_bak/cpuset.h
deleted
100644 → 0
View file @
d9d23f34
/*************************************************************************
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_CPUSET_H_
#define SCCL_CPUSET_H_
#include "base.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
topo
{
// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
/**
 * Convert one lowercase hexadecimal digit to its integer value.
 *
 * Only '0'-'9' and 'a'-'f' are accepted. The previous arithmetic form
 * (remapping every character above '9' through `10 + c - 'a'`) wrongly mapped
 * the characters 'W'..'`' to 0..9; explicit range checks reject them.
 *
 * @param c Character to convert.
 * @return 0-15 for a valid digit, -1 for any other character.
 */
static int hexToInt(char c) {
    if(c >= '0' && c <= '9') {
        return c - '0';
    }
    if(c >= 'a' && c <= 'f') {
        return 10 + (c - 'a');
    }
    return -1;
}
#define CPU_SET_N_U32 (sizeof(cpu_set_t) / sizeof(uint32_t))
/**
 * Parse a comma-separated hexadecimal CPU mask (e.g. "0003ff,f0003fff") into a cpu_set_t.
 *
 * The leftmost group of the string is the most significant 32-bit word. Parsing
 * stops at the first character that is neither a comma nor a lowercase hex digit.
 *
 * @param str  NUL-terminated mask string.
 * @param mask Output CPU set. It is cleared first, so words not covered by the
 *             string are zero rather than left with their previous contents.
 * @return scclSuccess on success; scclInvalidArgument if the string contains more
 *         32-bit groups than a cpu_set_t can hold (previously this wrote out of bounds).
 */
static scclResult_t scclStrToCpuset(const char* str, cpu_set_t* mask) {
    uint32_t cpumasks[CPU_SET_N_U32] = {0};
    // Groups are filled from the top of the scratch array downwards.
    int m = CPU_SET_N_U32 - 1;
    for(const char* p = str; *p != '\0'; ++p) {
        if(*p == ',') {
            if(m == 0) {
                // One more group would index cpumasks[-1].
                return scclInvalidArgument;
            }
            --m;
            continue;
        }
        const int v = hexToInt(*p);
        if(v == -1)
            break;
        cpumasks[m] = (cpumasks[m] << 4) + v;
    }
    // The parsed words occupy cpumasks[m..end]; they become the low words of the mask.
    memset(mask, 0, sizeof(cpu_set_t));
    const size_t nWords = CPU_SET_N_U32 - (size_t)m;
    memcpy(mask, cpumasks + m, nWords * sizeof(uint32_t));
    return scclSuccess;
}
/**
 * Render a cpu_set_t as a hexadecimal mask string.
 *
 * Bytes are emitted from the highest address to the lowest as two lowercase hex
 * digits each. Leading zero bytes are skipped, and a comma is written after every
 * byte whose index is a non-zero multiple of 4. An all-zero set gives "".
 *
 * @param mask CPU set to render.
 * @param str  Output buffer; no length is passed, so the caller must size it
 *             for the full mask plus separators and terminator.
 * @return Always scclSuccess.
 */
static scclResult_t scclCpusetToStr(cpu_set_t* mask, char* str) {
    const uint8_t* bytes = (const uint8_t*)mask;
    char* out            = str;
    bool started         = false;
    int idx              = sizeof(cpu_set_t);
    while(idx-- > 0) {
        const uint8_t b = bytes[idx];
        if(!started && b == 0)
            continue; // still inside the run of leading zero bytes
        started = true;
        out += sprintf(out, "%02x", b);
        if(idx != 0 && (idx % 4) == 0) {
            *out++ = ',';
        }
    }
    *out = '\0';
    return scclSuccess;
}
/**
 * Render a cpu_set_t as a list of CPU ranges, e.g. "0-3,5,7-9".
 *
 * Output is truncated if the buffer is too small; an empty set gives "".
 * A zero-length buffer is left untouched (previously the size arithmetic wrapped
 * around and snprintf could write past the buffer).
 *
 * @param mask CPU set to render.
 * @param str  Output buffer.
 * @param len  Size of the output buffer in bytes.
 * @return str.
 */
static char* scclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
    if(len == 0)
        return str; // no room even for the terminator
    str[0] = '\0';

    size_t used = 0;
    int start   = -1; // first CPU of the range being scanned, -1 when outside a range
    // Visit one position past the last CPU so a range ending at CPU_SETSIZE-1 is flushed.
    for(int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
        const bool isSet = (cpu < CPU_SETSIZE) && CPU_ISSET(cpu, mask);
        if(isSet) {
            if(start == -1)
                start = cpu;
            continue;
        }
        if(start == -1)
            continue;

        // A range just ended at cpu - 1: append it, comma-separated from earlier ranges.
        const char* sep = used ? "," : "";
        const int last  = cpu - 1;
        const int n     = (last == start) ? snprintf(str + used, len - used, "%s%d", sep, start)
                                          : snprintf(str + used, len - used, "%s%d-%d", sep, start, last);
        if(n < 0)
            break;
        used += (size_t)n;
        // snprintf returns the untruncated length, so this also catches truncation.
        if(used >= len - 1)
            break;
        start = -1;
    }
    return str;
}
}
// namespace topo
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
#endif
src/hardware/topo_bak/detect_topo.cc
deleted
100644 → 0
View file @
d9d23f34
src/hardware/topo_bak/detect_topo.h
deleted
100644 → 0
View file @
d9d23f34
src/hardware/topo_bak/nvmlwrap.cc
deleted
100644 → 0
View file @
d9d23f34
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nvmlwrap.h"
#include "base.h"
#include <initializer_list>
#include <memory>
#include <mutex>
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
// Number of devices reported by NVML; set by scclNvmlEnsureInitialized().
int scclNvmlDeviceCount = 0;
// Per-device cache (handle and compute capability), filled by scclNvmlEnsureInitialized().
scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
// Per-device-pair cache of P2P read/write status, filled by scclNvmlEnsureInitialized().
scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];
// SCCL_NVML_FN(name, rettype, arglist) declares a function pointer pfn_<name>.
// With SCCL_NVML_DIRECT it is a constexpr pointer to the symbol `name` itself;
// otherwise it starts as nullptr and is filled in via dlsym() by
// scclNvmlEnsureInitialized().
#if SCCL_NVML_DIRECT
#define SCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name) arglist = name;
#else
#include <dlfcn.h>
#define SCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name) arglist = nullptr;
#endif

namespace {
// One pfn_* pointer per NVML entry point used in this file.
SCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
SCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
SCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive))
SCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci))
SCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult))
SCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
SCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
SCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values))

// Held around initialization and around the direct NVML calls made by the wrappers below.
std::mutex lock; // NVML has had some thread safety bugs
// True once some thread has started initialization (checked under `lock`).
bool initialized = false;
// Per-thread flag set on a thread's first call to scclNvmlEnsureInitialized();
// later calls on that thread return initResult without taking `lock`.
thread_local bool threadInitialized = false;
// Result of the one-time initialization, returned by every later call.
scclResult_t initResult;
} // namespace
/**
 * @brief Initialize NVML once and populate the cached device tables.
 *
 * The first call (across all threads) loads libnvidia-ml.so.1 via dlopen/dlsym
 * unless SCCL_NVML_DIRECT is set, initializes NVML, and fills scclNvmlDeviceCount,
 * scclNvmlDevices (handle + compute capability) and scclNvmlDevicePairs (P2P
 * read/write status). The outcome is stored in initResult and returned by all
 * later calls.
 *
 * @return scclSuccess on success; scclSystemError if the library cannot be
 *         opened or an NVML call fails; scclInternalError if NVML reports more
 *         devices than scclNvmlMaxDevices.
 */
scclResult_t scclNvmlEnsureInitialized() {
    // Optimization to avoid repeatedly grabbing the lock when we only want to
    // read from the global tables.
    if(threadInitialized)
        return initResult;
    threadInitialized = true;

    std::lock_guard<std::mutex> locked(lock);

    if(initialized)
        return initResult;
    initialized = true;

#if !SCCL_NVML_DIRECT
    if(pfn_nvmlInit == nullptr) {
        void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
        if(libhandle == nullptr) {
            WARN("Failed to open libnvidia-ml.so.1");
            initResult = scclSystemError;
            return initResult;
        }

        struct Symbol {
            void** ppfn;
            char const* name;
        };
        std::initializer_list<Symbol> symbols = {{(void**)&pfn_nvmlInit, "nvmlInit"},
                                                 {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
                                                 {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
                                                 {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
                                                 {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
                                                 {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
                                                 {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
                                                 {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
                                                 {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
                                                 {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
                                                 {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
                                                 {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}};
        // A symbol missing from the library leaves its pointer null.
        // NOTE(review): the required entry points used below are called without a null check.
        for(Symbol sym : symbols) {
            *sym.ppfn = dlsym(libhandle, sym.name);
        }
    }
#endif

#if SCCL_NVML_DIRECT
    bool have_v2 = true;
#else
    bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the SCCL_NVML_DIRECT=1 case then GCC warns about it never being null
#endif
    nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
    if(res1 != NVML_SUCCESS) {
        WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
        initResult = scclSystemError;
        return initResult;
    }

    unsigned int ndev;
    res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
    if(res1 != NVML_SUCCESS) {
        WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
        initResult = scclSystemError;
        return initResult;
    }

    scclNvmlDeviceCount = int(ndev);
    if(scclNvmlMaxDevices < scclNvmlDeviceCount) {
        WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (scclNvmlMaxDevices=%d)", scclNvmlDeviceCount, scclNvmlMaxDevices);
        initResult = scclInternalError;
        return initResult;
    }

    // Cache each device's handle and compute capability.
    for(int a = 0; a < scclNvmlDeviceCount; a++) {
        res1 = pfn_nvmlDeviceGetHandleByIndex(a, &scclNvmlDevices[a].handle);
        if(res1 != NVML_SUCCESS) {
            WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
            initResult = scclSystemError;
            return initResult;
        }

        res1 = pfn_nvmlDeviceGetCudaComputeCapability(
            scclNvmlDevices[a].handle, &scclNvmlDevices[a].computeCapabilityMajor, &scclNvmlDevices[a].computeCapabilityMinor);
        if(res1 != NVML_SUCCESS) {
            WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
            initResult = scclSystemError;
            return initResult;
        }
    }

    // Cache P2P read/write status for every ordered device pair.
    for(int a = 0; a < scclNvmlDeviceCount; a++) {
        for(int b = 0; b < scclNvmlDeviceCount; b++) {
            nvmlDevice_t da = scclNvmlDevices[a].handle;
            nvmlDevice_t db = scclNvmlDevices[b].handle;

            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &scclNvmlDevicePairs[a][b].p2pStatusRead);
            if(res1 != NVML_SUCCESS) {
                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
                initResult = scclSystemError;
                return initResult;
            }

            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &scclNvmlDevicePairs[a][b].p2pStatusWrite);
            if(res1 != NVML_SUCCESS) {
                // Fixed: this message previously named NVML_P2P_CAPS_INDEX_READ for the WRITE query.
                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1));
                initResult = scclSystemError;
                return initResult;
            }
        }
    }

    initResult = scclSuccess;
    return initResult;
}
// NVMLCHECK: call pfn_<name>(args); on any NVML failure emit a WARN and make the
// enclosing function return scclSystemError.
#define NVMLCHECK(name, ...)                                             \
    do {                                                                 \
        nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__);                \
        if(e44241808 != NVML_SUCCESS) {                                  \
            WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
            return scclSystemError;                                      \
        }                                                                \
    } while(0)

// NVMLTRY: like NVMLCHECK but quieter. A missing symbol (non-direct mode) returns
// scclInternalError without logging; an NVML failure returns scclSystemError and is
// logged at INFO level unless it is NVML_ERROR_NOT_SUPPORTED.
// The comment must stay on the same continued line as the return: on its own line
// without a trailing backslash it terminates the macro early.
#define NVMLTRY(name, ...)                                                                  \
    do {                                                                                    \
        if(!SCCL_NVML_DIRECT && pfn_##name == nullptr)                                      \
            return scclInternalError; /* missing symbol is not a warned error */            \
        nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__);                                   \
        if(e44241808 != NVML_SUCCESS) {                                                     \
            if(e44241808 != NVML_ERROR_NOT_SUPPORTED)                                       \
                INFO(SCCL_LOG_TOPO, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
            return scclSystemError;                                                         \
        }                                                                                   \
    } while(0)
/**
 * @brief Look up an NVML device handle by PCI bus id.
 *
 * Ensures NVML is initialized, then calls nvmlDeviceGetHandleByPciBusId while
 * holding the file-level NVML mutex.
 *
 * @param pciBusId PCI bus id string passed through to NVML.
 * @param device   Receives the device handle.
 * @return scclSuccess, the initialization error, or scclSystemError if the NVML call fails.
 */
scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    std::lock_guard<std::mutex> locked(lock);
    NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
    return scclSuccess;
}
/**
 * @brief Return the cached NVML handle for a device index.
 *
 * The handle comes from the table filled by scclNvmlEnsureInitialized(); NVML
 * itself is not called here.
 *
 * @param index  Device index; must be below scclNvmlDeviceCount.
 * @param device Receives the cached handle.
 * @return scclSuccess, the initialization error, or scclInvalidArgument when
 *         index is out of range (previously this read an unpopulated or
 *         out-of-bounds table entry).
 */
scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    if(scclNvmlDeviceCount < 0 || index >= (unsigned int)scclNvmlDeviceCount) {
        return scclInvalidArgument;
    }
    *device = scclNvmlDevices[index].handle;
    return scclSuccess;
}
/**
 * @brief Find the index of a device handle in the cached device table.
 *
 * @param device Handle to search for.
 * @param index  Receives the position of the first matching table entry.
 * @return scclSuccess when found, scclInvalidArgument when no entry matches,
 *         or the initialization error.
 */
scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    int pos = 0;
    // Linear scan of the cached table; stops at the first match.
    while(pos < scclNvmlDeviceCount && !(scclNvmlDevices[pos].handle == device)) {
        ++pos;
    }
    if(pos >= scclNvmlDeviceCount) {
        return scclInvalidArgument;
    }
    *index = pos;
    return scclSuccess;
}
/**
 * @brief Query whether an NVLink link of a device is active.
 *
 * Ensures NVML is initialized, then calls nvmlDeviceGetNvLinkState through
 * NVMLTRY while holding the file-level NVML mutex.
 *
 * @param device   Device handle.
 * @param link     Link number passed through to NVML.
 * @param isActive Receives the link state.
 * @return scclSuccess, the initialization error, scclInternalError if the symbol
 *         was not loaded, or scclSystemError if the NVML call fails.
 */
scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    std::lock_guard<std::mutex> locked(lock);
    NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
    return scclSuccess;
}
/**
 * @brief Query the PCI info of the remote end of an NVLink link.
 *
 * Ensures NVML is initialized, then calls nvmlDeviceGetNvLinkRemotePciInfo
 * through NVMLTRY while holding the file-level NVML mutex.
 *
 * @param device Device handle.
 * @param link   Link number passed through to NVML.
 * @param pci    Receives the remote PCI info.
 * @return scclSuccess, the initialization error, scclInternalError if the symbol
 *         was not loaded, or scclSystemError if the NVML call fails.
 */
scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    std::lock_guard<std::mutex> locked(lock);
    NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
    return scclSuccess;
}
/**
 * @brief Query one capability of an NVLink link.
 *
 * Ensures NVML is initialized, then calls nvmlDeviceGetNvLinkCapability through
 * NVMLTRY while holding the file-level NVML mutex.
 *
 * @param device     Device handle.
 * @param link       Link number passed through to NVML.
 * @param capability Capability to query.
 * @param capResult  Receives the capability value.
 * @return scclSuccess, the initialization error, scclInternalError if the symbol
 *         was not loaded, or scclSystemError if the NVML call fails.
 */
scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    std::lock_guard<std::mutex> locked(lock);
    NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
    return scclSuccess;
}
/**
 * @brief Return the cached compute capability of a device.
 *
 * The values come from the table filled by scclNvmlEnsureInitialized(); NVML
 * itself is not called here.
 *
 * @param device Handle to look up.
 * @param major  Receives the cached major version.
 * @param minor  Receives the cached minor version.
 * @return scclSuccess when the handle is in the table, scclInvalidArgument
 *         otherwise, or the initialization error.
 */
scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    int pos = 0;
    // Linear scan of the cached table; stops at the first match.
    while(pos < scclNvmlDeviceCount && !(device == scclNvmlDevices[pos].handle)) {
        ++pos;
    }
    if(pos >= scclNvmlDeviceCount) {
        return scclInvalidArgument;
    }
    *major = scclNvmlDevices[pos].computeCapabilityMajor;
    *minor = scclNvmlDevices[pos].computeCapabilityMinor;
    return scclSuccess;
}
/**
 * @brief Get the P2P status between two devices for a capability index.
 *
 * READ and WRITE indices are answered from the pair table cached by
 * scclNvmlEnsureInitialized(). Any other index is forwarded to NVML under the
 * file-level NVML mutex.
 *
 * @param device1   First device handle.
 * @param device2   Second device handle.
 * @param p2pIndex  Capability index to query.
 * @param p2pStatus Receives the status.
 * @return scclSuccess; scclInvalidArgument if a cached lookup cannot find either
 *         handle; scclSystemError if the forwarded NVML call fails; or the
 *         initialization error.
 */
scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    const bool wantsRead  = (p2pIndex == NVML_P2P_CAPS_INDEX_READ);
    const bool wantsWrite = (p2pIndex == NVML_P2P_CAPS_INDEX_WRITE);
    if(!wantsRead && !wantsWrite) {
        // Not cached: ask NVML directly.
        std::lock_guard<std::mutex> locked(lock);
        NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
        return scclSuccess;
    }

    // Resolve both handles to table indices; the last matching entry wins.
    int first  = -1;
    int second = -1;
    for(int d = 0; d < scclNvmlDeviceCount; d++) {
        if(device1 == scclNvmlDevices[d].handle)
            first = d;
        if(device2 == scclNvmlDevices[d].handle)
            second = d;
    }
    if(first == -1 || second == -1) {
        return scclInvalidArgument;
    }

    if(wantsRead) {
        *p2pStatus = scclNvmlDevicePairs[first][second].p2pStatusRead;
    } else {
        *p2pStatus = scclNvmlDevicePairs[first][second].p2pStatusWrite;
    }
    return scclSuccess;
}
/**
 * @brief Read a batch of NVML field values for a device.
 *
 * Ensures NVML is initialized, then calls nvmlDeviceGetFieldValues through
 * NVMLTRY while holding the file-level NVML mutex.
 *
 * @param device      Device handle.
 * @param valuesCount Number of entries in `values`.
 * @param values      Field value array passed through to NVML.
 * @return scclSuccess, the initialization error, scclInternalError if the symbol
 *         was not loaded, or scclSystemError if the NVML call fails.
 */
scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    std::lock_guard<std::mutex> locked(lock);
    NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
    return scclSuccess;
}
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment