"examples/dreambooth/train_dreambooth_lora_hidream.py" did not exist on "c375903db58826494d858e02b44d21b42669ff5e"
Commit a4ac3320 authored by lishen

Implement ipcsocket via a thread pool to support intra-node communication

parent d9d23f34
......@@ -7,7 +7,7 @@
namespace sccl {
namespace hardware {
namespace net {
namespace device {
namespace net_ib {
#define ASSIGN_SYM(container, symbol, name) container->name = &symbol;
......@@ -102,7 +102,7 @@ scclResult_t buildIbvSymbols(struct scclIbvSymbols* ibvSymbols) {
return scclSuccess;
}
} // namespace device
} // namespace net_ib
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -6,7 +6,7 @@
namespace sccl {
namespace hardware {
namespace net {
namespace device {
namespace net_ib {
/* IB Verbs Function Pointers*/
struct scclIbvSymbols {
......@@ -41,7 +41,7 @@ struct scclIbvSymbols {
/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
scclResult_t buildIbvSymbols(struct scclIbvSymbols* ibvSymbols);
} // namespace device
} // namespace net_ib
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -13,7 +13,7 @@
namespace sccl {
namespace hardware {
namespace net {
namespace device {
namespace net_ib {
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static scclResult_t initResult;
......@@ -250,7 +250,7 @@ scclResult_t wrap_ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struc
return scclSuccess;
}
} // namespace device
} // namespace net_ib
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -10,7 +10,7 @@
namespace sccl {
namespace hardware {
namespace net {
namespace device {
namespace net_ib {
typedef enum ibv_return_enum : uint8_t {
IBV_SUCCESS = 0, //!< The operation was successful
......@@ -112,7 +112,7 @@ scclResult_t wrap_ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struc
// 获取事件类型字符串
scclResult_t wrap_ibv_event_type_str(char** ret, enum ibv_event_type event);
} // namespace device
} // namespace net_ib
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -9,15 +9,12 @@
#include <netdb.h>
#include "net_ib.h"
#include "socket.h"
#include "rocm_wrap.h"
#include "base.h"
namespace sccl {
namespace hardware {
namespace net {
namespace device {
namespace net_ib {
///////////////////////////////////////// 环境变量读取及设置 /////////////////////////////////////////
......@@ -59,11 +56,8 @@ SCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 1);
///////////////////////////////////////// 参数及结构体设置 /////////////////////////////////////////
#define MAXNAMESIZE 64
#define MAX_IF_NAME_SIZE 16
static char scclIbIfName[MAX_IF_NAME_SIZE + 1];
static union host::scclSocketAddress scclIbIfAddr;
// 定义一个静态变量 scclNIbDevs,用于存储 InfiniBand 设备的数量
static int scclNIbDevs = -1;
static char scclIbIfName[MAX_IF_NAME_SIZE + 1]; // 用于存储网络接口名称的字符数组
static union net_socket::scclSocketAddress scclIbIfAddr; // 定义一个联合体类型的变量,用于存储网络接口地址
struct scclIbMr {
uintptr_t addr; // 内存地址
......@@ -117,7 +111,7 @@ pthread_mutex_t scclIbLock = PTHREAD_MUTEX_INITIALIZER;
static int scclIbRelaxedOrderingEnabled = 0;
// 定义一个线程局部变量,用于存储重用的地址信息
static thread_local union host::scclSocketAddress reusedAddr;
static thread_local union net_socket::scclSocketAddress reusedAddr;
// 定义一个线程局部变量,用于存储重用的套接字文件描述符
static thread_local int reusedSockfd = -1;
......@@ -128,7 +122,7 @@ pthread_t scclIbAsyncThread;
// 定义一个常量,表示InfiniBand网络接口的最大接收数量
static constexpr int SCCL_NET_IB_MAX_RECVS = 8;
// 定义一个常量,表示最大字符串长度
static constexpr int MAX_STR_LEN = 8;
static constexpr int MAX_STR_LEN = 255;
// 为每个并发接收支持SCCL_NET_MAX_REQUESTS
static constexpr int MAX_REQUESTS = (SCCL_NET_MAX_REQUESTS * SCCL_NET_IB_MAX_RECVS);
......@@ -146,12 +140,12 @@ scclIbRequest 结构体用于封装 InfiniBand 通信请求的详细信息,包
联合体 union 根据请求类型(发送或接收)存储不同的数据结构,以支持灵活的通信操作。
*/
struct scclIbRequest {
struct scclIbVerbs* verbs; // 指向 scclIbVerbs 结构体的指针,包含 Infiniband 相关的操作
int type; // 请求的类型,例如发送或接收
int events; // 事件标志, 用于记录请求相关的事件状态
struct host::scclSocket* sock; // 指向 scclSocket 结构体的指针,表示网络套接字
struct scclIbGidInfo* gidInfo; // 指向 scclIbGidInfo 结构体的指针,包含全局标识符信息
int nreqs; // 请求的数量
struct scclIbVerbs* verbs; // 指向 scclIbVerbs 结构体的指针,包含 Infiniband 相关的操作
int type; // 请求的类型,例如发送或接收
int events; // 事件标志, 用于记录请求相关的事件状态
struct net_socket::scclSocket* sock; // 指向 scclSocket 结构体的指针,表示网络套接字
struct scclIbGidInfo* gidInfo; // 指向 scclIbGidInfo 结构体的指针,包含全局标识符信息
int nreqs; // 请求的数量
// 联合体,用于存储不同类型请求的特定信息
union {
// send: 发送请求的相关信息
......@@ -195,7 +189,7 @@ struct scclIbSendComm {
struct scclIbRequest* fifoReqs[MAX_REQUESTS][SCCL_NET_IB_MAX_RECVS]; // FIFO请求指针数组
struct ibv_send_wr wrs[SCCL_NET_IB_MAX_RECVS + 1]; // 发送工作请求结构体数组
struct ibv_sge sges[SCCL_NET_IB_MAX_RECVS]; // 散布-聚集元素结构体数组
struct host::scclSocket sock; // 套接字结构体
struct net_socket::scclSocket sock; // 套接字结构体
int ready; // 是否准备好
struct ibv_qp* qps[SCCL_IB_MAX_QPS]; // 队列对指针数组
......@@ -206,33 +200,6 @@ struct scclIbSendComm {
struct scclIbGidInfo gidInfo; // GID信息结构体
};
/*IB的通信状态*/
enum scclIbCommState : uint8_t {
scclIbCommStateStart = 0, // 初始状态
scclIbCommStateConnect = 1, // 尝试连接状态
scclIbCommStateAccept = 3, // 接受连接状态
scclIbCommStateSend = 4, // 发送数据状态
scclIbCommStateRecv = 5, // 接收数据状态
scclIbCommStateConnecting = 6, // 正在连接状态
scclIbCommStateConnected = 7, // 已连接状态
scclIbCommStatePendingReady = 8, // 等待准备状态
};
/*通信的阶段*/
struct scclIbCommStage {
enum scclIbCommState state; // 通信阶段的状态
int offset; // 数据偏移量
void* buffer; // 用于通信的缓冲区指针
void* comm; // 通信对象指针
};
/*监听通信的上下文*/
struct scclIbListenComm {
int dev; // 设备标识符
struct host::scclSocket sock; // 用于网络通信的套接字
struct scclIbCommStage stage; // 通信阶段的状态
};
struct scclIbQpInfo {
uint32_t lid;
uint8_t ib_port;
......@@ -270,7 +237,7 @@ struct scclIbRemFifo {
struct scclIbRecvComm {
struct scclIbVerbs verbs;
struct scclIbRemFifo remFifo;
struct host::scclSocket sock;
struct net_socket::scclSocket sock;
int ready;
struct ibv_qp* qps[SCCL_IB_MAX_QPS];
int nqps;
......@@ -292,7 +259,7 @@ static_assert((offsetof(struct scclIbRecvComm, remFifo) % 32) == 0, "scclIbSendC
* @param args 传入参数,应转换为ibv_context结构体指针
* @return void* 线程返回值,始终返回NULL
*/
static void* scclIbAsyncThreadMain(void* args) {
void* scclNetIb::scclIbAsyncThreadMain(void* args) {
// 将传入的参数转换为InfiniBand上下文结构体指针
struct ibv_context* context = (struct ibv_context*)args;
......@@ -337,7 +304,7 @@ static void* scclIbAsyncThreadMain(void* args) {
* @param realPort 输出参数,记录实际端口号
* @return scclResult_t 返回操作结果,成功返回scclSuccess
*/
static scclResult_t scclIbGetPciPath(char* devName, char** path, int* realPort) {
scclResult_t scclNetIb::scclIbGetPciPath(char* devName, char** path, int* realPort) {
// 定义一个字符数组用于存储设备路径
char devicePath[PATH_MAX];
// 构造设备路径字符串,格式为 "/sys/class/infiniband/<devName>/device"
......@@ -396,14 +363,14 @@ static int firstBitSet(int val, int max) {
* @param width 输入的宽度值
* @return 返回ibvWidths数组中对应的宽度索引值
*/
static int scclIbWidth(int width) { return ibvWidths[firstBitSet(width, sizeof(ibvWidths) / sizeof(int) - 1)]; }
int scclNetIb::scclIbWidth(int width) { return ibvWidths[firstBitSet(width, sizeof(ibvWidths) / sizeof(int) - 1)]; }
/**
* 根据给定的速度值查找并返回对应的IB传输速率
* @param speed 输入的速度值
* @return 返回ibvSpeeds数组中第一个匹配的IB传输速率
*/
static int scclIbSpeed(int speed) { return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds) / sizeof(int) - 1)]; }
int scclNetIb::scclIbSpeed(int speed) { return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds) / sizeof(int) - 1)]; }
/**
* 检查当前IB设备是否支持宽松排序(Relaxed Ordering)模式
......@@ -412,7 +379,7 @@ static int scclIbSpeed(int speed) { return ibvSpeeds[firstBitSet(speed, sizeof(i
* @note 通过查询IBVERBS_1.8 API的ibv_reg_mr_iova2函数来检测IBV_ACCESS_RELAXED_ORDERING支持
* @see scclParamIbPciRelaxedOrdering() 获取当前配置的RO模式
*/
static int scclIbRelaxedOrderingCapable(void) {
int scclNetIb::scclIbRelaxedOrderingCapable(void) {
int roMode = scclParamIbPciRelaxedOrdering();
scclResult_t r = scclInternalError;
if(roMode == 1 || roMode == 2) {
......@@ -432,7 +399,7 @@ static int scclIbRelaxedOrderingCapable(void) {
* @param shownIbHcaEnv 计数器,用于控制日志输出次数
* @return char* 处理后的IB设备环境变量值
*/
static char* scclIbGetIbHca(int& shownIbHcaEnv, bool* searchNot, bool* searchExact) {
char* scclNetIb::scclIbGetIbHca(int& shownIbHcaEnv, bool* searchNot, bool* searchExact) {
// 检查用户是否定义了要使用的IB设备:端口
char* userIbEnv = getenv("SCCL_IB_HCA");
if(userIbEnv != NULL && shownIbHcaEnv++ == 0)
......@@ -463,7 +430,7 @@ static char* scclIbGetIbHca(int& shownIbHcaEnv, bool* searchNot, bool* searchExa
* @note 缓冲区最大长度为MAX_STR_LEN,超出部分会被截断
* 文件内容末尾会自动添加字符串结束符'\0'
*/
scclResult_t scclGetStrFromSys(const char* path, const char* fileName, char* strValue) {
scclResult_t scclNetIb::scclGetStrFromSys(const char* path, const char* fileName, char* strValue) {
char filePath[PATH_MAX];
sprintf(filePath, "%s/%s", path, fileName);
int offset = 0;
......@@ -494,7 +461,7 @@ scclResult_t scclGetStrFromSys(const char* path, const char* fileName, char* str
* @param ibDev IB设备号
* @return scclResult_t 返回scclSuccess表示支持,返回scclSystemError表示不支持
*/
scclResult_t scclIbGdrSupport(int ibDev) {
scclResult_t scclNetIb::scclIbGdrSupport(int ibDev) {
static int moduleLoaded = -1;
if(moduleLoaded == -1) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
......@@ -527,7 +494,7 @@ scclResult_t scclIbGdrSupport(int ibDev) {
* @param dev 设备索引
* @return scclResult_t 返回scclSuccess表示支持,scclSystemError表示不支持
*/
scclResult_t scclIbDmaBufSupport(int dev) {
scclResult_t scclNetIb::scclIbDmaBufSupport(int dev) {
static int dmaBufSupported = -1;
if(dmaBufSupported == -1) {
scclResult_t res;
......@@ -552,9 +519,9 @@ failure:
}
struct scclIbHandle {
union host::scclSocketAddress connectAddr; // Filled by the target (目标填充)
uint64_t magic; // random number to help debugging (用于调试的随机数)
struct scclIbCommStage stage; // Used by the other side when connecting (连接时由另一侧使用)
union net_socket::scclSocketAddress connectAddr; // Filled by the target (目标填充)
uint64_t magic; // random number to help debugging (用于调试的随机数)
struct scclIbCommStage stage; // Used by the other side when connecting (连接时由另一侧使用)
};
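// This handle is what listen() fills in and what the peer later feeds to connect(); listen()
// static_asserts that it fits within SCCL_NET_HANDLE_MAXSIZE so it can travel through the
// generic out-of-band handle buffer.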
/**
......@@ -572,7 +539,7 @@ struct scclIbHandle {
* @note 该函数会递增设备的PD引用计数,并在首次调用时为设备分配PD
* @note 创建的CQ大小为2*MAX_REQUESTS*IB_QPS_PER_CONNECTION,以支持接收请求的双重完成
*/
scclResult_t scclIbInitVerbs(int dev, struct ibv_context* ctx, struct scclIbVerbs* verbs) {
scclResult_t scclNetIb::scclIbInitVerbs(int dev, struct ibv_context* ctx, struct scclIbVerbs* verbs) {
verbs->dev = dev;
pthread_mutex_lock(&scclIbDevs[dev].lock);
......@@ -593,7 +560,20 @@ scclResult_t scclIbInitVerbs(int dev, struct ibv_context* ctx, struct scclIbVerb
return scclSuccess;
}
scclResult_t scclIbCreateQp(uint8_t ib_port, struct scclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
/**
* 创建并初始化一个InfiniBand队列对(QP)
*
* @param ib_port IB端口号
* @param verbs IB verbs结构体指针
* @param access_flags QP访问权限标志
* @param qp 输出的QP指针
*
* @return 返回scclSuccess表示成功,否则返回错误码
*
* @note QP类型为可靠连接(RC),发送队列大小为2*MAX_REQUESTS,
* 接收队列大小为MAX_REQUESTS,支持内联数据发送(如果配置启用)
*/
scclResult_t scclNetIb::scclIbCreateQp(uint8_t ib_port, struct scclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
struct ibv_qp_init_attr qpInitAttr;
memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
qpInitAttr.send_cq = verbs->cq;
......@@ -616,7 +596,20 @@ scclResult_t scclIbCreateQp(uint8_t ib_port, struct scclIbVerbs* verbs, int acce
return scclSuccess;
}
scclResult_t scclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct scclIbQpInfo* info) {
/**
* 将IB QP状态修改为RTR(Ready to Receive)状态
*
* @param qp IB QP指针
* @param qpn 目标QP号
* @param info QP配置信息,包含MTU、链路层类型、端口号等参数
*
* @return 成功返回scclSuccess,失败返回错误码
*
* @note 根据链路层类型(以太网/IB)设置不同的AH属性
* 以太网需要设置全局路由头(GRH)相关参数
* IB链路需要设置目标LID
*/
scclResult_t scclNetIb::scclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct scclIbQpInfo* info) {
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_RTR;
......@@ -645,7 +638,16 @@ scclResult_t scclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct scclIbQpInfo* i
return scclSuccess;
}
scclResult_t scclIbRtsQp(struct ibv_qp* qp) {
/**
* 将IB(InfiniBand)队列对(QP)状态修改为RTS(Ready To Send)状态
*
* @param qp IB队列对指针
* @return 成功返回scclSuccess,失败返回错误码
*
* 该函数配置QP属性并调用ibv_modify_qp将其状态改为RTS状态,
* 设置了超时时间、重试次数、RNR重试次数、SQ PSN和最大RD原子操作数等参数。
*/
scclResult_t scclNetIb::scclIbRtsQp(struct ibv_qp* qp) {
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_RTS;
......@@ -670,7 +672,17 @@ const char* reqTypeStr[] = {"Unused", "Send", "Recv", "Flush"};
static_assert((offsetof(struct scclIbSendComm, fifo) % 32) == 0, "scclIbSendComm fifo must be 32-byte aligned");
static_assert((sizeof(struct scclIbSendFifo) % 32) == 0, "scclIbSendFifo element size must be 32-byte multiples");
scclResult_t scclIbDestroyVerbs(struct scclIbVerbs* verbs) {
/**
* @brief 销毁IB Verbs资源
*
* 释放指定的IB Verbs资源,包括完成队列(CQ)和保护域(PD)。
* 当PD的引用计数减至0时,会自动释放PD资源。
* 该函数是线程安全的,使用互斥锁保护共享资源。
*
* @param verbs 指向要销毁的IB Verbs结构体
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
scclResult_t scclNetIb::scclIbDestroyVerbs(struct scclIbVerbs* verbs) {
scclResult_t res;
SCCLCHECK(wrap_ibv_destroy_cq(verbs->cq));
......@@ -684,7 +696,17 @@ returning:
return res;
}
scclResult_t scclIbGetRequest(struct scclIbVerbs* verbs, struct scclIbRequest** req) {
/**
* @brief 从verbs请求池中获取一个未使用的请求结构体
*
* @param verbs 指向scclIbVerbs结构体的指针,包含请求池
* @param req 输出参数,用于返回获取到的请求结构体指针
* @return scclResult_t 成功返回scclSuccess,失败返回scclInternalError
*
* 该函数遍历verbs请求池,查找第一个未使用的请求(SCCL_NET_IB_REQ_UNUSED),
* 初始化其字段后返回。如果所有请求都在使用中,则返回错误。
*/
scclResult_t scclNetIb::scclIbGetRequest(struct scclIbVerbs* verbs, struct scclIbRequest** req) {
for(int i = 0; i < MAX_REQUESTS; i++) {
struct scclIbRequest* r = verbs->reqs + i;
if(r->type == SCCL_NET_IB_REQ_UNUSED) {
......@@ -700,14 +722,36 @@ scclResult_t scclIbGetRequest(struct scclIbVerbs* verbs, struct scclIbRequest**
*req = NULL;
return scclInternalError;
}
scclResult_t scclIbFreeRequest(struct scclIbRequest* r) {
/**
* 释放IB网络请求资源。
*
* 将请求类型标记为未使用状态,但不实际释放内存。
*
* @param r 要释放的IB网络请求指针
* @return 总是返回scclSuccess表示操作成功
*/
scclResult_t scclNetIb::scclIbFreeRequest(struct scclIbRequest* r) {
r->type = SCCL_NET_IB_REQ_UNUSED;
return scclSuccess;
}
scclResult_t scclIbTest(void* request, int* done, int* size);
scclResult_t scclIbMultiSend(struct scclIbSendComm* comm, int slot) {
/**
* @brief 执行IB网络的多发送操作
*
* 该函数处理IB网络的多发送请求,包括设置发送工作请求(WR)和分散/聚集元素(SGE),
* 并处理自适应路由(AR)和QP分割等高级功能。
*
* @param comm 指向scclIbSendComm结构的指针,包含发送通信上下文
* @param slot 要使用的发送槽位索引
* @return scclResult_t 返回操作结果,成功返回scclSuccess,失败返回错误码
*
* @note 1. 支持多QP分割发送,确保128B对齐
* 2. 使用RDMA_WRITE_WITH_IMM发送立即数据
* 3. 当请求数>32时会返回错误
* 4. 自适应路由模式下会发送两次WR
*/
scclResult_t scclNetIb::scclIbMultiSend(struct scclIbSendComm* comm, int slot) {
struct scclIbRequest** reqs = comm->fifoReqs[slot];
volatile struct scclIbSendFifo* slots = comm->fifo[slot];
int nreqs = slots[0].nreqs;
......@@ -792,7 +836,23 @@ scclResult_t scclIbMultiSend(struct scclIbSendComm* comm, int slot) {
return scclSuccess;
}
scclResult_t scclIbPostFifo(struct scclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct scclIbRequest* req) {
/**
* @brief 通过IB Verbs RDMA写入操作向远程FIFO队列提交数据
*
* @param comm 指向接收通信上下文的指针
* @param n 要发送的数据块数量
* @param data 数据指针数组
* @param sizes 数据大小数组
* @param tags 数据标签数组
* @param mhandles 内存句柄数组
* @param req 请求结构体指针
* @return scclResult_t 返回操作结果(scclSuccess表示成功)
*
* @note 该函数会将数据打包到本地FIFO元素中,并通过RDMA写入到远程FIFO队列。
* 每MAX_REQUESTS次操作会触发一次带信号(SIGNALED)的发送,以避免发送队列堵塞。
* 使用IBV_WR_RDMA_WRITE操作码进行数据传输。
*/
scclResult_t scclNetIb::scclIbPostFifo(struct scclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct scclIbRequest* req) {
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
......@@ -852,10 +912,15 @@ scclResult_t scclIbPostFifo(struct scclIbRecvComm* comm, int n, void** data, int
return scclSuccess;
}
} // namespace net_ib
////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// scclNetIb调用的函数 ////////////////////////////////////////
namespace net_ib {
scclNetIb::scclNetIb() : scclNetBase("IB") {}
scclNetIb::~scclNetIb() {
if(ibComm != nullptr) {
free(ibComm);
}
}
/**
* @brief 初始化InfiniBand硬件设备
......@@ -872,7 +937,9 @@ namespace net_ib {
* @note 函数内部会处理环境变量SCCL_IB_HCA来过滤特定设备
* @note 使用互斥锁scclIbLock保证线程安全
*/
scclResult_t scclIbInit(void) {
scclResult_t scclNetIb::init() {
SCCLCHECK(scclCalloc(&ibComm, 1));
// 如果IB被禁用,返回内部错误
if(scclParamIbDisable())
return scclInternalError;
......@@ -894,7 +961,7 @@ scclResult_t scclIbInit(void) {
if(scclNIbDevs == -1) {
scclNIbDevs = 0;
// 查找网络接口
if(host::scclFindSocketInterfaces(scclIbIfName, &scclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
if(net_socket::scclFindSocketInterfaces(scclIbIfName, &scclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
WARN("NET/IB : No IP interface found.");
return scclInternalError;
}
......@@ -1042,14 +1109,14 @@ scclResult_t scclIbInit(void) {
// line 是设备的相关信息字符串
// scclIbRelaxedOrderingEnabled 是一个布尔值,指示是否启用了Relaxed Ordering
// scclIbIfName 是IB接口的名称
// host::scclSocketToString 是一个函数,用于将socket地址转换为字符串
// net_socket::scclSocketToString 是一个函数,用于将socket地址转换为字符串
// addrline 是存储转换后地址字符串的数组
INFO(SCCL_LOG_NET,
"NET/IB : Using%s %s; OOB %s:%s",
line,
scclIbRelaxedOrderingEnabled ? "[RO]" : "",
scclIbIfName,
host::scclSocketToString(&scclIbIfAddr, addrline));
net_socket::scclSocketToString(&scclIbIfAddr, addrline));
}
pthread_mutex_unlock(&scclIbLock);
}
......@@ -1062,7 +1129,7 @@ scclResult_t scclIbInit(void) {
* @param ndev [out] 用于存储设备数量的指针
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
scclResult_t scclIbGetDevicesNum(int* ndev) {
scclResult_t scclNetIb::devices(int* ndev) {
*ndev = scclNIbDevs;
return scclSuccess;
}
......@@ -1077,10 +1144,11 @@ scclResult_t scclIbGetDevicesNum(int* ndev) {
* @param props 用于存储设备属性的结构体指针
* @return scclResult_t 返回操作结果,成功返回scclSuccess
*/
scclResult_t scclIbGetProperties(int dev, scclNetProperties_t* props) {
props->name = scclIbDevs[dev].devName;
props->pciPath = scclIbDevs[dev].pciPath;
props->guid = scclIbDevs[dev].guid;
scclResult_t scclNetIb::getProperties(int dev, scclNetProperties_t* props) {
props->name = scclIbDevs[dev].devName;
props->pciPath = scclIbDevs[dev].pciPath;
props->guid = scclIbDevs[dev].guid;
props->ptrSupport = SCCL_PTR_HOST;
if(scclIbGdrSupport(dev) == scclSuccess) {
props->ptrSupport |= SCCL_PTR_CUDA; // GDR support via nv_peermem
......@@ -1111,41 +1179,60 @@ scclResult_t scclIbGetProperties(int dev, scclNetProperties_t* props) {
* 3. 根据配置决定是否复用套接字
* 4. 启动套接字监听并获取连接地址
*/
scclResult_t scclIbListen(int dev, void* opaqueHandle, void** listenComm) {
// 创建并初始化通信结构体
struct scclIbListenComm* comm;
SCCLCHECK(scclCalloc(&comm, 1));
scclResult_t scclNetIb::listen(int dev, void* opaqueHandle, void** listenComm) {
memset(ibComm, 0, sizeof(struct scclIbListenComm));
struct scclIbHandle* handle = (struct scclIbHandle*)opaqueHandle;
// 静态断言,确保 scclIbHandle 结构体的大小不超过 SCCL_NET_HANDLE_MAXSIZE
static_assert(sizeof(struct scclIbHandle) < SCCL_NET_HANDLE_MAXSIZE, "scclIbHandle size too large");
// 将 handle 指向的内存区域清零,大小为 scclIbHandle 结构体的大小
memset(handle, 0, sizeof(struct scclIbHandle));
// 设置设备和处理句柄
comm->dev = dev;
ibComm->dev = dev;
handle->magic = SCCL_SOCKET_MAGIC;
SCCLCHECK(host::scclSocketInit(&comm->sock, &scclIbIfAddr, handle->magic, host::scclSocketTypeNetIb, NULL, 1));
SCCLCHECK(net_socket::scclSocketInit(&ibComm->sock, &scclIbIfAddr, handle->magic, net_socket::scclSocketTypeNetIb, NULL, 1));
// 如果启用了端口复用,则复用套接字地址和文件描述符
if(scclParamIbSockServerPortReuse()) {
if(reusedSockfd == -1) {
SCCLCHECK(scclSocketListen(&comm->sock));
memcpy(&reusedAddr, &comm->sock.addr, sizeof(union host::scclSocketAddress));
reusedSockfd = comm->sock.fd;
SCCLCHECK(scclSocketListen(&ibComm->sock));
memcpy(&reusedAddr, &ibComm->sock.addr, sizeof(union net_socket::scclSocketAddress));
reusedSockfd = ibComm->sock.fd;
} else {
memcpy(&comm->sock.addr, &reusedAddr, sizeof(union host::scclSocketAddress));
comm->sock.fd = reusedSockfd;
memcpy(&ibComm->sock.addr, &reusedAddr, sizeof(union net_socket::scclSocketAddress));
ibComm->sock.fd = reusedSockfd;
}
} else {
SCCLCHECK(host::scclSocketListen(&comm->sock));
SCCLCHECK(net_socket::scclSocketListen(&ibComm->sock));
}
// 获取套接字地址并设置监听通信
SCCLCHECK(host::scclSocketGetAddr(&comm->sock, &handle->connectAddr));
*listenComm = comm;
SCCLCHECK(net_socket::scclSocketGetAddr(&ibComm->sock, &handle->connectAddr));
*listenComm = ibComm;
return scclSuccess;
}
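// Server-side port reuse above works through the thread_local reusedAddr/reusedSockfd pair:
// the first listen() on a thread binds and records its socket, and subsequent listen() calls
// on that thread hand back the same address and fd instead of opening a new listening port.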
scclResult_t scclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
/**
* @brief 建立IB网络连接并初始化通信资源
*
* 该函数负责完成以下操作:
* 1. 初始化socket连接
* 2. 创建IB QP队列对
* 3. 交换QP信息
* 4. 完成QP状态转换(RTR/RTS)
* 5. 注册内存区域
*
* @param dev 设备索引
* @param opaqueHandle 包含连接信息的句柄
* @param sendComm 输出参数,返回建立的发送通信上下文
* @return scclResult_t 返回操作结果状态码
*
* @note 该函数使用状态机模式处理异步连接过程
* @warning 不能重复连接已建立的sendComm
*/
scclResult_t scclNetIb::connect(int dev, void* opaqueHandle, void** sendComm) {
struct scclIbHandle* handle = (struct scclIbHandle*)opaqueHandle;
struct scclIbCommStage* stage = &handle->stage;
struct scclIbSendComm* comm = (struct scclIbSendComm*)stage->comm;
......@@ -1166,14 +1253,14 @@ scclResult_t scclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
}
SCCLCHECK(scclIbMalloc((void**)&comm, sizeof(struct scclIbSendComm)));
SCCLCHECK(host::scclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, host::scclSocketTypeNetIb, NULL, 1));
SCCLCHECK(net_socket::scclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, net_socket::scclSocketTypeNetIb, NULL, 1));
stage->comm = comm;
stage->state = scclIbCommStateConnect;
SCCLCHECK(host::scclSocketConnect(&comm->sock, scclParamIbSockClientPortReuse()));
SCCLCHECK(net_socket::scclSocketConnect(&comm->sock, scclParamIbSockClientPortReuse()));
ib_connect_check:
/* since scclSocketConnect is async, we must check if connection is complete */
SCCLCHECK(host::scclSocketReady(&comm->sock, &ready));
SCCLCHECK(net_socket::scclSocketReady(&comm->sock, &ready));
if(!ready)
return scclSuccess;
......@@ -1292,7 +1379,7 @@ ib_send_ready:
* @param recvComm 输出参数,接收通信句柄
* @return scclResult_t 返回操作结果,成功返回scclSuccess
*/
scclResult_t scclIbAccept(void* listenComm, void** recvComm) {
scclResult_t scclNetIb::accept(void* listenComm, void** recvComm) {
struct scclIbListenComm* lComm = (struct scclIbListenComm*)listenComm;
struct scclIbCommStage* stage = &lComm->stage;
struct scclIbRecvComm* rComm = (struct scclIbRecvComm*)stage->comm;
......@@ -1315,11 +1402,11 @@ scclResult_t scclIbAccept(void* listenComm, void** recvComm) {
SCCLCHECK(scclIbMalloc((void**)&rComm, sizeof(struct scclIbRecvComm)));
stage->comm = rComm;
stage->state = scclIbCommStateAccept;
SCCLCHECK(host::scclSocketInit(&rComm->sock));
SCCLCHECK(host::scclSocketAccept(&rComm->sock, &lComm->sock));
SCCLCHECK(net_socket::scclSocketInit(&rComm->sock));
SCCLCHECK(net_socket::scclSocketAccept(&rComm->sock, &lComm->sock));
ib_accept_check:
SCCLCHECK(host::scclSocketReady(&rComm->sock, &ready));
SCCLCHECK(net_socket::scclSocketReady(&rComm->sock, &ready));
if(!ready)
return scclSuccess;
......@@ -1329,7 +1416,7 @@ ib_accept_check:
SCCLCHECK(scclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
ib_recv:
SCCLCHECK(host::scclSocketProgress(SCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
SCCLCHECK(net_socket::scclSocketProgress(SCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
if(stage->offset != sizeof(remQpInfo))
return scclSuccess;
......@@ -1416,7 +1503,7 @@ ib_recv:
memcpy(stage->buffer, &qpInfo, sizeof(struct scclIbQpInfo));
ib_send:
SCCLCHECK(host::scclSocketProgress(SCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct scclIbQpInfo), &stage->offset));
SCCLCHECK(net_socket::scclSocketProgress(SCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct scclIbQpInfo), &stage->offset));
if(stage->offset < sizeof(struct scclIbQpInfo))
return scclSuccess;
......@@ -1424,7 +1511,7 @@ ib_send:
stage->state = scclIbCommStatePendingReady;
ib_recv_ready:
SCCLCHECK(host::scclSocketProgress(SCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset));
SCCLCHECK(net_socket::scclSocketProgress(SCCL_SOCKET_RECV, &rComm->sock, &rComm->ready, sizeof(int), &stage->offset));
if(stage->offset != sizeof(int))
return scclSuccess;
......@@ -1440,7 +1527,7 @@ ib_recv_ready:
}
/* DMA-BUF support */
scclResult_t scclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
scclResult_t scclNetIb::regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
static_assert(offsetof(struct scclIbSendComm, verbs) == offsetof(struct scclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
assert(size > 0);
......@@ -1498,11 +1585,21 @@ returning:
return res;
}
scclResult_t scclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
return scclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle);
scclResult_t scclNetIb::regMr(void* comm, void* data, int size, int type, void** mhandle) {
return regMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle);
}
scclResult_t scclIbDeregMr(void* comm, void* mhandle) {
/**
* @brief 注销IB内存区域(MR)
*
* 该函数用于注销指定的IB内存区域(MR),并更新MR缓存。如果MR的引用计数减至0,
* 则从缓存中移除并调用ibv_dereg_mr释放资源。
*
* @param comm 通信上下文指针
* @param mhandle 要注销的内存区域句柄
* @return scclResult_t 返回操作结果(scclSuccess表示成功)
*/
scclResult_t scclNetIb::deregMr(void* comm, void* mhandle) {
struct scclIbVerbs* verbs = (struct scclIbVerbs*)comm;
struct scclIbMrCache* cache = &scclIbDevs[verbs->dev].mrCache;
scclResult_t res;
......@@ -1529,10 +1626,10 @@ returning:
return res;
}
scclResult_t scclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
scclResult_t scclNetIb::isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct scclIbSendComm* comm = (struct scclIbSendComm*)sendComm;
if(comm->ready == 0) {
WARN("NET/IB: scclIbIsend() called when comm->ready == 0");
WARN("NET/IB: isend() called when comm->ready == 0");
return scclInternalError;
}
if(comm->ready == 0) {
......@@ -1567,26 +1664,26 @@ scclResult_t scclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
// Sanity checks to catch user collective call count/size mismatches
if(size > slots[r].size) {
char line[SOCKET_NAME_MAXLEN + 1];
union host::scclSocketAddress addr;
host::scclSocketGetAddr(&comm->sock, &addr);
union net_socket::scclSocketAddress addr;
net_socket::scclSocketGetAddr(&comm->sock, &addr);
WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d",
r,
nreqs,
tag,
host::scclSocketToString(&addr, line),
net_socket::scclSocketToString(&addr, line),
size,
slots[r].size);
return scclInvalidUsage;
} // plus any potential programming errors
else if(slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
char line[SOCKET_NAME_MAXLEN + 1];
union host::scclSocketAddress addr;
host::scclSocketGetAddr(&comm->sock, &addr);
union net_socket::scclSocketAddress addr;
net_socket::scclSocketGetAddr(&comm->sock, &addr);
WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
r,
nreqs,
tag,
host::scclSocketToString(&addr, line),
net_socket::scclSocketToString(&addr, line),
slots[r].size,
slots[r].addr,
slots[r].rkey);
......@@ -1626,10 +1723,10 @@ scclResult_t scclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
return scclSuccess;
}
scclResult_t scclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
scclResult_t scclNetIb::irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct scclIbRecvComm* comm = (struct scclIbRecvComm*)recvComm;
if(comm->ready == 0) {
WARN("NET/IB: scclIbIrecv() called when comm->ready == 0");
WARN("NET/IB: irecv() called when comm->ready == 0");
return scclInternalError;
}
if(comm->ready == 0) {
......@@ -1672,7 +1769,7 @@ scclResult_t scclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
return scclSuccess;
}
scclResult_t scclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
scclResult_t scclNetIb::iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
struct scclIbRecvComm* comm = (struct scclIbRecvComm*)recvComm;
int last = -1;
for(int i = 0; i < n; i++)
......@@ -1706,7 +1803,7 @@ scclResult_t scclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
return scclSuccess;
}
scclResult_t scclIbTest(void* request, int* done, int* sizes) {
scclResult_t scclNetIb::test(void* request, int* done, int* sizes) {
struct scclIbRequest* r = (struct scclIbRequest*)request;
*done = 0;
......@@ -1732,8 +1829,8 @@ scclResult_t scclIbTest(void* request, int* done, int* sizes) {
struct ibv_wc* wc = wcs + w;
if(wc->status != IBV_WC_SUCCESS) {
char line[SOCKET_NAME_MAXLEN + 1];
union host::scclSocketAddress addr;
host::scclSocketGetAddr(r->sock, &addr);
union net_socket::scclSocketAddress addr;
net_socket::scclSocketGetAddr(r->sock, &addr);
char localGidString[INET6_ADDRSTRLEN] = "";
char remoteGidString[INET6_ADDRSTRLEN] = "";
const char *localGidStr = NULL, *remoteGidStr = NULL;
......@@ -1742,7 +1839,7 @@ scclResult_t scclIbTest(void* request, int* done, int* sizes) {
remoteGidStr = inet_ntop(AF_INET6, &r->gidInfo->remoteGid, remoteGidString, sizeof(remoteGidString));
}
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d (%s)%s%s%s%s",
host::scclSocketToString(&addr, line),
net_socket::scclSocketToString(&addr, line),
wc->status,
wc->opcode,
wc->byte_len,
......@@ -1782,10 +1879,10 @@ scclResult_t scclIbTest(void* request, int* done, int* sizes) {
}
}
scclResult_t scclIbCloseSend(void* sendComm) {
scclResult_t scclNetIb::closeSend(void* sendComm) {
struct scclIbSendComm* comm = (struct scclIbSendComm*)sendComm;
if(comm) {
SCCLCHECK(host::scclSocketClose(&comm->sock));
SCCLCHECK(net_socket::scclSocketClose(&comm->sock));
for(int q = 0; q < comm->nqps; q++)
if(comm->qps[q] != NULL)
SCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
......@@ -1797,11 +1894,11 @@ scclResult_t scclIbCloseSend(void* sendComm) {
return scclSuccess;
}
scclResult_t scclIbCloseRecv(void* recvComm) {
scclResult_t scclNetIb::closeRecv(void* recvComm) {
struct scclIbRecvComm* comm = (struct scclIbRecvComm*)recvComm;
if(comm) {
if(!scclParamIbSockServerPortReuse() || reusedSockfd != comm->sock.fd)
SCCLCHECK(host::scclSocketClose(&comm->sock));
SCCLCHECK(net_socket::scclSocketClose(&comm->sock));
for(int q = 0; q < comm->nqps; q++)
if(comm->qps[q] != NULL)
SCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
......@@ -1819,36 +1916,16 @@ scclResult_t scclIbCloseRecv(void* recvComm) {
return scclSuccess;
}
scclResult_t scclIbCloseListen(void* listenComm) {
scclResult_t scclNetIb::closeListen(void* listenComm) {
struct scclIbListenComm* comm = (struct scclIbListenComm*)listenComm;
if(comm) {
SCCLCHECK(host::scclSocketClose(&comm->sock));
SCCLCHECK(net_socket::scclSocketClose(&comm->sock));
free(comm);
}
return scclSuccess;
}
} // namespace net_ib
scclNet_t scclNetIb = {"IB",
net_ib::scclIbInit,
net_ib::scclIbGetDevicesNum,
net_ib::scclIbGetProperties,
net_ib::scclIbListen,
net_ib::scclIbConnect,
net_ib::scclIbAccept,
net_ib::scclIbRegMr,
net_ib::scclIbRegMrDmaBuf,
net_ib::scclIbDeregMr,
net_ib::scclIbIsend,
net_ib::scclIbIrecv,
net_ib::scclIbIflush,
net_ib::scclIbTest,
net_ib::scclIbCloseSend,
net_ib::scclIbCloseRecv,
net_ib::scclIbCloseListen};
} // namespace device
} // namespace net
} // namespace hardware
} // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "ibvwrap.h"
#include "socket.h"
#include "net_utils.h"
namespace sccl {
namespace hardware {
namespace net {
namespace net_ib {
/*IB的通信状态*/
enum scclIbCommState : uint8_t {
scclIbCommStateStart = 0, // 初始状态
scclIbCommStateConnect = 1, // 尝试连接状态
scclIbCommStateAccept = 3, // 接受连接状态
scclIbCommStateSend = 4, // 发送数据状态
scclIbCommStateRecv = 5, // 接收数据状态
scclIbCommStateConnecting = 6, // 正在连接状态
scclIbCommStateConnected = 7, // 已连接状态
scclIbCommStatePendingReady = 8, // 等待准备状态
};
/*通信的阶段*/
struct scclIbCommStage {
enum scclIbCommState state; // 通信阶段的状态
int offset; // 数据偏移量
void* buffer; // 用于通信的缓冲区指针
void* comm; // 通信对象指针
};
/*监听通信的上下文*/
struct scclIbListenComm {
int dev; // 设备标识符
struct net_socket::scclSocket sock; // 用于网络通信的套接字
struct scclIbCommStage stage; // 通信阶段的状态
};
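// With this refactor the listen context is no longer allocated per listen() call: scclNetIb
// keeps a single scclIbListenComm (the ibComm member of the class below), allocated in init()
// and re-initialized by listen(), which is why this struct moved out of the .cpp into the header.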
//////////////////////////////////
class scclNetIb : public scclNetBase {
public:
// 构造函数和析构函数
scclNetIb();
virtual ~scclNetIb();
// 初始化网络。
scclResult_t init() override;
// 返回适配器的数量。
scclResult_t devices(int* ndev) override;
// 获取各种设备属性。
scclResult_t getProperties(int dev, scclNetProperties_t* props) override;
// 创建一个接收对象并提供一个句柄以连接到它。该句柄最多可以是 SCCL_NET_HANDLE_MAXSIZE 字节,并将在排名之间交换以创建连接。
scclResult_t listen(int dev, void* handle, void** listenComm) override;
// 连接到一个句柄并返回一个发送 comm 对象给该对等体。
// 此调用不应阻塞以建立连接,而应成功返回 sendComm == NULL,并期望再次调用直到 sendComm != NULL。
scclResult_t connect(int dev, void* handle, void** sendComm) override;
// 在远程对等体调用 connect 后最终确定连接建立。
// 此调用不应阻塞以建立连接,而应成功返回 recvComm == NULL,并期望再次调用直到 recvComm != NULL。
scclResult_t accept(void* listenComm, void** recvComm) override;
// 注册/注销内存。Comm 可以是 sendComm 或 recvComm。
// 类型是 SCCL_PTR_HOST 或 SCCL_PTR_CUDA。
scclResult_t regMr(void* comm, void* data, int size, int type, void** mhandle) override;
/* DMA-BUF 支持 */
scclResult_t regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) override;
// 注销IB内存区域(MR)
scclResult_t deregMr(void* comm, void* mhandle) override;
// 异步发送到对等体。
// 如果调用不能执行(或会阻塞),则可能返回 request == NULL
scclResult_t isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) override;
// 异步从对等体接收。 如果调用不能执行(或会阻塞),则可能返回 request == NULL
scclResult_t irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) override;
// 执行刷新/栅栏操作,以确保所有使用 SCCL_PTR_CUDA 接收到的数据对 GPU 可见
scclResult_t iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) override;
// 测试请求是否完成。如果 size 不为 NULL,则返回发送/接收的字节数。
scclResult_t test(void* request, int* done, int* sizes) override;
// 关闭并释放 send/recv comm 对象
scclResult_t closeSend(void* sendComm) override;
scclResult_t closeRecv(void* recvComm) override;
scclResult_t closeListen(void* listenComm) override;
private:
struct scclIbListenComm* ibComm = nullptr;
// 定义一个静态变量 scclNIbDevs,用于存储 InfiniBand 设备的数量
int scclNIbDevs = -1;
private:
// IB异步事件处理线程主函数
static void* scclIbAsyncThreadMain(void* args);
// 获取IB设备的PCI路径并处理多端口和虚拟功能合并
scclResult_t scclIbGetPciPath(char* devName, char** path, int* realPort);
// 根据输入的宽度值,返回对应的IB(InfiniBand)链路宽度索引
int scclIbWidth(int width);
// 根据给定的速度值查找并返回对应的IB传输速率
int scclIbSpeed(int speed);
// 检查当前IB设备是否支持宽松排序(Relaxed Ordering)模式
int scclIbRelaxedOrderingCapable(void);
// 获取并处理用户指定的IB设备环境变量
char* scclIbGetIbHca(int& shownIbHcaEnv, bool* searchNot, bool* searchExact);
// 从系统文件中读取字符串内容
scclResult_t scclGetStrFromSys(const char* path, const char* fileName, char* strValue);
// 检查IB设备是否支持GPU Direct RDMA (GDR)
scclResult_t scclIbGdrSupport(int ibDev);
// 检查设备是否支持DMA-BUF功能
scclResult_t scclIbDmaBufSupport(int dev);
// 初始化InfiniBand Verbs资源
scclResult_t scclIbInitVerbs(int dev, struct ibv_context* ctx, struct scclIbVerbs* verbs);
// 创建并初始化一个InfiniBand队列对(QP)
scclResult_t scclIbCreateQp(uint8_t ib_port, struct scclIbVerbs* verbs, int access_flags, struct ibv_qp** qp);
// 将IB QP状态修改为RTR(Ready to Receive)状态
scclResult_t scclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct scclIbQpInfo* info);
// 将IB(InfiniBand)队列对(QP)状态修改为RTS(Ready To Send)状态
scclResult_t scclIbRtsQp(struct ibv_qp* qp);
// 销毁IB Verbs资源
scclResult_t scclIbDestroyVerbs(struct scclIbVerbs* verbs);
// 从verbs请求池中获取一个未使用的请求结构体
scclResult_t scclIbGetRequest(struct scclIbVerbs* verbs, struct scclIbRequest** req);
// 释放IB网络请求资源。
scclResult_t scclIbFreeRequest(struct scclIbRequest* r);
// 执行IB网络的多发送操作
scclResult_t scclIbMultiSend(struct scclIbSendComm* comm, int slot);
// 通过IB Verbs RDMA写入操作向远程FIFO队列提交数据
scclResult_t scclIbPostFifo(struct scclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct scclIbRequest* req);
};
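// Usage sketch (illustrative, not part of this commit): the intended calling pattern for the
// interface above, shown for the sending side. connect() and isend() are non-blocking and may
// leave their output pointer NULL, so both are polled, as is test(). The function name is made
// up for illustration; SCCL_PTR_HOST is assumed to be provided by the headers included above,
// and error handling is reduced to early returns.
inline scclResult_t scclNetIbSendExample(scclNetIb& net, int dev, void* peerHandle, void* buf, int size) {
    // peerHandle is the SCCL_NET_HANDLE_MAXSIZE blob produced by the peer's listen().
    void* sendComm = nullptr;
    while(sendComm == nullptr) { // resumes the connect state machine kept in the handle's stage
        scclResult_t res = net.connect(dev, peerHandle, &sendComm);
        if(res != scclSuccess) return res;
    }
    void* mhandle = nullptr;
    scclResult_t res = net.regMr(sendComm, buf, size, SCCL_PTR_HOST, &mhandle);
    if(res != scclSuccess) return res;
    void* request = nullptr;
    while(request == nullptr) { // NULL until the receiver has posted a matching irecv
        res = net.isend(sendComm, buf, size, /*tag=*/0, mhandle, &request);
        if(res != scclSuccess) return res;
    }
    int done = 0, sentSize = 0;
    while(!done) { // poll the request to completion
        res = net.test(request, &done, &sentSize);
        if(res != scclSuccess) return res;
    }
    res = net.deregMr(sendComm, mhandle);
    if(res != scclSuccess) return res;
    return net.closeSend(sendComm);
}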
} // namespace net_ib
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -9,8 +9,6 @@
namespace sccl {
namespace hardware {
namespace net {
namespace host {
namespace net_socket {
#define MAX_LINE_LEN (2047)
......@@ -26,7 +24,28 @@ static struct scclNetSocketDev scclNetSocketDevs[MAX_IFS];
pthread_mutex_t scclNetSocketLock = PTHREAD_MUTEX_INITIALIZER;
static scclResult_t scclNetSocketGetPciPath(char* devName, char** pciPath) {
SCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
SCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// scclNetSocket调用的函数 ////////////////////////////////////////
scclNetSocket::scclNetSocket() : scclNetBase("Socket") {}
scclNetSocket::~scclNetSocket() {
if(socketComm != nullptr) {
free(socketComm);
}
}
/**
* 获取网络设备的PCI路径
*
* @param devName 网络设备名称
* @param pciPath 输出参数,用于存储PCI路径的指针
* @return 返回操作结果(scclSuccess表示成功)
*
* @note 如果设备不存在,pciPath可能返回NULL
*/
scclResult_t scclNetSocket::scclNetSocketGetPciPath(char* devName, char** pciPath) {
char devicePath[PATH_MAX];
snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName);
// May return NULL if the file doesn't exist.
......@@ -34,7 +53,9 @@ static scclResult_t scclNetSocketGetPciPath(char* devName, char** pciPath) {
return scclSuccess;
}
scclResult_t scclNetSocketInit(void) {
scclResult_t scclNetSocket::init() {
SCCLCHECK(scclMalloc(&socketComm, 1));
if(scclNetIfs == -1) {
pthread_mutex_lock(&scclNetSocketLock);
if(scclNetIfs == -1) {
......@@ -69,12 +90,22 @@ scclResult_t scclNetSocketInit(void) {
return scclSuccess;
}
scclResult_t scclNetSocketDevices(int* ndev) {
scclResult_t scclNetSocket::devices(int* ndev) {
*ndev = scclNetIfs;
return scclSuccess;
}
static scclResult_t scclNetSocketGetSpeed(char* devName, int* speed) {
/**
* @brief 获取指定网络设备的速度(单位:Mbps)
*
* 该函数通过读取/sys/class/net/<设备名>/speed文件来获取网络设备的速度。
* 如果读取失败或速度为0,则默认返回10Gbps(10000Mbps)。
*
* @param devName 网络设备名称
* @param speed 输出参数,用于存储获取到的速度值
* @return scclResult_t 始终返回scclSuccess表示成功
*/
scclResult_t scclNetSocket::scclNetSocketGetSpeed(char* devName, int* speed) {
*speed = 0;
char speedPath[PATH_MAX];
sprintf(speedPath, "/sys/class/net/%s/speed", devName);
......@@ -93,7 +124,17 @@ static scclResult_t scclNetSocketGetSpeed(char* devName, int* speed) {
return scclSuccess;
}
scclResult_t scclNetSocketGetProperties(int dev, scclNetProperties_t* props) {
/**
* @brief 获取网络套接字设备的属性
*
* @param dev 设备索引
* @param props 用于存储设备属性的结构体指针
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*
* 该函数用于填充指定网络设备的属性信息,包括设备名称、PCI路径、速度等。
* 注意:延迟(latency)和端口(port)属性当前未设置。
*/
scclResult_t scclNetSocket::getProperties(int dev, scclNetProperties_t* props) {
props->name = scclNetSocketDevs[dev].devName;
props->pciPath = scclNetSocketDevs[dev].pciPath;
props->guid = dev;
......@@ -106,97 +147,19 @@ scclResult_t scclNetSocketGetProperties(int dev, scclNetProperties_t* props) {
return scclSuccess;
}
/* Communication functions */
#define MAX_SOCKETS 64
#define MAX_THREADS 16
#define MAX_REQUESTS SCCL_NET_MAX_REQUESTS
#define MIN_CHUNKSIZE (64 * 1024)
SCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
SCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
enum scclNetSocketCommState : uint8_t {
scclNetSocketCommStateStart = 0,
scclNetSocketCommStateConnect = 1,
scclNetSocketCommStateAccept = 3,
scclNetSocketCommStateSend = 4,
scclNetSocketCommStateRecv = 5,
};
struct scclNetSocketCommStage {
enum scclNetSocketCommState state;
uint8_t iteration;
struct scclSocket* sock;
struct scclNetSocketComm* comm;
};
struct scclNetSocketHandle {
union scclSocketAddress connectAddr;
uint64_t magic; // random number to help debugging
int nSocks;
int nThreads;
struct scclNetSocketCommStage stage;
};
struct scclNetSocketTask {
int op;
void* data;
int size;
struct scclSocket* sock;
int offset;
int used;
scclResult_t result;
};
struct scclNetSocketRequest {
int op;
void* data;
int size;
struct scclSocket* ctrlSock;
int offset;
int used;
struct scclNetSocketComm* comm;
struct scclNetSocketTask* tasks[MAX_SOCKETS];
int nSubs;
};
struct scclNetSocketTaskQueue {
int next;
int len;
struct scclNetSocketTask* tasks;
};
struct scclNetSocketThreadResources {
struct scclNetSocketTaskQueue threadTaskQueue;
int stop;
struct scclNetSocketComm* comm;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
};
struct scclNetSocketListenComm {
struct scclSocket sock;
struct scclNetSocketCommStage stage;
int nSocks;
int nThreads;
int dev;
};
struct scclNetSocketComm {
struct scclSocket ctrlSock;
struct scclSocket socks[MAX_SOCKETS];
int dev;
int cudaDev;
int nSocks;
int nThreads;
int nextSock;
struct scclNetSocketRequest requests[MAX_REQUESTS];
pthread_t helperThread[MAX_THREADS];
struct scclNetSocketThreadResources threadResources[MAX_THREADS];
};
void* persistentSocketThread(void* args_) {
/**
* @brief 持久化socket线程处理函数
*
* 该线程持续处理socket任务队列中的任务,每个线程负责处理nSocksPerThread个socket。
* 当任务队列为空时,线程会等待条件变量通知;当收到停止信号时,线程退出。
*
* @param args_ 线程参数,包含通信结构、任务队列和同步原语
* @return void* 总是返回NULL
*
* @note 线程会循环处理任务直到收到停止信号
* @warning 如果socket处理出错,线程会直接退出并打印警告信息
*/
void* scclNetSocket::persistentSocketThread(void* args_) {
struct scclNetSocketThreadResources* resource = (struct scclNetSocketThreadResources*)args_;
struct scclNetSocketComm* comm = resource->comm;
struct scclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue;
......@@ -235,7 +198,18 @@ void* persistentSocketThread(void* args_) {
}
}
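// Illustrative sketch (not part of this commit): a self-contained reduction of the wait/drain/stop
// pattern described above. The real thread additionally walks nSocksPerThread sockets per pass and
// reports socket errors; this toy only shows the condition-variable synchronization. <pthread.h> is
// assumed to be in scope via the headers included by this file.
namespace {
struct ToyTaskQueue {
    int pending = 0;                                  // tasks waiting to be processed
    int stop = 0;                                     // set by the owner at shutdown
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
};
void* toyWorker(void* arg) {
    ToyTaskQueue* q = (ToyTaskQueue*)arg;
    pthread_mutex_lock(&q->lock);
    while(true) {
        while(q->pending == 0 && !q->stop)
            pthread_cond_wait(&q->cond, &q->lock);    // sleep until work arrives or stop is set
        if(q->stop && q->pending == 0) break;         // drain remaining work before exiting
        q->pending--;                                 // "process" one task
    }
    pthread_mutex_unlock(&q->lock);
    return nullptr;
}
} // anonymous namespace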
scclResult_t scclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
/**
* @brief 获取指定设备的socket和线程数量配置
*
* 根据设备类型和参数配置,自动检测或设置每个线程的socket数量和线程数量。
* 支持AWS和GCP设备的自动检测,并确保配置不超过最大限制。
*
* @param dev 设备索引
* @param ns 输出参数,返回总socket数量
* @param nt 输出参数,返回线程数量
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
scclResult_t scclNetSocket::scclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
int nSocksPerThread = scclParamSocketNsocksPerThread();
int nThreads = scclParamSocketNthreads();
if(nThreads > MAX_THREADS) {
......@@ -287,28 +261,28 @@ scclResult_t scclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
return scclSuccess;
}
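// Illustrative sketch (not part of this commit): the thread/socket fan-out above is driven by the
// two SCCL_PARAM knobs declared earlier in this file. Assuming SCCL_PARAM follows the usual
// convention of prefixing the name with "SCCL_" to form an environment variable, a launcher could
// request 4 helper threads with 2 data sockets each like this (setenv(3) comes from <stdlib.h>,
// which is pulled in by the transport header):
static void exampleSocketThreadConfig() {
    setenv("SCCL_SOCKET_NTHREADS", "4", 1);   // helper threads per comm, capped at MAX_THREADS
    setenv("SCCL_NSOCKS_PERTHREAD", "2", 1);  // sockets serviced by each helper thread
    // scclNetSocketGetNsockNthread() would then be expected to report nt = 4 and ns = 4 * 2 = 8,
    // subject to the MAX_SOCKETS cap.
}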
scclResult_t scclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
scclResult_t scclNetSocket::listen(int dev, void* opaqueHandle, void** listenComm) {
if(dev < 0 || dev >= scclNetIfs) { // data transfer socket is based on specified dev
return scclInternalError;
}
struct scclNetSocketHandle* handle = (struct scclNetSocketHandle*)opaqueHandle;
memset(handle, 0, sizeof(struct scclNetSocketHandle));
static_assert(sizeof(struct scclNetSocketHandle) <= SCCL_NET_HANDLE_MAXSIZE, "scclNetSocketHandle size too large");
struct scclNetSocketListenComm* comm;
SCCLCHECK(scclCalloc(&comm, 1));
memset(socketComm, 0, sizeof(struct scclNetSocketListenComm));
handle->magic = SCCL_SOCKET_MAGIC;
SCCLCHECK(scclSocketInit(&comm->sock, &scclNetSocketDevs[dev].addr, handle->magic, scclSocketTypeNetSocket, NULL, 1));
SCCLCHECK(scclSocketListen(&comm->sock));
SCCLCHECK(scclSocketGetAddr(&comm->sock, &handle->connectAddr));
SCCLCHECK(scclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
handle->nSocks = comm->nSocks;
handle->nThreads = comm->nThreads;
comm->dev = dev;
*listenComm = comm;
SCCLCHECK(scclSocketInit(&socketComm->sock, &scclNetSocketDevs[dev].addr, handle->magic, scclSocketTypeNetSocket, NULL, 1));
SCCLCHECK(scclSocketListen(&socketComm->sock));
SCCLCHECK(scclSocketGetAddr(&socketComm->sock, &handle->connectAddr));
SCCLCHECK(scclNetSocketGetNsockNthread(dev, &socketComm->nSocks, &socketComm->nThreads));
handle->nSocks = socketComm->nSocks;
handle->nThreads = socketComm->nThreads;
socketComm->dev = dev;
*listenComm = socketComm;
return scclSuccess;
}
scclResult_t scclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
scclResult_t scclNetSocket::connect(int dev, void* opaqueHandle, void** sendComm) {
if(dev < 0 || dev >= scclNetIfs) { // data transfer socket is based on specified dev
return scclInternalError;
}
......@@ -331,7 +305,7 @@ scclResult_t scclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm)
comm->nSocks = handle->nSocks;
comm->nThreads = handle->nThreads;
comm->dev = dev;
HIPCHECK(hipGetDevice(&comm->cudaDev));
HIPCHECK(hipGetDevice(&comm->hipDev));
for(; i < comm->nSocks + 1; i++) {
sock = (i == comm->nSocks) ? &comm->ctrlSock : comm->socks + i;
SCCLCHECK(scclSocketInit(sock, &handle->connectAddr, handle->magic, scclSocketTypeNetSocket, NULL, 1));
......@@ -357,7 +331,7 @@ scclResult_t scclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm)
return scclSuccess;
}
scclResult_t scclNetSocketAccept(void* listenComm, void** recvComm) {
scclResult_t scclNetSocket::accept(void* listenComm, void** recvComm) {
struct scclNetSocketListenComm* lComm = (struct scclNetSocketListenComm*)listenComm;
struct scclNetSocketCommStage* stage = &lComm->stage;
struct scclNetSocketComm* rComm = stage->comm;
......@@ -376,7 +350,7 @@ scclResult_t scclNetSocketAccept(void* listenComm, void** recvComm) {
rComm->nSocks = lComm->nSocks;
rComm->nThreads = lComm->nThreads;
rComm->dev = lComm->dev;
HIPCHECK(hipGetDevice(&rComm->cudaDev));
HIPCHECK(hipGetDevice(&rComm->hipDev));
for(; i < rComm->nSocks + 1; i++) {
uint8_t sendSockIdx;
......@@ -434,7 +408,51 @@ scclResult_t scclNetSocketGetRequest(struct scclNetSocketComm* comm, int op, voi
return scclInternalError;
}
scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void* data, int size, struct scclNetSocketTask** req) {
scclResult_t scclNetSocket::regMr(void* comm, void* data, int size, int type, void** mhandle) {
return (type != SCCL_PTR_HOST) ? scclInternalError : scclSuccess;
}
scclResult_t scclNetSocket::regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
WARN("NET/Socket : unable to check DMA-BUF support");
return scclSuccess;
}
scclResult_t scclNetSocket::deregMr(void* comm, void* mhandle) { return scclSuccess; }
scclResult_t scclNetSocket::isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct scclNetSocketComm* comm = (struct scclNetSocketComm*)sendComm;
SCCLCHECK(scclNetSocketGetRequest(comm, SCCL_SOCKET_SEND, data, size, (struct scclNetSocketRequest**)request));
return scclSuccess;
}
scclResult_t scclNetSocket::irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct scclNetSocketComm* comm = (struct scclNetSocketComm*)recvComm;
if(n != 1)
return scclInternalError;
SCCLCHECK(scclNetSocketGetRequest(comm, SCCL_SOCKET_RECV, data[0], sizes[0], (struct scclNetSocketRequest**)request));
return scclSuccess;
}
scclResult_t scclNetSocket::iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
// We don't support HIP pointers, so we don't need a flush operation
return scclInternalError;
}
/**
* 为指定通信对象创建并获取一个网络套接字任务
*
* @param comm 网络套接字通信对象指针
* @param op 操作类型(SCCL_SOCKET_SEND/SCCL_SOCKET_RECV)
* @param data 任务数据缓冲区指针
* @param size 数据大小
* @param req [out] 返回创建的任务指针
*
* @return 成功返回scclSuccess,失败返回scclInternalError
*
* @note 该函数会初始化线程资源(首次调用时),创建持久化线程处理任务队列
* @warning 当任务队列已满时会返回错误并打印警告
*/
scclResult_t scclNetSocket::scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void* data, int size, struct scclNetSocketTask** req) {
int tid = comm->nextSock % comm->nThreads;
struct scclNetSocketThreadResources* res = comm->threadResources + tid;
struct scclNetSocketTaskQueue* queue = &res->threadTaskQueue;
......@@ -450,7 +468,7 @@ scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void*
pthread_mutex_init(&res->threadLock, NULL);
pthread_cond_init(&res->threadCond, NULL);
pthread_create(comm->helperThread + tid, NULL, persistentSocketThread, res);
scclSetThreadName(comm->helperThread[tid], "SCCL Sock%c%1u%2u%2u", op == SCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
scclSetThreadName(comm->helperThread[tid], "SCCL Sock%c%1u%2u%2u", op == SCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->hipDev);
}
struct scclNetSocketTask* r = queue->tasks + queue->next;
if(r->used == 0) {
......@@ -473,7 +491,7 @@ scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void*
return scclInternalError;
}
scclResult_t scclNetSocketTest(void* request, int* done, int* size) {
scclResult_t scclNetSocket::test(void* request, int* done, int* size) {
*done = 0;
struct scclNetSocketRequest* r = (struct scclNetSocketRequest*)request;
if(r == NULL) {
......@@ -555,43 +573,7 @@ scclResult_t scclNetSocketTest(void* request, int* done, int* size) {
return scclSuccess;
}
scclResult_t scclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
return (type != SCCL_PTR_HOST) ? scclInternalError : scclSuccess;
}
scclResult_t scclNetSocketDeregMr(void* comm, void* mhandle) { return scclSuccess; }
scclResult_t scclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct scclNetSocketComm* comm = (struct scclNetSocketComm*)sendComm;
SCCLCHECK(scclNetSocketGetRequest(comm, SCCL_SOCKET_SEND, data, size, (struct scclNetSocketRequest**)request));
return scclSuccess;
}
scclResult_t scclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct scclNetSocketComm* comm = (struct scclNetSocketComm*)recvComm;
if(n != 1)
return scclInternalError;
SCCLCHECK(scclNetSocketGetRequest(comm, SCCL_SOCKET_RECV, data[0], sizes[0], (struct scclNetSocketRequest**)request));
return scclSuccess;
}
scclResult_t scclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
// We don't support HIP pointers, so we don't need a flush operation
return scclInternalError;
}
scclResult_t scclNetSocketCloseListen(void* opaqueComm) {
struct scclNetSocketListenComm* comm = (struct scclNetSocketListenComm*)opaqueComm;
if(comm) {
int ready;
SCCLCHECK(scclSocketReady(&comm->sock, &ready));
if(ready)
SCCLCHECK(scclSocketClose(&comm->sock));
free(comm);
}
return scclSuccess;
}
scclResult_t scclNetSocketClose(void* opaqueComm) {
scclResult_t scclNetSocket::closeSend(void* opaqueComm) {
struct scclNetSocketComm* comm = (struct scclNetSocketComm*)opaqueComm;
if(comm) {
for(int i = 0; i < comm->nThreads; i++) {
......@@ -619,27 +601,21 @@ scclResult_t scclNetSocketClose(void* opaqueComm) {
return scclSuccess;
}
} // namespace net_socket
scclResult_t scclNetSocket::closeRecv(void* opaqueComm) { return closeSend(opaqueComm); }
scclNet_t scclNetSocket = {"Socket",
net_socket::scclNetSocketInit,
net_socket::scclNetSocketDevices,
net_socket::scclNetSocketGetProperties,
net_socket::scclNetSocketListen,
net_socket::scclNetSocketConnect,
net_socket::scclNetSocketAccept,
net_socket::scclNetSocketRegMr,
NULL, // No DMA-BUF support
net_socket::scclNetSocketDeregMr,
net_socket::scclNetSocketIsend,
net_socket::scclNetSocketIrecv,
net_socket::scclNetSocketIflush,
net_socket::scclNetSocketTest,
net_socket::scclNetSocketClose,
net_socket::scclNetSocketClose,
net_socket::scclNetSocketCloseListen};
} // namespace host
scclResult_t scclNetSocket::closeListen(void* opaqueComm) {
struct scclNetSocketListenComm* comm = (struct scclNetSocketListenComm*)opaqueComm;
if(comm) {
int ready;
SCCLCHECK(scclSocketReady(&comm->sock, &ready));
if(ready)
SCCLCHECK(scclSocketClose(&comm->sock));
free(comm);
}
return scclSuccess;
}
} // namespace net_socket
} // namespace net
} // namespace hardware
} // namespace sccl
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "base.h"
#include "net_utils.h"
#include "socket.h"
namespace sccl {
namespace hardware {
namespace net {
namespace net_socket {
/* Communication functions */
static constexpr int MAX_SOCKETS = 64;
static constexpr int MAX_THREADS = 16;
static constexpr int MAX_REQUESTS = SCCL_NET_MAX_REQUESTS;
static constexpr int MIN_CHUNKSIZE = (64 * 1024);
enum scclNetSocketCommState : uint8_t {
scclNetSocketCommStateStart = 0,
scclNetSocketCommStateConnect = 1,
scclNetSocketCommStateAccept = 3,
scclNetSocketCommStateSend = 4,
scclNetSocketCommStateRecv = 5,
};
struct scclNetSocketCommStage {
enum scclNetSocketCommState state;
uint8_t iteration;
struct scclSocket* sock;
struct scclNetSocketComm* comm = nullptr;
};
struct scclNetSocketHandle {
union scclSocketAddress connectAddr;
uint64_t magic; // random number to help debugging
int nSocks;
int nThreads;
struct scclNetSocketCommStage stage;
};
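// The listening side records its nSocks/nThreads choice in this handle (see scclNetSocket::listen),
// so the connecting peer opens a matching set of data sockets plus one control socket and both
// ends agree on how a transfer is split.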
struct scclNetSocketTask {
int op;
void* data;
int size;
struct scclSocket* sock = nullptr;
int offset;
int used;
scclResult_t result;
};
struct scclNetSocketRequest {
int op;
void* data;
int size;
struct scclSocket* ctrlSock = nullptr;
int offset;
int used;
struct scclNetSocketComm* comm = nullptr;
struct scclNetSocketTask* tasks[MAX_SOCKETS] = {nullptr};
int nSubs;
};
struct scclNetSocketTaskQueue {
int next;
int len;
struct scclNetSocketTask* tasks = nullptr;
};
struct scclNetSocketThreadResources {
struct scclNetSocketTaskQueue threadTaskQueue;
int stop;
struct scclNetSocketComm* comm = nullptr;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
};
struct scclNetSocketListenComm {
struct scclSocket sock;
struct scclNetSocketCommStage stage;
int nSocks;
int nThreads;
int dev;
};
struct scclNetSocketComm {
struct scclSocket ctrlSock;
struct scclSocket socks[MAX_SOCKETS];
int dev;
int hipDev;
int nSocks;
int nThreads;
int nextSock;
struct scclNetSocketRequest requests[MAX_REQUESTS];
pthread_t helperThread[MAX_THREADS];
struct scclNetSocketThreadResources threadResources[MAX_THREADS];
};
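// One logical isend/irecv becomes a scclNetSocketRequest on this comm; its payload is split into
// scclNetSocketTask entries that scclNetSocketGetTask() hands out round-robin to the helper
// threads (tid = nextSock % nThreads), each of which services its own slice of socks[].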
//////////////////////////////////
class scclNetSocket : public scclNetBase {
public:
// 构造函数和析构函数
scclNetSocket();
virtual ~scclNetSocket();
// 初始化网络。
scclResult_t init() override;
// 返回适配器的数量。
scclResult_t devices(int* ndev) override;
// 获取各种设备属性。
scclResult_t getProperties(int dev, scclNetProperties_t* props) override;
// 创建一个接收对象并提供一个句柄以连接到它。该句柄最多可以是 SCCL_NET_HANDLE_MAXSIZE 字节,并将在排名之间交换以创建连接。
scclResult_t listen(int dev, void* handle, void** listenComm) override;
// 连接到一个句柄并返回一个发送 comm 对象给该对等体。
// 此调用不应阻塞以建立连接,而应成功返回 sendComm == NULL,并期望再次调用直到 sendComm != NULL。
scclResult_t connect(int dev, void* handle, void** sendComm) override;
// 在远程对等体调用 connect 后最终确定连接建立。
// 此调用不应阻塞以建立连接,而应成功返回 recvComm == NULL,并期望再次调用直到 recvComm != NULL。
scclResult_t accept(void* listenComm, void** recvComm) override;
// 注册/注销内存。Comm 可以是 sendComm 或 recvComm。
// 类型是 SCCL_PTR_HOST 或 SCCL_PTR_CUDA。
scclResult_t regMr(void* comm, void* data, int size, int type, void** mhandle) override;
/* DMA-BUF 支持 */
scclResult_t regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) override;
// 注销IB内存区域(MR)
scclResult_t deregMr(void* comm, void* mhandle) override;
// 异步发送到对等体。
// 如果调用不能执行(或会阻塞),则可能返回 request == NULL
scclResult_t isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) override;
// 异步从对等体接收。 如果调用不能执行(或会阻塞),则可能返回 request == NULL
scclResult_t irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) override;
// 执行刷新/栅栏操作,以确保所有使用 SCCL_PTR_CUDA 接收到的数据对 GPU 可见
scclResult_t iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) override;
// 测试请求是否完成。如果 size 不为 NULL,则返回发送/接收的字节数。
scclResult_t test(void* request, int* done, int* sizes) override;
// 关闭并释放 send/recv comm 对象
scclResult_t closeSend(void* sendComm) override;
scclResult_t closeRecv(void* recvComm) override;
scclResult_t closeListen(void* listenComm) override;
private:
struct scclNetSocketListenComm* socketComm = nullptr;
private:
// 获取网络设备的PCI路径
static scclResult_t scclNetSocketGetPciPath(char* devName, char** pciPath);
// 获取指定网络设备的速度(单位:Mbps)
scclResult_t scclNetSocketGetSpeed(char* devName, int* speed);
// 持久化socket线程处理函数
static void* persistentSocketThread(void* args_);
// 为指定通信对象创建并获取一个网络套接字任务
scclResult_t scclNetSocketGetTask(struct scclNetSocketComm* comm, int op, void* data, int size, struct scclNetSocketTask** req);
// 获取指定设备的socket和线程数量配置
scclResult_t scclNetSocketGetNsockNthread(int dev, int* ns, int* nt);
};
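// Usage sketch (illustrative, not part of this commit): the receiving side of the interface
// above. listen() fills a handle that must be shipped to the peer out of band; accept() and
// irecv() are non-blocking and may leave their output pointer NULL, so both are polled. The
// function name is made up for illustration and error handling is reduced to early returns.
inline scclResult_t scclNetSocketRecvExample(scclNetSocket& net, int dev, void* handleOut, void* buf, int size) {
    // handleOut must point at an SCCL_NET_HANDLE_MAXSIZE byte buffer (see the static_assert in listen()).
    void* listenComm = nullptr;
    scclResult_t res = net.listen(dev, handleOut, &listenComm);
    if(res != scclSuccess) return res;
    void* recvComm = nullptr;
    while(recvComm == nullptr) { // resumes the accept state machine
        res = net.accept(listenComm, &recvComm);
        if(res != scclSuccess) return res;
    }
    void* mhandle = nullptr; // memory handles are not used by this transport's irecv
    int tag = 0;
    void* request = nullptr;
    while(request == nullptr) { // this transport only accepts n == 1 per irecv
        res = net.irecv(recvComm, 1, &buf, &size, &tag, &mhandle, &request);
        if(res != scclSuccess) return res;
    }
    int done = 0, receivedSize = 0;
    while(!done) { // poll the request to completion
        res = net.test(request, &done, &receivedSize);
        if(res != scclSuccess) return res;
    }
    res = net.closeRecv(recvComm);
    if(res != scclSuccess) return res;
    return net.closeListen(listenComm);
}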
} // namespace net_socket
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -15,7 +15,7 @@
namespace sccl {
namespace hardware {
namespace net {
namespace host {
namespace net_socket {
namespace socket_base {
/**
......@@ -383,7 +383,7 @@ static scclResult_t socketFinalizeConnect(struct scclSocket* sock) {
return scclSuccess;
}
static scclResult_t socketProgressState(struct host::scclSocket* sock) {
static scclResult_t socketProgressState(struct scclSocket* sock) {
if(sock->state == scclSocketStateAccepting) {
SCCLCHECK(socketTryAccept(sock));
}
......@@ -588,8 +588,13 @@ int scclFindInterfaceMatchSubnet(char* ifNames, union scclSocketAddress* localAd
* @brief Find a usable socket network interface
*
* This function looks for usable network interfaces on the system; the interface can be specified via an environment variable or auto-detected.
* Search order: 1) user-specified interface (SCCL_SOCKET_IFNAME) 2) IB interfaces 3) interfaces on the same subnet as SCCL_COMM_ID
* 4) other interfaces excluding docker and lo 5) docker interfaces 6) lo interfaces
* Search order:
* 1) user-specified interface (SCCL_SOCKET_IFNAME)
* 2) IB interfaces
* 3) interfaces on the same subnet as SCCL_COMM_ID
* 4) other interfaces excluding docker and lo
* 5) docker interfaces
* 6) lo interfaces
*
* @param ifNames Output parameter storing the names of the interfaces found
* @param ifAddrs Output parameter storing the addresses of the interfaces found
......@@ -630,9 +635,9 @@ int scclFindSocketInterfaces(char* ifNames, union scclSocketAddress* ifAddrs, in
WARN("No socket network interface found. ");
}
// // Then look for anything else (but not docker or lo)
// if(nIfs == 0)
// nIfs = socket_base::findSocketInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// Then look for anything else (but not docker or lo)
if(nIfs == 0)
nIfs = socket_base::findSocketInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// // Finally look for docker, then lo.
// if(nIfs == 0)
// nIfs = socket_base::findSocketInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
......@@ -868,6 +873,16 @@ scclResult_t scclSocketListen(struct scclSocket* sock) {
return scclSuccess;
}
/**
* Get the address of a socket
*
* @param sock Socket to query; must not be NULL
* @param addr Buffer that receives the address
* @return scclResult_t Operation result:
*         - scclInvalidArgument: invalid argument (sock is NULL)
*         - scclInternalError: socket not ready
*         - scclSuccess: success
*/
scclResult_t scclSocketGetAddr(struct scclSocket* sock, union scclSocketAddress* addr) {
if(sock == NULL) {
WARN("scclSocketGetAddr: pass NULL socket");
......@@ -1101,7 +1116,7 @@ scclResult_t scclSocketSetFd(int fd, struct scclSocket* sock) {
return scclSuccess;
}
} // namespace host
} // namespace net_socket
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -11,7 +11,7 @@
namespace sccl {
namespace hardware {
namespace net {
namespace host {
namespace net_socket {
#define MAX_IFS 16 // Maximum number of interfaces
#define MAX_IF_NAME_SIZE 16 // Maximum length of an interface name
......@@ -114,7 +114,7 @@ scclResult_t scclSocketGetFd(struct scclSocket* sock, int* fd);
// Set the socket file descriptor
scclResult_t scclSocketSetFd(int fd, struct scclSocket* sock);
} // namespace host
} // namespace net_socket
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -97,6 +97,22 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
return false;
}
scclResult_t printNetProps(const scclNetProperties_t* props, int rank, int localRank) {
printf("rank=%d, localRank=%d, device name=%s, pciPath=%s, guid=%lu, ptrSupport=%d, speed=%d, port=%d, latency=%f, maxComms=%d, maxRecvs=%d\n",
rank,
localRank,
props->name,
props->pciPath,
props->guid,
props->ptrSupport,
props->speed,
props->port,
props->latency,
props->maxComms,
props->maxRecvs);
return scclSuccess;
}
} // namespace net
} // namespace hardware
} // namespace sccl
......@@ -7,18 +7,11 @@ namespace sccl {
namespace hardware {
namespace net {
struct netIf { // Network interface descriptor
char prefix[64]; // Interface name prefix
int port; // Port number
};
// Parse a string list and store the result in the network interface list
int parseStringList(const char* string, struct netIf* ifList, int maxList);
// Match an interface in the list against the given string and port
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
scclResult_t rocmLibraryInit(void);
typedef enum {
SCCL_PTR_HOST = 0x1,
SCCL_PTR_CUDA = 0x2,
SCCL_PTR_DMABUF = 0x4
} sccl_ptr_t;
////////////////////////////////// Network device definitions //////////////////////////////////
typedef struct {
......@@ -33,53 +26,87 @@ typedef struct {
int maxRecvs; // Maximum number of grouped receives.
} scclNetProperties_t;
typedef struct {
// Name of the network (mostly used for logging)
const char* name;
/**
* @brief scclNetBase defines the base interface for network communication
*
* This abstract base class exposes pure virtual interfaces for network initialization,
* device management, connection establishment, memory registration, data transfer,
* and other core functionality. Concrete implementations are provided by derived classes.
*
* Main features:
* - Network initialization and device property queries
* - Listen/connect establishment and management
* - Memory registration and deregistration
* - Asynchronous send/receive operations
* - Request completion testing
* - Connection teardown
*
* The interface is designed to be non-blocking and supports asynchronous operation.
*/
typedef class scclNetBase {
public:
// Constructor and destructor
scclNetBase(const char* net_name) : name(net_name) {};
virtual ~scclNetBase() {};
// Initialize the network.
scclResult_t (*init)();
virtual scclResult_t init() = 0;
// Return the number of adapters.
scclResult_t (*devices)(int* ndev);
virtual scclResult_t devices(int* ndev) = 0;
// Query various device properties.
scclResult_t (*getProperties)(int dev, scclNetProperties_t* props);
virtual scclResult_t getProperties(int dev, scclNetProperties_t* props) = 0;
// Create a receive object and provide a handle to connect to it. The handle can be up to SCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged between ranks to create a connection.
scclResult_t (*listen)(int dev, void* handle, void** listenComm);
virtual scclResult_t listen(int dev, void* handle, void** listenComm) = 0;
// Connect to a handle and return a send comm object for that peer.
// This call must not block while establishing the connection; instead it may return successfully with sendComm == NULL, and is expected to be called again until sendComm != NULL.
scclResult_t (*connect)(int dev, void* handle, void** sendComm);
virtual scclResult_t connect(int dev, void* handle, void** sendComm) = 0;
// Finalize the connection once the remote peer has called connect.
// This call must not block while establishing the connection; instead it may return successfully with recvComm == NULL, and is expected to be called again until recvComm != NULL.
scclResult_t (*accept)(void* listenComm, void** recvComm);
virtual scclResult_t accept(void* listenComm, void** recvComm) = 0;
// Register/deregister memory. Comm can be either a sendComm or a recvComm.
// Type is SCCL_PTR_HOST or SCCL_PTR_CUDA.
scclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
virtual scclResult_t regMr(void* comm, void* data, int size, int type, void** mhandle) = 0;
/* DMA-BUF support */
scclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
scclResult_t (*deregMr)(void* comm, void* mhandle);
virtual scclResult_t regMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) = 0;
// Deregister an IB memory region (MR)
virtual scclResult_t deregMr(void* comm, void* mhandle) = 0;
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block).
scclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
virtual scclResult_t isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) = 0;
// Asynchronous receive from a peer. May return request == NULL if the call cannot be performed (or would block).
scclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
virtual scclResult_t irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) = 0;
// Perform a flush/fence to make sure all data received with SCCL_PTR_CUDA is visible to the GPU.
scclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
virtual scclResult_t iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) = 0;
// Test whether a request is complete. If size is not NULL, returns the number of bytes sent/received.
scclResult_t (*test)(void* request, int* done, int* sizes);
virtual scclResult_t test(void* request, int* done, int* sizes) = 0;
// Close and free the send/recv comm objects
scclResult_t (*closeSend)(void* sendComm);
scclResult_t (*closeRecv)(void* recvComm);
scclResult_t (*closeListen)(void* listenComm);
} scclNet_t;
virtual scclResult_t closeSend(void* sendComm) = 0;
virtual scclResult_t closeRecv(void* recvComm) = 0;
virtual scclResult_t closeListen(void* listenComm) = 0;
////////////////////////////////// Other definitions //////////////////////////////////
public:
// Name of the network (mostly used for logging)
const char* name;
typedef enum sccl_ptr {
SCCL_PTR_HOST = 0x1,
SCCL_PTR_CUDA = 0x2,
SCCL_PTR_DMABUF = 0x4
} sccl_ptr_t;
} scclNet_t;
////////////////////////////////// Utility functions //////////////////////////////////
// Initialize the ROCm library
scclResult_t rocmLibraryInit(void);
#define SCCL_NET_HANDLE_MAXSIZE 128
struct netIf { // Network interface descriptor
char prefix[64]; // Interface name prefix
int port; // Port number
};
// Parse a string list and store the result in the network interface list
int parseStringList(const char* string, struct netIf* ifList, int maxList);
// Match an interface in the list against the given string and port
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
// Print network properties
scclResult_t printNetProps(const scclNetProperties_t* props, int rank, int localRank);
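// Illustrative usage sketch (not part of this commit): the asynchronous send path of
// scclNetBase. isend() may leave request == NULL when it cannot make progress yet, and
// test() is polled until done becomes non-zero. The helper name and its parameters are
// hypothetical; SCCLCHECK is assumed to be the project's error-propagation macro.
static inline scclResult_t scclNetSendAndWaitSketch(scclNetBase* net, void* sendComm, void* data, int size, int tag, void* mhandle) {
    void* request = nullptr;
    while(request == nullptr) {
        SCCLCHECK(net->isend(sendComm, data, size, tag, mhandle, &request)); // retry until the send is queued
    }
    int done = 0, bytes = 0;
    while(!done) {
        SCCLCHECK(net->test(request, &done, &bytes)); // poll for completion; bytes holds the transferred size
    }
    return scclSuccess;
}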
} // namespace net
} // namespace hardware
......
......@@ -170,6 +170,13 @@ error:
} // namespace rocm_wrap
/**
* Initialize the ROCm library
*
* This function uses pthread_once to ensure the ROCm library is initialized only once.
*
* @return The initialization result, of type scclResult_t.
*/
scclResult_t rocmLibraryInit() {
pthread_once(&rocm_wrap::initOnceControl, rocm_wrap::initOnceFunc);
return rocm_wrap::initResult;
......
......@@ -23,6 +23,7 @@ DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
} // namespace rocm_wrap
// Initialize the ROCm library
scclResult_t rocmLibraryInit(void);
} // namespace net
......
/*************************************************************************
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_CPUSET_H_
#define SCCL_CPUSET_H_
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
/**
* Convert a hexadecimal character to its integer value
*
* @param c Input hexadecimal character (0-9, a-f)
* @return The corresponding integer value (0-15), or -1 if the input is invalid
*/
static int hexToInt(char c) {
int v = c - '0';
if(v < 0)
return -1;
if(v > 9)
v = 10 + c - 'a';
if((v < 0) || (v > 15))
return -1;
return v;
}
#define CPU_SET_N_U32 (sizeof(cpu_set_t) / sizeof(uint32_t))
/**
* Convert a hexadecimal string to a CPU set mask
*
* @param str Input hexadecimal string, with sections separated by commas
* @param mask Output CPU set mask
* @return scclSuccess on successful conversion
*
* @note The string maps, left to right, to the 32-bit words of the mask from high to low.
*       Each character encodes 4 bits of hexadecimal.
*       Conversion stops early at the first non-hexadecimal character.
*/
static scclResult_t scclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32 - 1;
cpumasks[m] = 0;
for(int o = 0; o < strlen(str); o++) {
char c = str[o];
if(c == ',') {
m--;
cpumasks[m] = 0;
} else {
int v = hexToInt(c);
if(v == -1)
break;
cpumasks[m] <<= 4;
cpumasks[m] += v;
}
}
// Copy cpumasks to mask
for(int a = 0; m < CPU_SET_N_U32; a++, m++) {
memcpy(((uint32_t*)mask) + a, cpumasks + m, sizeof(uint32_t));
}
return scclSuccess;
}
/**
* Convert a CPU set mask to its hexadecimal string representation
*
* @param mask Input CPU set mask
* @param str Output string buffer that receives the result
* @return Operation result (scclSuccess on success)
*
* Conversion rules:
* 1. The bytes of cpu_set_t are converted to hexadecimal from high to low
* 2. A comma separator is inserted after every 4 bytes
* 3. Leading zeroes are skipped
*/
static scclResult_t scclCpusetToStr(cpu_set_t* mask, char* str) {
int c = 0;
uint8_t* m8 = (uint8_t*)mask;
for(int o = sizeof(cpu_set_t) - 1; o >= 0; o--) {
if(c == 0 && m8[o] == 0)
continue;
sprintf(str + c, "%02x", m8[o]);
c += 2;
if(o && o % 4 == 0) {
sprintf(str + c, ",");
c++;
}
}
str[c] = '\0';
return scclSuccess;
}
/**
* Convert a CPU set mask to a range string representation
*
* @param mask Input CPU set mask
* @param str Buffer that receives the result
* @param len Length of the buffer
* @return Pointer to the resulting string (i.e. the str argument)
*
* The function renders the CPU set mask as a readable range string, e.g. "0-3,5,7-9".
* The result is truncated if the buffer is too small. An empty set yields an empty string.
*/
static char* scclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
int c = 0;
int start = -1;
// Iterate through all possible CPU bits plus one extra position
for(int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask);
// Start of a new range
if(isSet && start == -1) {
start = cpu;
}
// End of a range, add comma between ranges
if(!isSet && start != -1) {
if(cpu - 1 == start) {
c += snprintf(str + c, len - c, "%s%d", c ? "," : "", start);
} else {
c += snprintf(str + c, len - c, "%s%d-%d", c ? "," : "", start, cpu - 1);
}
if(c >= len - 1)
break;
start = -1;
}
}
if(c == 0)
str[0] = '\0';
return str;
}
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
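// Illustrative usage sketch (not part of this commit): round-tripping the helpers above.
// "00000fff" sets CPUs 0-11, so the hex form prints "0fff" (leading zero bytes are dropped)
// and the range form prints "0-11". The mask is cleared first because scclStrToCpuset only
// fills the 32-bit words actually present in the string. The function name is hypothetical.
#include <sched.h>
#include <cstdio>
static void scclCpusetDemoSketch() {
    using namespace sccl::hardware::topology::topo;
    cpu_set_t mask;
    CPU_ZERO(&mask);                                     // clear all bits before parsing
    scclStrToCpuset("00000fff", &mask);                  // hex string -> cpu_set_t (CPUs 0-11)
    char hex[128], ranges[128];
    scclCpusetToStr(&mask, hex);                         // back to hex: "0fff"
    scclCpusetToRangeStr(&mask, ranges, sizeof(ranges)); // readable ranges: "0-11"
    printf("hex=%s ranges=%s\n", hex, ranges);
}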
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nvmlwrap.h"
#include "base.h"
#include <initializer_list>
#include <memory>
#include <mutex>
namespace sccl {
namespace hardware {
namespace topology {
int scclNvmlDeviceCount = 0;
scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];
#if SCCL_NVML_DIRECT
#define SCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name) arglist = name;
#else
#include <dlfcn.h>
#define SCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name) arglist = nullptr;
#endif
namespace {
SCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
SCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
SCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive))
SCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci))
SCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult))
SCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
SCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
SCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values))
std::mutex lock; // NVML has had some thread safety bugs
bool initialized = false;
thread_local bool threadInitialized = false;
scclResult_t initResult;
} // namespace
scclResult_t scclNvmlEnsureInitialized() {
// Optimization to avoid repeatedly grabbing the lock when we only want to
// read from the global tables.
if(threadInitialized)
return initResult;
threadInitialized = true;
std::lock_guard<std::mutex> locked(lock);
if(initialized)
return initResult;
initialized = true;
#if !SCCL_NVML_DIRECT
if(pfn_nvmlInit == nullptr) {
void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
if(libhandle == nullptr) {
WARN("Failed to open libnvidia-ml.so.1");
initResult = scclSystemError;
return initResult;
}
struct Symbol {
void** ppfn;
char const* name;
};
std::initializer_list<Symbol> symbols = {{(void**)&pfn_nvmlInit, "nvmlInit"},
{(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
{(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
{(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
{(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
{(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
{(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
{(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
{(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
{(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
{(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
{(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
{(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
{(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
{(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}};
for(Symbol sym : symbols) {
*sym.ppfn = dlsym(libhandle, sym.name);
}
}
#endif
#if SCCL_NVML_DIRECT
bool have_v2 = true;
#else
bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the SCCL_NVML_DIRECT=1 case then GCC warns about it never being null
#endif
nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
if(res1 != NVML_SUCCESS) {
WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
initResult = scclSystemError;
return initResult;
}
unsigned int ndev;
res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
if(res1 != NVML_SUCCESS) {
WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
initResult = scclSystemError;
return initResult;
}
scclNvmlDeviceCount = int(ndev);
if(scclNvmlMaxDevices < scclNvmlDeviceCount) {
WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (scclNvmlMaxDevices=%d)", scclNvmlDeviceCount, scclNvmlMaxDevices);
initResult = scclInternalError;
return initResult;
}
for(int a = 0; a < scclNvmlDeviceCount; a++) {
res1 = pfn_nvmlDeviceGetHandleByIndex(a, &scclNvmlDevices[a].handle);
if(res1 != NVML_SUCCESS) {
WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
initResult = scclSystemError;
return initResult;
}
res1 = pfn_nvmlDeviceGetCudaComputeCapability(
scclNvmlDevices[a].handle, &scclNvmlDevices[a].computeCapabilityMajor, &scclNvmlDevices[a].computeCapabilityMinor);
if(res1 != NVML_SUCCESS) {
WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
initResult = scclSystemError;
return initResult;
}
}
for(int a = 0; a < scclNvmlDeviceCount; a++) {
for(int b = 0; b < scclNvmlDeviceCount; b++) {
nvmlDevice_t da = scclNvmlDevices[a].handle;
nvmlDevice_t db = scclNvmlDevices[b].handle;
res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &scclNvmlDevicePairs[a][b].p2pStatusRead);
if(res1 != NVML_SUCCESS) {
WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
initResult = scclSystemError;
return initResult;
}
res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &scclNvmlDevicePairs[a][b].p2pStatusWrite);
if(res1 != NVML_SUCCESS) {
WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
initResult = scclSystemError;
return initResult;
}
}
}
initResult = scclSuccess;
return initResult;
}
#define NVMLCHECK(name, ...) \
do { \
nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
if(e44241808 != NVML_SUCCESS) { \
WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
return scclSystemError; \
} \
} while(0)
#define NVMLTRY(name, ...) \
do { \
if(!SCCL_NVML_DIRECT && pfn_##name == nullptr) \
return scclInternalError; /* missing symbol is not a warned error */ \
nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
if(e44241808 != NVML_SUCCESS) { \
if(e44241808 != NVML_ERROR_NOT_SUPPORTED) \
INFO(SCCL_LOG_TOPO, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
return scclSystemError; \
} \
} while(0)
scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
return scclSuccess;
}
scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
SCCLCHECK(scclNvmlEnsureInitialized());
*device = scclNvmlDevices[index].handle;
return scclSuccess;
}
scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
SCCLCHECK(scclNvmlEnsureInitialized());
for(int d = 0; d < scclNvmlDeviceCount; d++) {
if(scclNvmlDevices[d].handle == device) {
*index = d;
return scclSuccess;
}
}
return scclInvalidArgument;
}
scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
return scclSuccess;
}
scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
return scclSuccess;
}
scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
return scclSuccess;
}
scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
SCCLCHECK(scclNvmlEnsureInitialized());
for(int d = 0; d < scclNvmlDeviceCount; d++) {
if(device == scclNvmlDevices[d].handle) {
*major = scclNvmlDevices[d].computeCapabilityMajor;
*minor = scclNvmlDevices[d].computeCapabilityMinor;
return scclSuccess;
}
}
return scclInvalidArgument;
}
scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus) {
SCCLCHECK(scclNvmlEnsureInitialized());
if(p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) {
int a = -1, b = -1;
for(int d = 0; d < scclNvmlDeviceCount; d++) {
if(device1 == scclNvmlDevices[d].handle)
a = d;
if(device2 == scclNvmlDevices[d].handle)
b = d;
}
if(a == -1 || b == -1)
return scclInvalidArgument;
if(p2pIndex == NVML_P2P_CAPS_INDEX_READ)
*p2pStatus = scclNvmlDevicePairs[a][b].p2pStatusRead;
else
*p2pStatus = scclNvmlDevicePairs[a][b].p2pStatusWrite;
} else {
std::lock_guard<std::mutex> locked(lock);
NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
}
return scclSuccess;
}
scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
return scclSuccess;
}
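// Illustrative usage sketch (not part of this commit): querying the P2P tables cached by
// scclNvmlEnsureInitialized(). For the READ/WRITE capability indices the answer comes from
// scclNvmlDevicePairs without touching NVML again; other indices fall through to the locked
// NVML call. Device indices 0 and 1 are arbitrary examples; the function name is hypothetical.
static scclResult_t scclNvmlP2PReadStatusSketch(nvmlGpuP2PStatus_t* status) {
    nvmlDevice_t dev0, dev1;
    SCCLCHECK(scclNvmlDeviceGetHandleByIndex(0, &dev0));
    SCCLCHECK(scclNvmlDeviceGetHandleByIndex(1, &dev1));
    SCCLCHECK(scclNvmlDeviceGetP2PStatus(dev0, dev1, NVML_P2P_CAPS_INDEX_READ, status));
    return scclSuccess; // *status == NVML_P2P_STATUS_OK means P2P reads are supported
}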
} // namespace topology
} // namespace hardware
} // namespace sccl