Commit d9d23f34 authored by lishen

Initial Code for SCCL_v1

parent 57df3737
#pragma once
#include <string.h>
#include "base.h"
#include "ipcsocket.h"
#include "proxy.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
typedef net::host::scclSocketAddress scclSocketAddress_t;
typedef net::host::scclSocket scclSocket_t;
// scclBootstrapHandle stores the handle used to bootstrap a communicator.
struct scclBootstrapHandle {
uint64_t magic; // magic number used to validate the structure
scclSocketAddress_t addr; // network address used for bootstrap communication
};
struct scclProxyState {
int refCount; // reference count
int tpRank; // rank in the top-parent communicator
int tpnRanks; // total number of ranks in the top-parent communicator
int tpLocalnRanks; // number of local ranks in the top-parent communicator
int cudaDev; // CUDA device number
int p2pnChannels; // number of peer-to-peer channels
int p2pChunkSize; // chunk size for peer-to-peer transfers
int nChannels; // total number of channels
int buffSizes[SCCL_NUM_PROTOCOLS]; // buffer sizes for each protocol
// Service thread
pthread_t thread; // thread ID
scclSocket_t* listenSock; // listening socket
int stop; // stop flag
// Used by the main thread
scclSocketAddress_t* peerAddresses; // peer addresses
scclSocket_t* peerSocks; // peer sockets
struct scclIpcSocket peerIpcSock; // cuMem API support (UDS)
// Progress thread
struct scclProxyProgressState progressState; // progress state
// Queue of responses expected from the proxy
struct scclExpectedProxyResponse* expectedResponses; // expected proxy responses
};
// scclBootstrapComm stores the information needed for bootstrap communication.
struct scclBootstrapComm {
struct scclUniqueInfo unique_info; // base information for each communicating rank
void* bootstrap; // bootstrap state
uint64_t magic; // magic number used to validate the structure
volatile uint32_t* abortFlag; // abort flag
int splitShare; // whether resources are shared when the communicator is split
int* topParentRanks; // ranks within the top-parent communicator
/* shared resources related to the proxy */
struct scclProxyState* proxyState;
};
// extInfo stores the socket information exchanged during bootstrap.
struct extInfo {
int rank; // process rank
int nranks; // total number of processes
scclSocketAddress_t extAddressListenRoot; // listening address used by the root
scclSocketAddress_t extAddressListen; // listening address
};
struct unexConn {
int peer; // identifier of the peer rank
int tag; // tag used to distinguish connections
scclSocket_t sock; // socket used for communication
struct unexConn* next; // next unexpected connection, forming a linked list
};
// bootstrapState stores the bootstrap state of a communicator.
struct bootstrapState {
scclSocket_t listenSock; // listening socket
scclSocket_t ringRecvSocket; // ring receive socket
scclSocket_t ringSendSocket; // ring send socket
scclSocketAddress_t* peerCommAddresses; // peer communication addresses
scclSocketAddress_t* peerProxyAddresses; // peer proxy addresses
struct unexConn* unexpectedConnections; // unexpected connections
int cudaDev; // CUDA device number
int rank; // process rank
int nranks; // total number of processes
uint64_t magic; // magic number used to validate the structure
volatile uint32_t* abortFlag; // abort flag
};
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "ipcsocket.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET
#define SCCL_IPC_SOCKNAME_STR "/tmp/sccl-socket-%d-%lx"
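// With USE_ABSTRACT_SOCKET defined, the leading byte of sun_path is cleared before
// bind()/sendmsg(), placing the name in the Linux abstract socket namespace: no
// filesystem entry is created, and the name disappears automatically once the last
// descriptor bound to it is closed, so no unlink() is needed.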
/**
* @brief Initialize an IPC socket.
*
* Creates a UNIX domain datagram socket and binds it to a unique path. Both Linux
* abstract sockets and regular filesystem sockets are supported.
*
* @param handle Pointer to the scclIpcSocket structure that stores the socket state
* @param rank Process rank, used to build a unique socket name
* @param hash Hash value, combined with rank to build a unique socket name
* @param abortFlag Pointer to the abort flag; if non-NULL, the socket is set to non-blocking mode
* @return scclResult_t scclSuccess on success, or an error code on failure
*/
scclResult_t scclIpcSocketInit(scclIpcSocket* handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
int fd = -1;
struct sockaddr_un cliaddr;
char temp[SCCL_IPC_SOCKNAME_LEN] = "";
if(handle == NULL) {
return scclInternalError;
}
handle->fd = -1;
handle->socketName[0] = '\0';
if((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
WARN("UDS: Socket creation error : %d", errno);
return scclSystemError;
}
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
// Create unique name for the socket.
int len = snprintf(temp, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, rank, hash);
if(len > (int)(sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
close(fd); // avoid leaking the descriptor on the early-return path
return scclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
unlink(temp);
#endif
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Creating socket %s", temp);
strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
if(bind(fd, (struct sockaddr*)&cliaddr, sizeof(cliaddr)) < 0) {
WARN("UDS: Binding to socket %s failed : %d", temp, errno);
close(fd);
return scclSystemError;
}
handle->fd = fd;
strcpy(handle->socketName, temp);
handle->abortFlag = abortFlag;
// Mark socket as non-blocking
if(handle->abortFlag) {
int flags;
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
return scclSuccess;
}
/**
* Close an IPC socket and release its resources.
*
* @param handle Pointer to the scclIpcSocket structure holding the socket to close
* @return scclResult_t Operation result:
* - scclSuccess: operation completed successfully
* - scclInternalError: invalid handle (handle is NULL)
*
* @note If the USE_ABSTRACT_SOCKET macro is defined, the socket file is not unlinked.
* If the socket file descriptor is invalid (fd <= 0), the function returns success immediately.
*/
scclResult_t scclIpcSocketClose(scclIpcSocket* handle) {
if(handle == NULL) {
return scclInternalError;
}
if(handle->fd <= 0) {
return scclSuccess;
}
#ifndef USE_ABSTRACT_SOCKET
if(handle->socketName[0] != '\0') {
unlink(handle->socketName);
}
#endif
close(handle->fd);
return scclSuccess;
}
/**
* Receive a file descriptor over the IPC socket.
*
* @param handle Pointer to the scclIpcSocket structure holding the socket state
* @param recvFd Pointer that receives the incoming file descriptor
* @return scclResult_t Operation result:
* - scclSuccess: file descriptor received successfully
* - scclSystemError: a system call failed
* - scclInternalError: the operation was aborted
*
* @note This function blocks until data arrives or an error occurs.
* @warning The caller must ensure recvFd points to valid memory.
*/
*/
scclResult_t scclIpcSocketRecvFd(scclIpcSocket* handle, int* recvFd) {
struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
struct iovec iov[1];
// Union to guarantee alignment requirements for control array
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr* cmptr;
char dummy_buffer[1];
int ret;
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
iov[0].iov_base = (void*)dummy_buffer;
iov[0].iov_len = sizeof(dummy_buffer);
msg.msg_iov = iov;
msg.msg_iovlen = 1;
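// In non-blocking mode (abortFlag set), recvmsg() keeps returning -1 with EAGAIN or
// EWOULDBLOCK until a datagram arrives; retry on those (and EINTR) while honoring an
// abort request from the caller.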
while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Receiving data over socket failed : %d", errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
}
if(((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
if((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
WARN("UDS: Receiving data over socket failed");
return scclSystemError;
}
memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
} else {
WARN("UDS: Receiving data over socket %s failed", handle->socketName);
return scclSystemError;
}
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
return scclSuccess;
}
/**
* Send a file descriptor over a UNIX domain socket.
*
* @param handle IPC socket handle
* @param sendFd File descriptor to send
* @param rank Rank of the destination, used to build the destination socket name
* @param hash Hash value used to build the destination socket name
*
* @return scclSuccess on success, otherwise an error code:
* - scclInternalError: internal error (e.g. name too long, or the operation was aborted)
* - scclSystemError: a system call failed
*
* @note The file descriptor is sent in a control message using the SCM_RIGHTS mechanism.
* On Linux, the abstract socket namespace is supported (when USE_ABSTRACT_SOCKET is defined).
*/
*/
scclResult_t scclIpcSocketSendFd(scclIpcSocket* handle, const int sendFd, int rank, uint64_t hash) {
struct msghdr msg;
struct iovec iov[1];
char temp[SCCL_IPC_SOCKNAME_LEN];
union {
struct cmsghdr cm;
char control[CMSG_SPACE(sizeof(int))];
} control_un;
struct cmsghdr* cmptr;
struct sockaddr_un cliaddr;
// Construct client address to send this shareable handle to
bzero(&cliaddr, sizeof(cliaddr));
cliaddr.sun_family = AF_UNIX;
int len = snprintf(temp, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, rank, hash);
if(len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot connect to provided name for socket. Name too large");
return scclInternalError;
}
(void)strncpy(cliaddr.sun_path, temp, len);
INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
#ifdef USE_ABSTRACT_SOCKET
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
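// Attach the descriptor as ancillary data: when the kernel sees an SCM_RIGHTS control
// message, it duplicates sendFd into the receiving process's descriptor table.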
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void*)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
iov[0].iov_base = (void*)"";
iov[0].iov_len = 1;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_flags = 0;
ssize_t sendResult;
while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
return scclSystemError;
}
if(handle->abortFlag && *handle->abortFlag)
return scclInternalError;
}
return scclSuccess;
}
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <memory.h>
#include <sys/un.h>
#include <inttypes.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
#define SCCL_IPC_SOCKNAME_LEN 64
// IPC socket structure
struct scclIpcSocket {
int fd; // file descriptor
char socketName[SCCL_IPC_SOCKNAME_LEN]; // socket name
volatile uint32_t* abortFlag; // flag used to abort pending operations
};
// Initialize an IPC socket
scclResult_t scclIpcSocketInit(struct scclIpcSocket* handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
// Close an IPC socket
scclResult_t scclIpcSocketClose(struct scclIpcSocket* handle);
// Receive a file descriptor
scclResult_t scclIpcSocketRecvFd(struct scclIpcSocket* handle, int* fd);
// Send a file descriptor
scclResult_t scclIpcSocketSendFd(struct scclIpcSocket* handle, const int fd, int rank, uint64_t hash);
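// Usage sketch (illustrative only; fdToShare and the rank/hash values are placeholders):
// the receiver binds a socket under a (rank, hash) pair agreed on out of band, and the
// sender addresses that same pair, so the descriptor travels via SCM_RIGHTS.
//
// // Receiver side (rank 0):
// struct scclIpcSocket in = {0};
// scclIpcSocketInit(&in, /*rank=*/0, /*hash=*/0x1234, /*abortFlag=*/NULL);
// int fd = -1;
// scclIpcSocketRecvFd(&in, &fd); // blocks until the peer sends a descriptor
// scclIpcSocketClose(&in);
//
// // Sender side: bind a socket of its own, then target the receiver's (rank, hash).
// struct scclIpcSocket out = {0};
// scclIpcSocketInit(&out, /*rank=*/1, /*hash=*/0x5678, NULL);
// scclIpcSocketSendFd(&out, fdToShare, /*rank=*/0, /*hash=*/0x1234);
// scclIpcSocketClose(&out);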
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <sys/syscall.h>
#include <assert.h>
#include "proxy.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {}
} // namespace topology
} // namespace hardware
} // namespace sccl
// static bool NeedProxy(int type, int pattern, int root, struct scclRing* ring, int nranks) {
// if(pattern == scclPatternRing || pattern == scclPatternRingTwice)
// return true;
// /* In chains, one rank does not need a proxy. Let's figure out which one it is */
// /* Which index in the reorganized rings should we compare root against */
// const int myrank = 0, nextrank = 1, prevrank = nranks - 1;
// int index = pattern == scclPatternPipelineFrom ?
// /* no recv / no send if root = */
// /* bcast */ (type == proxyRecv ? myrank : nextrank)
// :
// /* reduce */ (type == proxyRecv ? prevrank : myrank);
// int rank = ring->userRanks[index];
// return (root != rank);
// }
// #define PROXYARGS_ALLOCATE_SIZE SCCL_MAX_OPS
// struct scclProxyPool {
// struct scclProxyPool* next;
// struct scclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
// };
// static void expectedProxyResponseFree(struct scclProxyState* state) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// while(elem) {
// prev = elem;
// elem = elem->next;
// free(prev->respBuff);
// free(prev);
// }
// }
// static scclResult_t expectedProxyResponseStore(struct scclProxyState* state, void* opId, void* respBuff, int respSize) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// while(elem) {
// if(elem->opId == opId) {
// if(respSize != elem->respSize) {
// WARN("Mismatched response size for opId=%p", opId);
// return scclInternalError;
// }
// if(elem->done) {
// WARN("Storing response for already completed opId=%p", opId);
// return scclInternalError;
// }
// memcpy(elem->respBuff, respBuff, respSize);
// free(respBuff);
// elem->done = true;
// return scclSuccess;
// }
// elem = elem->next;
// }
// WARN("Proxy response for opId=%p doesn't match any expected response", opId);
// return scclInternalError;
// }
// static scclResult_t expectedProxyResponseEnqueue(struct scclProxyState* state, void* opId, int respSize) {
// struct scclExpectedProxyResponse* ex;
// scclCHECK(scclCalloc(&ex, 1));
// ex->opId = opId;
// // Pre-alloc response buffer
// ex->respBuff = malloc(respSize);
// ex->respSize = respSize;
// ex->done = false;
// // Enqueue
// struct scclExpectedProxyResponse* list = state->expectedResponses;
// if(list == NULL) {
// state->expectedResponses = ex;
// return scclSuccess;
// }
// while(list->next)
// list = list->next;
// list->next = ex;
// return scclSuccess;
// }
// static scclResult_t expectedProxyResponseDequeue(struct scclProxyState* state, void* opId, void* respBuff, int* found) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// *found = 0;
// while(elem) {
// if((elem->opId == opId) && elem->done) {
// if(prev == NULL) {
// state->expectedResponses = elem->next;
// } else {
// prev->next = elem->next;
// }
// memcpy(respBuff, elem->respBuff, elem->respSize);
// free(elem->respBuff);
// free(elem);
// *found = 1;
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// return scclSuccess;
// }
// static scclResult_t expectedProxyResponseRemove(struct scclProxyState* state, void* opId) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// while(elem) {
// if(elem->opId == opId) {
// if(prev == NULL) {
// state->expectedResponses = elem->next;
// } else {
// prev->next = elem->next;
// }
// free(elem->respBuff);
// free(elem);
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// WARN("Couldn't find opId=%p", opId);
// return scclInternalError;
// }
// static scclResult_t asyncProxyOpEnqueue(struct scclProxyLocalPeer* peer, scclProxyAsyncOp* op) {
// scclProxyAsyncOp* list = peer->asyncOps;
// if(list == NULL) {
// peer->asyncOps = op;
// return scclSuccess;
// }
// while(list->next)
// list = list->next;
// list->next = op;
// return scclSuccess;
// }
// static scclResult_t asyncProxyOpDequeue(struct scclProxyLocalPeer* peer, scclProxyAsyncOp* op) {
// struct scclProxyAsyncOp* elem = peer->asyncOps;
// struct scclProxyAsyncOp* prev = NULL;
// while(elem) {
// if(elem->opId == op->opId) {
// if(prev == NULL) {
// peer->asyncOps = elem->next;
// } else {
// prev->next = elem->next;
// }
// if(elem->reqBuff) {
// free(elem->reqBuff);
// }
// if(elem->respBuff) {
// free(elem->respBuff);
// }
// free(elem);
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// if(op) {
// WARN("Attempting to dequeue nonexistent async opId=%p", op->opId);
// } else {
// WARN("Attempting to dequeue null operation");
// }
// return scclInternalError;
// }
// static scclResult_t allocateArgs(struct scclProxyProgressState* state, struct scclProxyArgs** argsptr) {
// struct scclProxyArgs* elem;
// if(state->pool == NULL) {
// // Allocate a new pool of elements. Make sure we allocate the memory close
// // to the network thread
// struct scclProxyPool* newPool;
// scclCHECK(scclCalloc(&newPool, 1));
// struct scclProxyArgs* newElems = newPool->elems;
// // Chain newly allocated elements
// for(int i = 0; i < PROXYARGS_ALLOCATE_SIZE; i++) {
// if(i + 1 < PROXYARGS_ALLOCATE_SIZE)
// newElems[i].next = newElems + i + 1;
// }
// // Add them all to the pool list
// state->pool = newElems;
// // Save the pool memory block for later resource release
// newPool->next = state->pools;
// state->pools = newPool;
// }
// elem = state->pool;
// state->pool = state->pool->next;
// elem->next = elem->nextPeer = NULL;
// *argsptr = elem;
// return scclSuccess;
// }
// // #define DEBUG_PROXY 1
// #ifdef DEBUG_PROXY
// #define DEBUG_PROXY_PRINT printf
// #else
// #define DEBUG_PROXY_PRINT(...)
// #endif
// #define OP_INDEX(op) ((op) ? (op) - state->pools->elems : -1)
// #define OP_SEEN 0x100000
// scclResult_t getOpIndex(struct scclProxyArgs* op, struct scclProxyProgressState* state, int* poolIndex, int* opIndex) {
// struct scclProxyPool* pool = state->pools;
// int p = 0;
// while(pool) {
// uint64_t o = op - pool->elems;
// if(o < PROXYARGS_ALLOCATE_SIZE) {
// *opIndex = o;
// *poolIndex = p;
// return scclSuccess;
// }
// pool = pool->next;
// p++;
// }
// WARN("Could not find pool of op %p", op);
// return scclInternalError;
// }
// scclResult_t printProxyOp(struct scclProxyArgs* op, int poolIndex, int opIndex) {
// printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == scclPatternSend ? "Send" : op->pattern == scclPatternRecv ? "Recv" : "Coll");
// for(int s = 0; s < op->nsubs; s++) {
// struct scclProxySubArgs* sub = op->subs + s;
// if(op->state == scclProxyOpProgress) {
// char status = ' ';
// if(op->pattern == scclPatternRecv) {
// if(sub->posted < sub->nsteps && sub->posted < sub->done + SCCL_STEPS)
// status = 'I'; // Init
// else if(sub->received < sub->posted)
// status = 'R'; // Receiving
// else if(sub->received < sub->transmitted)
// status = 'R'; // Receiving
// else if(sub->transmitted < sub->received)
// status = 'F'; // Flushing
// else if(sub->done < sub->transmitted)
// status = 'G'; // Waiting on GPU
// else
// status = 'D'; // Done
// } else if(op->pattern == scclPatternSend) {
// if(sub->posted < sub->nsteps && sub->posted < sub->done + SCCL_STEPS)
// status = 'I'; // Init
// else if(sub->transmitted < sub->posted)
// status = 'G'; // Waiting on GPU
// else if(sub->done < sub->transmitted)
// status = 'S'; // Sending
// else
// status = 'D'; // Done
// }
// printf(" %d%c/%d", sub->peer, status, sub->channelId);
// } else {
// printf(" %d/%d", sub->peer, sub->channelId);
// }
// }
// printf("]");
// return scclSuccess;
// }
// scclResult_t dumpProxyState(struct scclProxyProgressState* state) {
// struct scclProxyArgs* op = state->active;
// int poolIndex, opIndex;
// printf("ACTIVE OPS\n");
// while(op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if(op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// scclCHECK(printProxyOp(op, poolIndex, opIndex));
// op->state |= OP_SEEN;
// printf("\n");
// struct scclProxyArgs* nextOp = op->nextPeer;
// while(nextOp) {
// scclCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
// if(nextOp->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// printf("| `-> ");
// scclCHECK(printProxyOp(nextOp, poolIndex, opIndex));
// nextOp->state |= OP_SEEN;
// printf("\n");
// if(nextOp->next) {
// WARN("Inactive op has next set!");
// }
// nextOp = nextOp->nextPeer;
// }
// if(op->nextPeer == NULL)
// printf("|\n");
// op = op->next;
// printf("v\n");
// }
// printf("[X]\n");
// #if 0
// printf("FREE OPS\n");
// op = state->pool;
// while (op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if (op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// scclCHECK(printProxyOp(op, poolIndex, opIndex));
// op->state |= OP_SEEN;
// printf("->");
// op = op->next;
// }
// printf("[X]\n");
// #else
// op = state->pool;
// while(op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if(op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// op->state |= OP_SEEN;
// op = op->next;
// }
// #endif
// struct scclProxyPool* pool = state->pools;
// poolIndex = 0;
// while(pool) {
// struct scclProxyArgs* elem = pool->elems;
// for(int e = 0; e < PROXYARGS_ALLOCATE_SIZE; e++, elem++) {
// if((elem->state & OP_SEEN) == 0) {
// printf("Elem %d-%d is not in any list:\n", poolIndex, e);
// scclCHECK(printProxyOp(elem, poolIndex, e));
// printf("\n");
// } else {
// elem->state -= OP_SEEN;
// }
// }
// pool = pool->next;
// poolIndex++;
// }
// return scclSuccess;
// }
// static scclResult_t scclProxyOpToArgs(struct scclProxyOp* op, struct scclProxyArgs* args, int subIndex) {
// struct scclProxySubArgs* sub = args->subs + subIndex;
// if(subIndex >= SCCL_PROXY_MAX_SUBS) {
// WARN("Proxy append out of bounds");
// return scclInternalError;
// }
// // memset(sub, 0, sizeof(struct scclProxySubArgs));
// sub->connection = op->connection;
// sub->channelId = op->channelId;
// sub->nsteps = op->nsteps;
// sub->nbytes = op->nbytes;
// sub->peer = op->root;
// args->nsubs = subIndex + 1;
// if(subIndex) {
// if((args->sliceSteps != op->sliceSteps) || (args->chunkSteps != op->chunkSteps) || (args->protocol != op->protocol) || (args->dtype != op->dtype) ||
// (args->redOp != op->redOp)) {
// WARN("Proxy append mismatch");
// return scclInternalError;
// }
// if(args->state != scclProxyOpReady) {
// WARN("Proxy append on running operation");
// return scclInternalError;
// }
// return scclSuccess;
// }
// // memset(&args->progress, 0, sizeof(struct scclProxyArgs)-offsetof(struct scclProxyArgs, progress));
// args->done = 0;
// args->opCount = op->opCount;
// args->sliceSteps = op->sliceSteps;
// args->chunkSteps = op->chunkSteps;
// args->chunkSize = op->chunkSize;
// args->dtype = op->dtype;
// args->redOp = op->redOp;
// args->pattern = op->pattern;
// args->protocol = op->protocol;
// args->state = scclProxyOpReady;
// args->progress = op->connection->tcomm->proxyProgress;
// args->proxyAppendPtr = op->connection->proxyAppendPtr;
// return scclSuccess;
// }
// static scclResult_t ProxyAppend(struct scclProxyProgressState* state, struct scclProxyOp* op) {
// struct scclProxyConnection* connection = op->connection;
// int shared = connection->shared;
// struct scclProxyArgs* args = *connection->proxyAppendPtr;
// if(args) {
// if(shared && args->opCount == op->opCount) {
// scclCHECK(scclProxyOpToArgs(op, args, args->nsubs));
// DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args));
// } else {
// struct scclProxyArgs* prevArgs = args;
// scclCHECK(allocateArgs(state, &args));
// scclCHECK(scclProxyOpToArgs(op, args, 0));
// prevArgs->nextPeer = args;
// DEBUG_PROXY_PRINT(
// "Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs));
// *(args->proxyAppendPtr) = args;
// }
// } else {
// // Nothing running for that peer. Add to the list
// scclCHECK(allocateArgs(state, &args));
// scclCHECK(scclProxyOpToArgs(op, args, 0));
// if(state->active == NULL) {
// // Create the list
// DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount);
// state->active = args;
// } else {
// // Append element at the end of the list
// struct scclProxyArgs* last = state->active;
// while(last->next)
// last = last->next;
// last->next = args;
// DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount);
// }
// *(args->proxyAppendPtr) = args;
// }
// return scclSuccess;
// }
// scclResult_t scclProxyPost(struct scclProxyOpsPool* pool, int nextOps, int nextOpsEnd) {
// pthread_mutex_lock(&pool->mutex);
// if(pool->nextOps == -1) {
// pool->nextOps = nextOps;
// pthread_cond_signal(&pool->cond);
// } else {
// pool->ops[pool->nextOpsEnd].next = nextOps;
// }
// pool->nextOpsEnd = nextOpsEnd;
// pthread_mutex_unlock(&pool->mutex);
// return scclSuccess;
// }
// static scclResult_t scclLocalOpAppend(struct scclComm* comm, struct scclProxyConnector* proxyConn, struct scclProxyOp* proxyOp) {
// int tpLocalRank = comm->topParentLocalRanks[comm->localRank];
// struct scclProxyOps* proxyOps = comm->proxyState->proxyOps;
// if(proxyOps == NULL)
// return scclInternalError;
// proxyOps += proxyConn->tpLocalRank;
// struct scclProxyOpsPool* pool = proxyOps->pool;
// TIME_START(0);
// int opIndex = proxyOps->freeOp;
// struct scclProxyOp* op;
// if(opIndex != -1) {
// op = pool->ops + opIndex;
// proxyOps->freeOp = op->next;
// } else {
// int freeOp;
// while((freeOp = pool->freeOps[tpLocalRank]) == -1)
// sched_yield();
// int freeOpNew;
// while((freeOpNew = __sync_val_compare_and_swap(pool->freeOps + tpLocalRank, freeOp, -1)) != freeOp)
// freeOp = freeOpNew;
// opIndex = freeOp;
// op = pool->ops + opIndex;
// proxyOps->freeOp = op->next;
// }
// if(op->next != -1)
// __builtin_prefetch(pool->ops + op->next); // Prefetch next free op
// memcpy(op, proxyOp, sizeof(struct scclProxyOp));
// op->next = -1;
// op->connection = proxyConn->connection;
// if(proxyOps->nextOps == -1) {
// proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex;
// } else {
// pool->ops[proxyOps->nextOpsEnd].next = opIndex;
// proxyOps->nextOpsEnd = opIndex;
// }
// if(++proxyOps->count == MAX_OPS_PER_PEER) {
// // Post what we have so far to free some ops in the pool
// // Do not post last operations as we could have more coming with the same opCount, and posting
// // them in different batches would break proxyArgs aggregation with subs.
// uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount;
// int lastOp = -1;
// int toSend = 0;
// int ops = 0;
// for(int op = proxyOps->nextOps; op != proxyOps->nextOpsEnd; op = pool->ops[op].next) {
// ops++;
// if(pool->ops[op].opCount != lastOpCount) {
// lastOp = op;
// toSend = ops;
// }
// }
// if(lastOp == -1) {
// WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
// return scclInternalError;
// }
// // Cut chain at lastOp
// int nextOps = proxyOps->nextOps;
// proxyOps->nextOps = pool->ops[lastOp].next;
// pool->ops[lastOp].next = -1;
// scclCHECK(scclProxyPost(proxyOps->pool, nextOps, lastOp));
// proxyOps->count -= toSend;
// }
// TIME_STOP(0);
// return scclSuccess;
// }
// static scclResult_t
// SaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex, bool* justInquire) {
// if(peer < 0)
// return scclSuccess;
// struct scclChannelPeer* peerComm = channel->peers[peer];
// struct scclConnector* connector = type == proxyRecv ? peerComm->recv + connIndex : peerComm->send + connIndex;
// if(connector->transportComm == NULL) {
// WARN("Rank %d has no transport for %s peer %d on channel %d/%d", comm->rank, type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
// return scclInternalError;
// }
// if(connector->transportComm->proxyProgress == NULL)
// return scclSuccess;
// if(justInquire)
// *justInquire = true;
// else {
// scclCHECK(scclLocalOpAppend(comm, &connector->proxyConn, op));
// }
// return scclSuccess;
// }
// scclResult_t mscclSaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex) {
// scclCHECK(SaveProxy(comm, channel, type, peer, op, connIndex, nullptr));
// return scclSuccess;
// }
// // justInquire != nullptr means don't actually do anything, just ascertain the need of
// // scclProxySaveOp for this op.
// scclResult_t scclProxySaveOp(struct scclComm* comm, struct scclProxyOp* op, bool* justInquire) {
// struct scclChannel* channel = &comm->channels[op->channelId];
// if(justInquire)
// *justInquire = false;
// switch(op->pattern) {
// case scclPatternRing:
// case scclPatternRingTwice:
// case scclPatternPipelineFrom:
// case scclPatternPipelineTo: {
// struct scclRing* ring = &channel->ring;
// if(NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
// }
// if(NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
// scclCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, op->connIndex, justInquire));
// }
// } break;
// case scclPatternTreeUp:
// case scclPatternTreeDown:
// case scclPatternTreeUpDown: {
// if(op->pattern != scclPatternTreeDown) { // Tree up
// struct scclTree* tree = &channel->tree;
// for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++) {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, tree->down[i], op, 0, justInquire));
// }
// scclCHECK(SaveProxy(comm, channel, proxySend, tree->up, op, 0, justInquire));
// }
// if(op->pattern != scclPatternTreeUp) { // Tree down
// struct scclTree* tree = &channel->tree;
// for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++) {
// scclCHECK(SaveProxy(comm, channel, proxySend, tree->down[i], op, 0, justInquire));
// }
// scclCHECK(SaveProxy(comm, channel, proxyRecv, tree->up, op, 0, justInquire));
// }
// } break;
// case scclPatternCollnetChain: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->collnetChain.up, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire));
// } break;
// case scclPatternCollnetDirect: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->collnetDirect.out, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire));
// } break;
// case scclPatternNvls: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.out, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.out, op, 0, justInquire));
// } break;
// case scclPatternNvlsTree: {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[1], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[2], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeUp, op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[1], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
// } break;
// case scclPatternSend:
// case scclPatternRecv: {
// if(op->root == comm->rank)
// return scclSuccess;
// scclCHECK(SaveProxy(comm, channel, op->pattern == scclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
// } break;
// }
// return scclSuccess;
// }
// SCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
// scclResult_t scclProxyComputeP2p(struct scclInfo* info, struct scclProxyOp* op) {
// memset(op, 0, sizeof(struct scclProxyOp));
// int channelId = info->channelId;
// struct scclChannel* channel = info->comm->channels + channelId;
// op->channelId = channelId;
// op->sliceSteps = P2P_SLICESTEPS;
// op->chunkSteps = P2P_CHUNKSTEPS;
// op->dtype = info->datatype;
// op->protocol = info->protocol;
// int stepSize = info->comm->buffSizes[op->protocol] / SCCL_STEPS;
// if(op->protocol == SCCL_PROTO_SIMPLE)
// stepSize = info->comm->p2pChunkSize;
// #ifdef HCU_SDMA_FEATURE
// info->chunkSize = info->comm->p2pRealChunkSize;
// #else
// info->chunkSize = stepSize;
// #endif
// op->root = info->root;
// struct scclChannelPeer* peer = channel->peers[op->root];
// if(info->coll == scclFuncSend) {
// op->pattern = scclPatternSend;
// if(op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
// // Tune chunk size for the network
// if(info->count < stepSize)
// info->chunkSize /= 4;
// else if(info->count < 8 * stepSize)
// info->chunkSize /= 2;
// }
// } else if(info->coll == scclFuncRecv) {
// op->pattern = scclPatternRecv;
// if(op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
// // Tune chunk size for the network
// if(info->count < stepSize)
// info->chunkSize /= 4;
// else if(info->count < 8 * stepSize)
// info->chunkSize /= 2;
// }
// } else {
// WARN("P2p operation is neither send or recv");
// return scclInternalError;
// }
// if(scclParamChunkSize() != 0) {
// info->chunkSize = scclParamChunkSize();
// }
// op->chunkSize = info->chunkSize;
// // Compute nSteps for proxies
// int chunkEffectiveSize = op->chunkSize;
// if(op->protocol == SCCL_PROTO_LL) {
// chunkEffectiveSize /= 2;
// }
// op->nbytes = stepSize;
// op->nsteps = DIVUP(info->count, chunkEffectiveSize);
// if(op->nsteps == 0)
// op->nsteps = 1;
// return scclSuccess;
// }
// static scclResult_t removeOp(struct scclProxyProgressState* state, struct scclProxyArgs** opPtr, struct scclProxyArgs** prevOpPtr) {
// struct scclProxyArgs* freeOp = *opPtr;
// struct scclProxyArgs* next = freeOp->next;
// DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next));
// *opPtr = next;
// if(freeOp->nextPeer) {
// // replace op by nextPeer
// struct scclProxyArgs* nextPeer = freeOp->nextPeer;
// if(*prevOpPtr) {
// (*prevOpPtr)->next = nextPeer;
// } else {
// state->active = nextPeer;
// }
// nextPeer->next = next;
// *(prevOpPtr) = nextPeer;
// } else {
// *(freeOp->proxyAppendPtr) = NULL;
// if(*prevOpPtr) {
// (*prevOpPtr)->next = next;
// } else {
// state->active = next;
// }
// }
// freeOp->next = state->pool;
// state->pool = freeOp;
// DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
// #ifdef DEBUG_PROXY
// scclCHECK(dumpProxyState(state));
// #endif
// return scclSuccess;
// }
// static scclResult_t progressOps(struct scclProxyState* proxyState, struct scclProxyProgressState* state, struct scclProxyArgs* opStart, int* idle) {
// struct scclProxyArgs* prevOp = NULL;
// struct scclProxyArgs* op = opStart;
// while(op) {
// if(op->state == scclProxyOpNone)
// return scclInternalError;
// TIME_START(0);
// TIME_START(1);
// scclCHECK(op->progress(proxyState, op));
// if(op->idle) {
// TIME_STOP(1);
// TIME_CANCEL(0);
// } else {
// TIME_CANCEL(1);
// TIME_STOP(0);
// }
// *idle &= op->idle;
// if(op->state == scclProxyOpNone) {
// TIME_START(2);
// scclCHECK(removeOp(state, &op, &prevOp));
// TIME_STOP(2);
// } else {
// prevOp = op;
// op = op->next;
// }
// }
// return scclSuccess;
// }
// SCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
// static scclResult_t scclProxyGetPostedOps(struct scclProxyState* proxyState, int* added) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(state->opsPool == NULL)
// return scclInternalError;
// struct scclProxyOpsPool* pool = state->opsPool;
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// if(state->nextOps != -1)
// goto process_nextops;
// // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
// // to be available. Exit, continue progress, and come back later.
// if(state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0))
// return scclSuccess;
// if(state->active == NULL) {
// pthread_mutex_lock(&pool->mutex);
// while(pool->nextOps == -1 && !state->stop) {
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileSleep);
// pthread_cond_wait(&pool->cond, &pool->mutex);
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileWakeup);
// }
// if(state->stop) { // We might have been woken up to stop.
// pthread_mutex_unlock(&pool->mutex);
// return scclSuccess;
// }
// }
// state->nextOps = pool->nextOps;
// pool->nextOps = pool->nextOpsEnd = -1;
// pthread_mutex_unlock(&pool->mutex);
// if(state->nextOps == -1)
// return scclInternalError;
// process_nextops:
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileAppend);
// TIME_START(2);
// int freeOp[SCCL_MAX_LOCAL_RANKS];
// int freeOpEnd[SCCL_MAX_LOCAL_RANKS];
// for(int i = 0; i < proxyState->tpLocalnRanks; i++)
// freeOp[i] = -1;
// uint64_t lastOpCount = 0;
// int lastPeer = -1;
// int count = 0;
// for(int opIndex = state->nextOps; opIndex != -1;) {
// struct scclProxyOp* peerOp = pool->ops + opIndex;
// int peer = opIndex / MAX_OPS_PER_PEER;
// if((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer))
// count++;
// if(count == scclParamProxyAppendBatchSize() + 1)
// break;
// lastOpCount = peerOp->opCount;
// lastPeer = peer;
// if(peerOp->connection == NULL)
// return scclInternalError;
// if(peerOp->next != -1)
// __builtin_prefetch(pool->ops + peerOp->next);
// scclCHECK(ProxyAppend(state, peerOp));
// (*added)++;
// int lastOpIndex = opIndex;
// opIndex = peerOp->next;
// // Return op to peer pool
// if(freeOp[peer] == -1) {
// freeOpEnd[peer] = lastOpIndex;
// } else {
// peerOp->next = freeOp[peer];
// }
// freeOp[peer] = lastOpIndex;
// state->nextOps = opIndex;
// }
// for(int i = 0; i < proxyState->tpLocalnRanks; i++) {
// if(freeOp[i] == -1)
// continue;
// int newFree = freeOp[i];
// int oldFree = pool->freeOps[i];
// pool->ops[freeOpEnd[i]].next = oldFree;
// if(oldFree == -1) {
// // Nothing for the main thread to consume, we can set it.
// pool->freeOps[i] = newFree;
// } else {
// // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked.
// int swap = __sync_val_compare_and_swap(pool->freeOps + i, oldFree, newFree);
// if(swap != oldFree) {
// if(swap != -1)
// return scclInternalError;
// // Ops were recycled while we were trying to swap, just set the value directly now.
// pool->ops[freeOpEnd[i]].next = -1;
// pool->freeOps[i] = newFree;
// }
// }
// }
// profArgs.opCount = *added;
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileAppendEnd);
// TIME_STOP(2);
// return scclSuccess;
// }
// #include <signal.h>
// static scclProxyProgressState* scclLastProxyState;
// void scclDumpProxyState(int signal) { dumpProxyState(scclLastProxyState); }
// SCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);
// static int setProxyThreadContext(struct scclProxyState* proxyState) {
// #if CUDART_VERSION >= 11030
// static int createThreadContext = -1;
// if(createThreadContext == -1) {
// createThreadContext = scclParamCreateThreadContext();
// if(createThreadContext) {
// if(CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
// WARN("Unable to create thread context due to old driver, disabling.");
// createThreadContext = 0;
// }
// }
// }
// if(createThreadContext) {
// if(proxyState->cudaCtx == NULL) {
// if(CUPFN(cuCtxCreate(&proxyState->cudaCtx, CU_CTX_SCHED_SPIN | CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
// WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
// createThreadContext = 0;
// }
// } else {
// if(CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
// WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
// return 0;
// }
// return 1;
// }
// }
// #endif
// return 0;
// }
// // Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs
// SCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1);
// SCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
// void* scclProxyProgress(void* proxyState_) {
// struct scclProxyState* proxyState = (struct scclProxyState*)proxyState_;
// if(setProxyThreadContext(proxyState)) {
// INFO(SCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
// } else if(cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
// WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
// }
// // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// struct scclProxyProgressState* state = &proxyState->progressState;
// state->nextOps = -1;
// const int sig = scclParamProxyDumpSignal();
// if(sig != -1)
// signal(sig, scclDumpProxyState);
// scclLastProxyState = state;
// char threadName[SCCL_THREAD_NAMELEN];
// snprintf(threadName, SCCL_THREAD_NAMELEN, "sccl Progress%2d", proxyState->cudaDev);
// nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
// int lastIdle = 0;
// /* Calling scclProxyGetPostedOps() too frequently causes a perf regression for small-message
// * communication. proxyOpAppendCounter helps decide when to append proxy ops: after each
// * progress pass it is incremented and compared against the environment variable
// * scclParamProgressAppendOpFreq(); when they are equal, proxy ops are appended. This lowers
// * the call frequency of scclProxyGetPostedOps() and reduces the perf impact. */
// int proxyOpAppendCounter = 0;
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// while((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) {
// int idle = 1;
// scclResult_t ret = progressOps(proxyState, state, state->active, &idle);
// if(ret != scclSuccess) {
// INFO(SCCL_ALL, "%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
// return NULL;
// }
// if(lastIdle == 0 && idle == 1)
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileIdle);
// if(lastIdle == 1 && idle == 0)
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileActive);
// if(idle || (++proxyOpAppendCounter == scclParamProgressAppendOpFreq())) {
// int added = 0;
// proxyOpAppendCounter = 0;
// TIME_START(3);
// if(state->stop == false)
// ret = scclProxyGetPostedOps(proxyState, &added);
// if(added) {
// TIME_STOP(3);
// } else {
// TIME_CANCEL(3);
// }
// if(ret != scclSuccess) {
// INFO(SCCL_ALL, "%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
// }
// if(added == 0) {
// sched_yield(); // No request progressed. Let others run.
// }
// }
// lastIdle = idle;
// }
// return NULL;
// }
// scclResult_t scclProxyStart(struct scclComm* comm) {
// struct scclProxyOps* proxyOps = comm->proxyState->proxyOps;
// if(proxyOps == NULL)
// return scclSuccess;
// TIME_START(1);
// for(int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) {
// struct scclProxyOps* ops = proxyOps + r;
// if(ops->pool == NULL || ops->nextOps == -1)
// continue;
// scclCHECK(scclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd));
// ops->nextOps = ops->nextOpsEnd = -1;
// ops->count = 0;
// }
// comm->opCount++;
// TIME_STOP(1);
// return scclSuccess;
// }
// static scclResult_t scclProxyProgressCreate(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(!state->thread) {
// pthread_create(&state->thread, NULL, scclProxyProgress, proxyState);
// scclSetThreadName(state->thread, "sccl Progress%2d", proxyState->tpLocalnRanks);
// }
// return scclSuccess;
// }
// scclResult_t scclProxyProgressDestroy(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// // Request the proxy to stop and then wake it
// if(state->opsPool) {
// pthread_mutex_lock(&state->opsPool->mutex);
// state->stop = true;
// pthread_cond_signal(&state->opsPool->cond);
// pthread_mutex_unlock(&state->opsPool->mutex);
// pthread_join(state->thread, NULL);
// }
// // Free off any memory allocated for the proxy arg pools
// while(state->pools != NULL) {
// struct scclProxyPool* next = state->pools->next;
// free(state->pools);
// state->pools = next;
// }
// scclProfilingDump();
// TIME_PRINT("Proxy");
// return scclSuccess;
// }
// #define SCCL_PROXY_CONN_POOL_SIZE_POW2 7
// #define SCCL_PROXY_CONN_POOL_SIZE (1 << (SCCL_PROXY_CONN_POOL_SIZE_POW2))
// #define SCCL_PROXY_CONN_POOL_MASK ((SCCL_PROXY_CONN_POOL_SIZE) - 1)
// struct scclProxyConnectionPool {
// struct scclProxyConnection** pools;
// int banks;
// int offset;
// };
// static scclResult_t scclProxyNewConnection(struct scclProxyConnectionPool* pool, int* id) {
// if(pool->offset == SCCL_PROXY_CONN_POOL_SIZE) {
// scclCHECK(scclRealloc(&pool->pools, pool->banks, pool->banks + 1));
// scclCHECK(scclCalloc(pool->pools + pool->banks, SCCL_PROXY_CONN_POOL_SIZE));
// pool->banks++;
// pool->offset = 0;
// }
// *id = ((pool->banks - 1) << SCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset;
// pool->offset++;
// return scclSuccess;
// }
// static scclResult_t scclProxyGetConnection(struct scclProxyConnectionPool* pool, int id, struct scclProxyConnection** conn) {
// int bank = id >> SCCL_PROXY_CONN_POOL_SIZE_POW2;
// int offset = id & SCCL_PROXY_CONN_POOL_MASK;
// if((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL))
// return scclInternalError;
// *conn = pool->pools[bank] + offset;
// return scclSuccess;
// }
// static scclResult_t proxyFree(struct scclProxyConnection* connection, struct scclProxyState* proxyState) {
// if(connection->send) {
// if(scclTransports[connection->transport]->send.proxyFree) {
// scclCHECK(scclTransports[connection->transport]->send.proxyFree(connection, proxyState));
// }
// } else {
// if(scclTransports[connection->transport]->recv.proxyFree) {
// scclCHECK(scclTransports[connection->transport]->recv.proxyFree(connection, proxyState));
// }
// }
// return scclSuccess;
// }
// static scclResult_t scclProxyFreeConnections(struct scclProxyConnectionPool* pool, struct scclProxyState* proxyState) {
// for(int b = 0; b < pool->banks; b++) {
// int max = b == pool->banks - 1 ? pool->offset : SCCL_PROXY_CONN_POOL_SIZE;
// for(int i = 0; i < max; i++) {
// scclProxyConnection* connection = pool->pools[b] + i;
// if(connection->state != connUninitialized) {
// scclCHECK(proxyFree(connection, proxyState));
// }
// }
// free(pool->pools[b]);
// }
// free(pool->pools);
// return scclSuccess;
// }
// #include "transport.h"
// struct scclProxyInitReq {
// int transport;
// int send;
// int tpLocalRank;
// int tpRank;
// int sameProcess;
// };
// struct scclProxyInitResp {
// scclProxyConnection* connection;
// char devShmPath[6]; // "XXXXXX" - May or may not be set
// };
// scclResult_t scclProxyConnect(struct scclComm* comm, int transport, int send, int tpProxyRank, struct scclProxyConnector* proxyConn) {
// struct scclSocket* sock;
// int ready, proxyRank = -1;
// struct scclProxyState* sharedProxyState = comm->proxyState;
// // Keep one connection per local rank
// for(int i = 0; i < comm->localRanks; ++i) {
// /* find the proxy rank in comm. */
// if(comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
// proxyRank = comm->localRankToRank[i];
// break;
// }
// }
// proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
// // Keep one connection per local rank
// proxyConn->connection = NULL;
// proxyConn->tpRank = tpProxyRank;
// if(sharedProxyState->peerSocks == NULL) {
// scclCHECK(scclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
// scclCHECK(scclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
// scclCHECK(scclCalloc(&sharedProxyState->sharedDevMems, comm->sharedRes->tpNLocalRanks));
// for(int i = 0; i < comm->sharedRes->tpNLocalRanks; ++i) {
// scclCHECK(scclSocketSetFd(-1, &sharedProxyState->peerSocks[i]));
// }
// }
// proxyConn->tpLocalRank = comm->sharedRes->tpRankToLocalRank[proxyConn->tpRank];
// sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// scclCHECK(scclSocketReady(sock, &ready));
// if(!ready) {
// scclCHECK(scclSocketInit(sock, sharedProxyState->peerAddresses + proxyConn->tpRank, comm->sharedRes->magic, scclSocketTypeProxy, comm->abortFlag));
// scclCHECK(scclSocketConnect(sock));
// }
// struct scclProxyInitReq req = {0};
// req.transport = transport;
// req.send = send;
// req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
// req.tpRank = comm->topParentRanks[comm->rank];
// req.sameProcess = proxyConn->sameProcess;
// struct scclProxyInitResp resp = {0};
// // This usually sends proxyConn->connection to identify which connection this is.
// // However, this is part of the response and therefore is ignored
// scclCHECK(scclProxyCallBlocking(comm, proxyConn, scclProxyMsgInit, &req, sizeof(req), &resp, sizeof(resp)));
// proxyConn->connection = resp.connection;
// // If we need proxy progress, map progress ops
// struct scclTransportComm* tcomm = send ? &scclTransports[transport]->send : &scclTransports[transport]->recv;
// if(tcomm->proxyProgress) {
// char poolPath[] = "/dev/shm/sccl-XXXXXX";
// strncpy(poolPath + sizeof("/dev/shm/sccl-") - 1, resp.devShmPath, sizeof("XXXXXX") - 1);
// struct scclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank;
// if(proxyOps->pool == NULL) {
// scclCHECK(scclShmOpen(poolPath, sizeof(struct scclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle));
// proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
// }
// }
// INFO(SCCL_NET | SCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
// return scclSuccess;
// }
// // cuMem API support
// // The response is sent out-of-band using scclIpcSocket for this specific command
// /**
// * Convert a file descriptor into one usable across processes via the proxy connection.
// *
// * @param comm sccl communicator
// * @param proxyConn proxy connector
// * @param fd file descriptor to convert
// * @param convertedFd output parameter receiving the converted file descriptor
// * @return operation result (scclSuccess on success)
// *
// * This call blocks until the conversion completes or fails. It first creates a UDS socket
// * to receive the converted fd, then requests the conversion through the proxy, and finally
// * polls the proxy response until the operation completes. On error, the socket is closed
// * and an error is reported.
// */
// scclResult_t scclProxyClientConvertFdBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int fd, int* convertedFd) {
// scclResult_t ret = scclSuccess;
// scclResult_t res = scclInProgress;
// struct scclIpcSocket ipcSock = {0};
// void* opId = malloc(1);
// // Create a UDS socket to receive the converted fd
// scclCHECK(scclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
// // Request the conversion of the fd over sockets
// scclCHECKGOTO(scclProxyCallAsync(comm, proxyConn, scclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error);
// // Receive converted fd over UDS
// scclCHECK(scclIpcSocketRecvFd(&ipcSock, convertedFd));
// TRACE(SCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd);
// scclCHECK(scclIpcSocketClose(&ipcSock));
// while(res == scclInProgress) {
// res = scclPollProxyResponse(comm, proxyConn, NULL, opId);
// }
// free(opId);
// return res;
// error:
// scclCHECK(scclIpcSocketClose(&ipcSock));
// WARN("scclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank);
// return ret;
// }
// const char* scclProxyMsgTypeStr[] = {"Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd"};
// scclResult_t scclProxyCallAsync(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId)
// {
// struct scclSocket* sock;
// scclResult_t ret = scclSuccess;
// struct scclProxyState* sharedProxyState = comm->proxyState;
// if(sharedProxyState->peerSocks == NULL)
// return scclInternalError;
// sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// if(sock == NULL)
// return scclInternalError;
// scclCHECKGOTO(scclSocketSend(sock, &type, sizeof(int)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &respSize, sizeof(int)), ret, error);
// if(reqSize)
// scclCHECKGOTO(scclSocketSend(sock, reqBuff, reqSize), ret, error);
// // Send opId to proxy
// scclCHECKGOTO(scclSocketSend(sock, &opId, sizeof(opId)), ret, error);
// // Add proxyOp to expected response queue
// scclCHECK(expectedProxyResponseEnqueue(sharedProxyState, opId, respSize));
// return scclSuccess;
// error:
// return ret;
// }
// scclResult_t scclPollProxyResponse(struct scclComm* comm, struct scclProxyConnector* proxyConn, void* respBuff, void* opId) {
// struct scclProxyState* sharedProxyState = comm->proxyState;
// // Receive the connection pointer from the Proxy
// if(*comm->abortFlag) {
// WARN("Comm %p is in abort state", comm);
// return scclInternalError;
// }
// if(sharedProxyState->peerSocks == NULL)
// return scclInternalError;
// // Check response queue
// int found = 0;
// scclCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found));
// if(found == 0) {
// // Attempt to read in a new response header from the proxy thread
// struct scclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// void* recvOpId;
// int offset = 0;
// if(scclSuccess != scclSocketProgress(SCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
// WARN("Socket recv failed while polling for opId=%p", opId);
// return scclInternalError;
// }
// if(offset == 0) {
// return scclInProgress;
// // If we've returned a partial response, block to receive the rest of it
// } else if(offset < sizeof(recvOpId)) {
// while(offset < sizeof(recvOpId))
// scclCHECK(scclSocketProgress(SCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
// }
// INFO(SCCL_PROXY, "scclPollProxyResponse Received new opId=%p", recvOpId);
// // Now do a blocking recv of the response size
// int respSize = 0;
// scclCHECK(scclSocketRecv(sock, &respSize, sizeof(respSize)));
// // If there's a respSize to recv
// if(respSize > 0) {
// if(recvOpId != opId) {
// // Unexpected response, need to buffer the socket data
// respBuff = malloc(respSize);
// }
// assert(respBuff != NULL);
// scclCHECK(scclSocketRecv(sock, respBuff, respSize));
// }
// if(recvOpId == opId) {
// INFO(SCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
// scclCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId));
// return scclSuccess;
// } else {
// INFO(SCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize);
// // Store the result and mark response as completed
// scclCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize));
// return scclInProgress;
// }
// } else {
// INFO(SCCL_PROXY, "scclPollProxyResponse Dequeued cached opId=%p", opId);
// }
// return scclSuccess;
// }
// scclResult_t
// scclProxyCallBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
// // Alloc some memory to act as a handle
// scclResult_t res = scclSuccess;
// void* opId = malloc(1);
// scclCHECKGOTO(scclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail);
// do {
// res = scclPollProxyResponse(comm, proxyConn, respBuff, opId);
// } while(res == scclInProgress);
// exit:
// free(opId);
// return res;
// fail:
// goto exit;
// }
// static scclResult_t proxyProgressInit(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(state->opsPool == NULL) {
// int size = sizeof(struct scclProxyOpsPool);
// struct scclProxyOpsPool* pool = NULL;
// char shmPath[sizeof("/dev/shm/sccl-XXXXXX")];
// shmPath[0] = '\0';
// scclCHECK(scclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle));
// // Init pool
// pool->nextOps = -1;
// for(int r = 0; r < proxyState->tpLocalnRanks; r++) {
// pool->freeOps[r] = r * MAX_OPS_PER_PEER;
// for(int i = 0; i < MAX_OPS_PER_PEER - 1; i++)
// pool->ops[r * MAX_OPS_PER_PEER + i].next = r * MAX_OPS_PER_PEER + i + 1;
// pool->ops[(r + 1) * MAX_OPS_PER_PEER - 1].next = -1;
// }
// // Setup mutex/cond to work inter-process
// pthread_mutexattr_t mutexAttr;
// pthread_mutexattr_init(&mutexAttr);
// pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
// pthread_mutex_init(&pool->mutex, &mutexAttr);
// pthread_condattr_t condAttr;
// pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
// pthread_cond_init(&pool->cond, &condAttr);
// state->opsPool = pool;
// memcpy(state->opsPoolShmSuffix, shmPath + sizeof("/dev/shm/sccl-") - 1, sizeof("XXXXXX") - 1);
// // All ops structures are created, we can start the progress thread
// scclCHECK(scclProxyProgressCreate(proxyState));
// }
// return scclSuccess;
// }
// static void proxyOpsFree(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(scclShmClose(state->handle) != scclSuccess) {
// WARN("[Service thread] shm close failed");
// }
// }
// scclResult_t scclProxyShmUnlink(struct scclComm* comm) {
// struct scclProxyProgressState* state = &comm->proxyState->progressState;
// if(state->opsPool == NULL)
// return scclSuccess;
// if(scclShmUnlink(state->handle) != scclSuccess) {
// WARN("[Service thread] proxy ops shm unlink failed");
// }
// return scclSuccess;
// }
// static scclResult_t proxyConnInit(struct scclProxyLocalPeer* peer,
// struct scclProxyConnectionPool* connectionPool,
// struct scclProxyState* proxyState,
// scclProxyInitReq* req,
// scclProxyInitResp* resp,
// struct scclProxyConnection** connection) {
// int id;
// scclCHECK(scclProxyNewConnection(connectionPool, &id));
// scclCHECK(scclProxyGetConnection(connectionPool, id, connection));
// (*connection)->sock = &peer->sock;
// (*connection)->transport = req->transport;
// (*connection)->send = req->send;
// (*connection)->tpLocalRank = req->tpLocalRank;
// (*connection)->sameProcess = req->sameProcess;
// peer->tpLocalRank = req->tpLocalRank;
// peer->tpRank = req->tpRank;
// resp->connection = *connection;
// (*connection)->tcomm = (*connection)->send ? &scclTransports[(*connection)->transport]->send : &scclTransports[(*connection)->transport]->recv;
// // If we need proxy progress, let's allocate ops and start the thread
// if((*connection)->tcomm->proxyProgress) {
// scclCHECK(proxyProgressInit(proxyState));
// struct scclProxyProgressState* state = &proxyState->progressState;
// strncpy(resp->devShmPath, state->opsPoolShmSuffix, sizeof(resp->devShmPath));
// }
// INFO(SCCL_NET | SCCL_PROXY,
// "New proxy %s connection %d from local rank %d, transport %d",
// (*connection)->send ? "send" : "recv",
// id,
// (*connection)->tpLocalRank,
// (*connection)->transport);
// __atomic_store_n(&(*connection)->state, connInitialized, __ATOMIC_RELEASE);
// return scclSuccess;
// }
// // cuMem API support
// static scclResult_t proxyConvertFd(struct scclProxyLocalPeer* peer, void* opId, struct scclProxyState* proxyState, int fd) {
// struct scclIpcSocket ipcSock = {0};
// uint64_t hash = (uint64_t)opId;
// INFO(SCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash);
// // Send back the converted fd using UDS
// scclCHECK(scclIpcSocketInit(&ipcSock, proxyState->tpRank, hash ^ 1, proxyState->abortFlag));
// scclCHECK(scclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash));
// scclCHECK(scclIpcSocketClose(&ipcSock));
// return scclSuccess;
// }
// static scclResult_t proxyProgressAsync(struct scclProxyAsyncOp* op,
// struct scclProxyState* proxyState,
// int* asyncOpCount,
// struct scclProxyLocalPeer* peer,
// struct scclProxyConnectionPool* connectionPool) {
// int done = 1;
// if(op->type == scclProxyMsgSetup) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
// scclCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
// } else if(op->type == scclProxyMsgConnect) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
// scclCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
// } else if(op->type == scclProxyMsgSharedInit) {
// int nChannels = (int)*op->reqBuff;
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
// if(op->connection->tcomm->proxySharedInit)
// scclCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels));
// __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
// } else if(op->type == scclProxyMsgConvertFd) {
// int fd = *(int*)op->reqBuff;
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd);
// scclCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support
// } else if(op->type == scclProxyMsgInit) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
// scclCHECK(proxyConnInit(peer, connectionPool, proxyState, (scclProxyInitReq*)op->reqBuff, (scclProxyInitResp*)op->respBuff, &op->connection));
// } else
// return scclInternalError;
// if(done) {
// INFO(SCCL_PROXY, "proxyProgressAsync opId=%p op.type=%d op.reqBuff=%p op.respSize=%d done", op->opId, op->type, op->reqBuff, op->respSize);
// if(op->type == scclProxyMsgSetup)
// __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
// else if(op->type == scclProxyMsgConnect)
// __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
// /* if setup or connect is done, we should not return any error at this point since
// * scclSocketSend might already send the respBuff to the requester. If we still choose
// * to abort and close the connection, it can cause segfault if the requester is using
// * the respBuff. */
// // Send the opId for referencing async operation
// scclCHECK(scclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
// // Send the response size
// scclCHECK(scclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
// if(op->respSize) {
// // Send the response
// scclCHECK(scclSocketSend(op->connection->sock, op->respBuff, op->respSize));
// }
// asyncProxyOpDequeue(peer, op);
// (*asyncOpCount)--;
// return scclSuccess;
// } else if(*proxyState->abortFlag != 0) {
// return scclInternalError;
// }
// return scclInProgress;
// }
// static scclResult_t proxyServiceInitOp(
// int type, struct scclProxyLocalPeer* peer, struct scclProxyConnectionPool* connectionPool, struct scclProxyState* proxyState, int* asyncOpCount) {
// struct scclSocket* sock = &peer->sock;
// struct scclProxyAsyncOp* asyncOp;
// scclCHECK(scclCalloc(&asyncOp, 1));
// asyncOp->type = type;
// scclCHECK(scclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
// scclCHECK(scclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
// scclCHECK(scclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
// if(asyncOp->reqSize) {
// scclCHECK(scclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
// scclCHECK(scclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
// }
// // Store opId for completion response
// scclCHECK(scclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
// if(asyncOp->respSize)
// scclCHECK(scclCalloc(&asyncOp->respBuff, asyncOp->respSize));
// asyncProxyOpEnqueue(peer, asyncOp);
// (*asyncOpCount)++;
// scclCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
// return scclSuccess;
// }
// #include <poll.h>
// static bool proxyMatchOpType(int type) {
// switch(type) {
// case scclProxyMsgInit:
// case scclProxyMsgSharedInit:
// case scclProxyMsgSetup:
// case scclProxyMsgConnect:
// case scclProxyMsgConvertFd: return true;
// default: return false;
// }
// }
// void* scclProxyService(void* _args) {
// struct scclProxyState* proxyState = (struct scclProxyState*)_args;
// // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// if(setProxyThreadContext(proxyState)) {
// INFO(SCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev);
// } else if(cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
// WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev);
// }
// // Prepare poll descriptor
// struct scclProxyConnectionPool connectionPool;
// connectionPool.pools = NULL;
// connectionPool.banks = 0;
// connectionPool.offset = SCCL_PROXY_CONN_POOL_SIZE;
// struct pollfd pollfds[SCCL_MAX_LOCAL_RANKS + 1];
// struct scclProxyLocalPeer peers[SCCL_MAX_LOCAL_RANKS];
// memset(&peers, 0, sizeof(struct scclProxyLocalPeer) * SCCL_MAX_LOCAL_RANKS);
// for(int s = 0; s < SCCL_MAX_LOCAL_RANKS; s++) {
// pollfds[s].fd = -1;
// pollfds[s].events = POLLHUP | POLLIN;
// }
// if(scclSocketGetFd(proxyState->listenSock, &pollfds[SCCL_MAX_LOCAL_RANKS].fd) != scclSuccess) {
// WARN("[Proxy Service] Get listenSock fd fails");
// return NULL;
// };
// pollfds[SCCL_MAX_LOCAL_RANKS].events = POLLIN;
// int maxnpeers = 0;
// int npeers = 0;
// int stop = 0;
// int asyncOpCount = 0;
// while(stop == 0 || (stop == 1 && npeers > 0)) {
// /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
// * connections. Need to wait until all other related comms call abort and safely exit
// * together, or we could face segmentation fault. */
// if(*proxyState->abortFlag != 0)
// stop = 1;
// /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
// int ret;
// do {
// ret = poll(pollfds, SCCL_MAX_LOCAL_RANKS + 1, asyncOpCount ? 0 : 500);
// } while(ret < 0 && errno == EINTR);
// if(ret < 0) {
// WARN("[Proxy Service] Poll failed: %s", strerror(errno));
// return NULL;
// }
// if(pollfds[SCCL_MAX_LOCAL_RANKS].revents) {
// int s = 0;
// while(s < SCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0)
// s++;
// if(s == SCCL_MAX_LOCAL_RANKS) {
// WARN("[Proxy service] Too many connections (%d max)", SCCL_MAX_LOCAL_RANKS);
// return NULL;
// }
// if(maxnpeers < s + 1)
// maxnpeers = s + 1;
// if(scclSocketInit(&peers[s].sock) != scclSuccess) {
// WARN("[Service thread] Initialize peers[%d].sock fails", s);
// return NULL;
// }
// if(scclSocketAccept(&peers[s].sock, proxyState->listenSock) != scclSuccess) {
// WARN("[Service thread] Accept failed %s", strerror(errno));
// } else {
// if(scclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != scclSuccess) {
// WARN("[Service thread] Get peers[%d].sock fd fails", s);
// return NULL;
// }
// npeers++;
// peers[s].tpLocalRank = -1;
// }
// }
// for(int s = 0; s < maxnpeers; s++) {
// struct scclProxyLocalPeer* peer = peers + s;
// struct scclSocket* sock = &peer->sock;
// int closeConn = 0;
// int type = 0;
// scclResult_t res = scclSuccess;
// if(pollfds[s].fd == -1)
// continue;
// // Progress all ops for this scclProxyLocalPeer
// scclProxyAsyncOp* op = peer->asyncOps;
// while(op != nullptr) {
// scclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
// type = op->type;
// res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
// if(res == scclSuccess || res == scclInProgress) {
// op = opnext;
// } else {
// // Res is a bad result
// closeConn = 1;
// WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", scclProxyMsgTypeStr[type], res);
// break;
// }
// }
// // Check for additional ops coming in
// if(pollfds[s].revents & POLLIN) {
// int closed;
// res = scclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
// if(res != scclSuccess && res != scclInProgress) {
// WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
// closeConn = 1;
// } else if(closed) {
// INFO(SCCL_INIT | SCCL_NET | SCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
// closeConn = 1;
// } else if(res == scclSuccess) { // We received something from the sock
// if(type == scclProxyMsgStop) {
// stop = 1;
// closeConn = 1;
// } else if(type == scclProxyMsgClose) {
// closeConn = 1;
// } else if(proxyMatchOpType(type)) {
// res = proxyServiceInitOp(type, peers + s, &connectionPool, proxyState, &asyncOpCount);
// } else {
// WARN("[Service thread] Unknown command %d from localRank %d", type, peer->tpLocalRank);
// closeConn = 1;
// }
// INFO(SCCL_PROXY, "Received and initiated operation=%s res=%d", scclProxyMsgTypeStr[type], res);
// }
// } else if(pollfds[s].revents & POLLHUP) {
// closeConn = 1;
// }
// if(res != scclSuccess && res != scclInProgress) {
// WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d",
// proxyState->tpRank,
// scclProxyMsgTypeStr[type],
// peer->tpRank,
// res);
// closeConn = 1;
// }
// if(closeConn) {
// scclSocketClose(sock);
// if(op != nullptr) {
// asyncProxyOpDequeue(peer, op);
// asyncOpCount--;
// }
// pollfds[s].fd = -1;
// npeers--;
// }
// }
// }
// // Wait for all operations to complete and stop progress thread before freeing any resource
// if(scclProxyProgressDestroy(proxyState) != scclSuccess) {
// WARN("[Proxy Service] proxyDestroy failed");
// }
// for(int s = 0; s < maxnpeers; s++) {
// scclSocketClose(&peers[s].sock);
// }
// scclProxyFreeConnections(&connectionPool, proxyState);
// scclSocketClose(proxyState->listenSock);
// free(proxyState->listenSock);
// proxyOpsFree(proxyState);
// return NULL;
// }
// scclResult_t scclProxyInit(struct scclComm* comm, struct scclSocket* sock, union scclSocketAddress* peerAddresses) {
// assert(comm->sharedRes->proxyState == NULL);
// scclCHECK(scclCalloc(&comm->sharedRes->proxyState, 1));
// comm->proxyState = comm->sharedRes->proxyState;
// comm->proxyState->refCount = 1;
// comm->proxyState->listenSock = sock;
// comm->proxyState->peerAddresses = peerAddresses;
// return scclSuccess;
// }
// scclResult_t scclProxyCreate(struct scclComm* comm) {
// /* proxyState is shared among parent comm and split comms. comm->proxyState->thread is
// * pthread_join()'d by commFree() in init.cc when the refCount reduces down to 0. */
// struct scclProxyState* proxyState = comm->proxyState;
// if(proxyState->refCount == 1) {
// /* we have to make sure all following fields in comm have been initialized. */
// proxyState->tpRank = comm->rank;
// proxyState->tpnRanks = comm->nRanks;
// proxyState->tpLocalnRanks = comm->localRanks;
// proxyState->cudaDev = comm->cudaDev;
// proxyState->abortFlag = comm->abortFlag;
// proxyState->p2pnChannels = comm->p2pnChannels;
// proxyState->p2pChunkSize = comm->p2pChunkSize;
// proxyState->nChannels = comm->nChannels;
// proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers;
// proxyState->dmaBufSupport = comm->dmaBufSupport;
// proxyState->scclNet = comm->scclNet;
// proxyState->scclCollNet = comm->scclCollNet;
// memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
// pthread_create(&comm->proxyState->thread, NULL, scclProxyService, comm->proxyState);
// scclSetThreadName(comm->proxyState->thread, "sccl Service %2d", comm->cudaDev);
// }
// return scclSuccess;
// }
// scclResult_t scclProxyStop(struct scclComm* comm) {
// if(comm->sharedRes && comm->sharedRes->proxyState) {
// struct scclProxyState* sharedProxyState = comm->sharedRes->proxyState;
// if((comm->proxyRefCountOld = scclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
// if(sharedProxyState->peerAddresses) {
// if(*comm->abortFlag == 0) {
// struct scclSocket sock;
// int type = scclProxyMsgStop;
// scclCHECK(scclSocketInit(&sock,
// sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank],
// comm->sharedRes->magic,
// scclSocketTypeProxy,
// comm->abortFlag));
// scclCHECK(scclSocketConnect(&sock));
// scclCHECK(scclSocketSend(&sock, &type, sizeof(int)));
// scclCHECK(scclSocketClose(&sock));
// }
// }
// if(sharedProxyState->peerSocks) {
// int tplocalRanks = comm->sharedRes->tpNLocalRanks;
// for(int i = 0; i < tplocalRanks; i++) {
// int fd;
// scclCHECK(scclSocketGetFd(sharedProxyState->peerSocks + i, &fd));
// if(fd >= 0) {
// if(sharedProxyState->proxyOps[i].pool) {
// scclCHECK(scclShmClose(sharedProxyState->proxyOps[i].handle));
// }
// if(sharedProxyState->sharedDevMems[i]) {
// if(!scclCuMemEnable()) {
// CUDACHECK(cudaIpcCloseMemHandle(sharedProxyState->sharedDevMems[i]));
// }
// }
// int type = scclProxyMsgClose;
// if(*comm->abortFlag == 0)
// scclCHECK(scclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
// scclCHECK(scclSocketClose(sharedProxyState->peerSocks + i));
// }
// }
// }
// }
// }
// return scclSuccess;
// }
// scclResult_t scclProxyDestroy(struct scclComm* comm) {
// struct scclProxyState* sharedProxyState = comm->sharedRes->proxyState;
// assert(sharedProxyState->refCount == 0);
// free(sharedProxyState->peerAddresses);
// free(sharedProxyState->peerSocks);
// free(sharedProxyState->proxyOps);
// free(sharedProxyState->sharedDevMems);
// expectedProxyResponseFree(sharedProxyState);
// free(sharedProxyState);
// return scclSuccess;
// }
#pragma once
#include <pthread.h>
#include "socket.h"
#include "ipcsocket.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
typedef net::host::scclSocketAddress scclSocketAddress_t;
typedef net::host::scclSocket scclSocket_t;
#define SCCL_PROXY_MAX_SUBS MAXCHANNELS
#define PROXYARGS_ALLOCATE_SIZE SCCL_MAX_OPS
enum proxyConnectState : uint8_t {
connUninitialized = 0,
connInitialized = 1,
connSharedInitialized = 2,
connSetupDone = 3,
connConnected = 4,
numConnStates = 5
};
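// Typical connection lifecycle, as driven by the (currently commented-out)
// service code above: proxyConnInit() publishes connInitialized, a
// scclProxyMsgSharedInit op moves the connection to connSharedInitialized,
// and completed Setup/Connect ops publish connSetupDone/connConnected with
// __atomic_store_n(..., __ATOMIC_RELEASE).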
// FIFO of expected proxy responses
struct scclExpectedProxyResponse {
void* opId; // Operation ID identifying a specific operation
int respSize; // Size of the response data in bytes
bool done; // Whether this response has been fully processed
void* respBuff; // Buffer holding the received response data
struct scclExpectedProxyResponse* next; // Next expected response, forming a linked list
};
// Per-peer sub-arguments of a proxy operation
struct scclProxySubArgs {
int channelId; // Channel ID
int nsteps; // Number of steps in the operation
ssize_t nbytes; // Number of data bytes
int peer; // Peer ID
int groupSize; // Number of sub-args progressed as one group
uint64_t base; // Base counter
uint64_t posted; // Count of posted steps
uint64_t received; // Count of received steps
uint64_t flushed; // Count of flushed steps
uint64_t transmitted; // Count of transmitted steps
uint64_t done; // Count of completed steps
uint64_t end; // End counter
void* requests[SCCL_STEPS]; // Request pointer for each step
};
// Proxy arguments structure
struct scclProxyArgs {
struct scclProxySubArgs subs[SCCL_PROXY_MAX_SUBS]; // Array of sub-arguments
int nsubs; // Number of sub-arguments
int done; // Completion flag
uint64_t opCount; // Operation count
int sliceSteps; // Steps per slice
int chunkSteps; // Steps per chunk
int chunkSize; // Chunk size
scclDataType_t dtype; // Data type
scclProtocolType_t protocol; // Protocol type
int state; // Current state
char* sharedBuff[SCCL_STEPS]; // Shared buffer pointer per step
int sharedSize[SCCL_STEPS]; // Shared buffer size per step
int idle; // Idle flag
// Element links
struct scclProxyArgs* next; // Next proxy args
struct scclProxyArgs* nextPeer; // Next proxy args for the same peer
struct scclProxyArgs** proxyAppendPtr; // Pointer to the slot where new args are appended
};
struct scclProxyPool {
struct scclProxyPool* next; // Next proxy pool
struct scclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; // Array of proxy argument elements
};
struct scclProxyProgressState {
// Used by the main thread to hand work to the progress thread
// struct scclProxyOpsPool* opsPool;
// scclShmHandle_t handle;
char opsPoolShmSuffix[6]; // Shared-memory name suffix of the ops pool
pthread_t thread; // Progress thread ID
bool stop; // Stop flag used to shut the thread down
// struct scclProxyPeer** localPeers;
// struct scclSharedNetComms* netComms[SCCL_MAX_NETDEVS];
struct scclProxyArgs* active; // Currently active proxy args
struct scclProxyArgs* pool; // Pool of free proxy args
struct scclProxyPool* pools; // List of allocated proxy pools
int nextOps; // Index of the next operation
};
// struct scclProxyOp {
// struct scclProxyConnection* connection;
// int channelId;
// int nsteps;
// ssize_t nbytes;
// struct {
// int root : 30;
// uint32_t connIndex : 2;
// };
// int next;
// uint64_t opCount;
// int sliceSteps;
// int chunkSteps;
// int chunkSize;
// uint8_t /*scclDataType_t*/ dtype;
// uint8_t /*scclDevRedOp_t*/ redOp;
// uint8_t /*scclPattern_t*/ pattern;
// uint8_t protocol;
// union {
// uint64_t unused;
// // For use by enqueue.cc
// struct scclProxyOp* enqNext;
// };
// };
// struct scclProxyOpsPool {
// struct scclProxyOp ops[MAX_OPS_PER_PEER * SCCL_MAX_LOCAL_RANKS];
// volatile int nextOps;
// volatile int nextOpsEnd;
// volatile int freeOps[SCCL_MAX_LOCAL_RANKS];
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
////////////////////////////////////////////////////////////////////////////////////////////////
// scclResult_t scclProxyInit(struct scclComm* comm, scclSocket_t* sock, union scclSocketAddress* peerAddresses);
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
// enum scclProxyOpState {
// scclProxyOpNone,
// scclProxyOpReady,
// scclProxyOpProgress
// };
// enum {
// proxyRecv = 0,
// proxySend = 1
// };
// struct scclProxyArgs;
// typedef scclResult_t (*proxyProgressFunc_t)(struct scclProxyState*, struct scclProxyArgs*);
// static_assert(SCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
// struct scclProxyOp {
// struct scclProxyConnection* connection;
// int channelId;
// int nsteps;
// ssize_t nbytes;
// struct {
// int root : 30;
// uint32_t connIndex : 2;
// };
// int next;
// uint64_t opCount;
// int sliceSteps;
// int chunkSteps;
// int chunkSize;
// uint8_t /*scclDataType_t*/ dtype;
// uint8_t /*scclDevRedOp_t*/ redOp;
// uint8_t /*scclPattern_t*/ pattern;
// uint8_t protocol;
// union {
// uint64_t unused;
// // For use by enqueue.cc
// struct scclProxyOp* enqNext;
// };
// };
// static_assert(sizeof(struct scclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
// #define SCCL_MAX_NETDEVS 128
// // ProxyOps are used to communicate between main thread and service thread
// // Make sure we have enough to store two full rounds of operations on all channels.
// // Otherwise we'd be unable to post half of them to free new elements.
// #define MAX_OPS_PER_PEER (2 * MAXCHANNELS * SCCL_MAX_WORK_ELEMENTS_P2P)
// #define SCCL_MAX_LOCAL_RANKS 64
// struct scclProxyOpsPool {
// struct scclProxyOp ops[MAX_OPS_PER_PEER * SCCL_MAX_LOCAL_RANKS];
// volatile int nextOps;
// volatile int nextOpsEnd;
// volatile int freeOps[SCCL_MAX_LOCAL_RANKS];
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
// struct scclProxyOps {
// scclProxyOpsPool* pool;
// scclShmHandle_t handle;
// int count;
// int freeOp;
// int nextOps;
// int nextOpsEnd;
// };
// struct scclProxySharedP2p {
// int refcount;
// size_t size;
// char* cudaBuff;
// char* hostBuff;
// // CUDA IPC
// scclIpcDesc ipcDesc;
// struct scclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
// };
// struct scclProxyPeer {
// struct scclProxySharedP2p send;
// struct scclProxySharedP2p recv;
// };
// struct scclSharedNetComms {
// void* sendComm[MAXCHANNELS];
// void* recvComm[MAXCHANNELS];
// int sendRefCount[MAXCHANNELS];
// int recvRefCount[MAXCHANNELS];
// };
// struct scclProxyPool;
// struct scclProxyProgressState {
// // Used by main threads to send work to progress thread
// struct scclProxyOpsPool* opsPool;
// scclShmHandle_t handle;
// char opsPoolShmSuffix[6];
// pthread_t thread;
// bool stop;
// struct scclProxyPeer** localPeers;
// struct scclSharedNetComms* netComms[SCCL_MAX_NETDEVS];
// struct scclProxyArgs* active;
// struct scclProxyArgs* pool;
// struct scclProxyPool* pools;
// int nextOps;
// };
// struct scclProxyAsyncOp {
// int type;
// struct scclProxyConnection* connection;
// int reqSize, respSize;
// char *reqBuff, *respBuff;
// void* opId;
// scclProxyAsyncOp* next;
// };
// struct scclProxyLocalPeer {
// struct scclSocket sock;
// int tpRank;
// int tpLocalRank;
// scclProxyAsyncOp* asyncOps;
// int asyncOpCounter;
// };
// struct scclProxyState {
// int refCount;
// int tpRank;
// int tpnRanks;
// int tpLocalnRanks;
// int cudaDev;
// int p2pnChannels;
// int p2pChunkSize;
// int nChannels;
// int buffSizes[SCCL_NUM_PROTOCOLS];
// bool allocP2pNetLLBuffers;
// bool dmaBufSupport;
// scclNet_t* scclNet;
// scclCollNet_t* scclCollNet;
// volatile uint32_t* abortFlag;
// // Service thread
// pthread_t thread;
// struct scclSocket* listenSock;
// int stop;
// CUcontext cudaCtx;
// // Used by main thread
// union scclSocketAddress* peerAddresses;
// struct scclSocket* peerSocks;
// struct scclProxyOps* proxyOps;
// void** sharedDevMems;
// struct scclIpcSocket peerIpcSock; // cuMEM API support (UDS)
// // Progress thread
// struct scclProxyProgressState progressState;
// // Queue of expected responses from the proxy
// struct scclExpectedProxyResponse* expectedResponses;
// };
// enum proxyConnectState {
// connUninitialized = 0,
// connInitialized = 1,
// connSharedInitialized = 2,
// connSetupDone = 3,
// connConnected = 4,
// numConnStates = 5
// };
// struct scclProxyConnection {
// int send, transport, shared;
// int tpLocalRank, sameProcess;
// struct scclSocket* sock;
// struct scclTransportComm* tcomm;
// struct scclProxyArgs* proxyAppend;
// struct scclProxyArgs** proxyAppendPtr;
// void* transportResources;
// proxyConnectState state;
// struct scclCollNetSharedRes* collNet;
// };
// typedef scclResult_t (*threadFunc_t)(struct scclProxyArgs*);
// enum proxyMode {
// proxyRing = 0,
// proxyFrom = 1,
// proxyTo = 2
// };
// scclResult_t scclProxySaveOp(struct scclComm* comm, struct scclProxyOp* proxyOp, bool* justInquire);
// scclResult_t scclProxyComputeP2p(struct scclInfo* info, struct scclProxyOp* proxyOp);
// scclResult_t scclProxyStart(struct scclComm* comm);
// scclResult_t scclProxyCreate(struct scclComm* comm);
// scclResult_t scclProxyConnect(struct scclComm* comm, int transport, int send, int proxyRank, struct scclProxyConnector* proxyConn);
// enum scclProxyMsgType {
// scclProxyMsgInit = 1,
// scclProxyMsgSharedInit = 2,
// scclProxyMsgSetup = 3,
// scclProxyMsgConnect = 4,
// scclProxyMsgStart = 5,
// scclProxyMsgClose = 6,
// scclProxyMsgAbort = 7,
// scclProxyMsgStop = 8,
// scclProxyMsgConvertFd = 9, // cuMem API support (UDS)
// };
// // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// // Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// // scclPollProxyResponse(), supplying the same opId to confirm the operation has completed
// scclResult_t scclProxyCallAsync(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// // This function will internally call scclProxyCallAsync() and spin until scclPollProxyResponse() confirms the result is received
// scclResult_t
// scclProxyCallBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
// scclResult_t scclPollProxyResponse(struct scclComm* comm, struct scclProxyConnector* proxyConn, void* respBuff, void* opId);
// scclResult_t scclProxyClientConvertFdBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int fd, int* convertedFd);
// scclResult_t scclProxyStop(struct scclComm* comm);
// scclResult_t scclProxyShmUnlink(struct scclComm* comm);
// scclResult_t scclProxyDestroy(struct scclComm* comm);
// scclResult_t mscclSaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex);
#pragma once
#include <string.h>
#include "base.h"
#include "archinfo.h"
namespace sccl {
namespace hardware {
namespace topology {
static pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; // static: this header is included from multiple translation units
static bool initialized = false;
// scclResult_t scclTopoInit() {}
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <string.h>
#include "topo_utils.h"
// #include "net.h"
// #include "xml.h"
// #include "net.h"
namespace sccl {
namespace hardware {
namespace topology {
scclResult_t int64ToBusId(int64_t id, char* busId) {
sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
return scclSuccess;
}
scclResult_t busIdToInt64(const char* busId, int64_t* id) {
char hexStr[17]; // Longest possible int64 hex string + null terminator.
int hexOffset = 0;
for(int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
char c = busId[i];
if(c == '.' || c == ':')
continue;
if((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
hexStr[hexOffset++] = busId[i];
} else
break;
}
hexStr[hexOffset] = '\0';
*id = strtoll(hexStr, NULL, 16); // strtoll: the target is 64-bit even on 32-bit platforms
return scclSuccess;
}
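// Round-trip sketch (hypothetical helper, not part of this file): the two
// conversions act as inverses for well-formed bus IDs.
// static void busIdRoundTripExample() {
//   char busId[32];
//   int64_t id = 0;
//   (void)busIdToInt64("0000:03:00.0", &id); // id == 0x3000
//   (void)int64ToBusId(id, busId);           // busId == "0000:03:00.0"
// }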
scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
char* str = path + offset;
// Remove trailing "/"
if(*str == '/')
str--;
// Find next /
while(*str != '/')
str--;
str++;
int64_t numid;
SCCLCHECK(busIdToInt64(str, &numid));
// Ignore subdevice because those should use the same PCI link so we want to merge nodes.
numid -= numid & 0xf;
*id = numid;
return scclSuccess;
}
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <string.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
// Hardware topology node types
typedef enum topoNodeType {
GPU = 0, // Graphics processing unit
PCI = 1, // PCI device
NVS = 2, // NVSwitch
CPU = 3, // CPU, effectively a NUMA domain
NIC = 4, // Network interface controller
NET = 5 // Network
} topoNodeType_t;
// Path types between two topology nodes
enum topoPathType {
PATH_LOC = 0, // Local (same node)
PATH_NVL = 1, // Connected through NVLink
PATH_NVB = 2, // Connected through NVLink via an intermediate GPU
PATH_PIX = 3, // Connected through at most one PCIe bridge
PATH_PXB = 4, // Connected through multiple PCIe bridges (without crossing the PCIe host bridge)
PATH_PXN = 5, // GPU to NIC, going through an intermediate GPU
PATH_PHB = 6, // Connected through PCIe and a PCIe host bridge
PATH_SYS = 7, // Connected through PCIe and the SMP interconnect between NUMA nodes
PATH_NET = 8, // Connected through the network
PATH_DIS = 9 // Disconnected
};
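// Example classifications implied by the definitions above: two GPUs under
// the same PCIe switch are PATH_PIX, GPUs on different NUMA nodes are
// PATH_SYS, and a direct NVLink hop is PATH_NVL.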
////////////////////////////////////////////////////////////////////////////////////////////////
// Convert a 64-bit integer to a bus-ID string
scclResult_t int64ToBusId(int64_t id, char* busId);
// Convert a bus-ID string to a 64-bit integer
scclResult_t busIdToInt64(const char* busId, int64_t* id);
// Convert a PCI path to a 64-bit integer, given a path offset and a minimum offset
scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id);
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <stdint.h>
#include <string.h>
#include "debug.h"
#include "check.h"
#include "param.h"
#include "alloc.h"
#include "utils.h"
/*
External environment variables consumed here:
src/debug.h:
SCCL_DEBUG_LEVEL, SCCL_DEBUG_POS
*/
namespace sccl {
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8
typedef enum : uint8_t {
scclInt8 = 0,
scclChar = 0,
scclUint8 = 1,
scclInt32 = 2,
scclInt = 2,
scclUint32 = 3,
scclInt64 = 4,
scclUint64 = 5,
scclFloat16 = 6,
scclHalf = 6,
scclFloat32 = 7,
scclFloat = 7,
scclFloat64 = 8,
scclDouble = 8,
scclBfloat16 = 9,
} scclDataType_t;
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
typedef enum : uint8_t {
SCCL_PROTO_LL = 0,
SCCL_PROTO_LL128 = 1,
SCCL_PROTO_SIMPLE = 2
} scclProtocolType_t;
// Unique ID for each process
struct scclUniqueId {
int rank; // Global rank of this node
int nRanks; // Total number of nodes
};
} // namespace sccl
#pragma once
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <pwd.h>
#include <errno.h>
#include "debug.h"
#define SCCL_MAJOR 1
#define SCCL_MINOR 0
#define SCCL_PATCH 0
#define SCCL_SUFFIX ""
#define SCCL_VERSION(X, Y, Z) ((X) * 1000 + (Y) * 100 + (Z))
namespace sccl {
/**
* @brief Result codes returned by SCCL operations
*/
typedef enum {
scclSuccess = 0, /*!< No error */
scclUnhandledHipError = 1, /*!< Unhandled HIP error */
scclSystemError = 2, /*!< Unhandled system error */
scclInternalError = 3, /*!< Internal error - please report to SCCL developers */
scclInvalidArgument = 4, /*!< Invalid argument */
scclInvalidUsage = 5, /*!< Invalid usage */
scclRemoteError = 6, /*!< Remote process exited or there was a network error */
scclInProgress = 7, /*!< SCCL operation in progress */
scclNumResults = 8 /*!< Number of result types */
} scclResult_t;
typedef enum {
testSuccess = 0,
testInternalError = 1,
testHipError = 2,
testScclError = 3,
testTimeout = 4,
testNumResults = 5
} testResult_t;
static const char* scclGetErrorString(scclResult_t code) {
switch(code) {
case scclSuccess: return "success";
case scclUnhandledHipError: return "unhandled hip error (run with SCCL_DEBUG=INFO for details)";
case scclSystemError: return "unhandled system error (run with SCCL_DEBUG=INFO for details)";
case scclInternalError: return "internal error - please report this issue to the SCCL developers";
case scclInvalidArgument: return "invalid argument (run with SCCL_DEBUG=WARN for details)";
case scclInvalidUsage: return "invalid usage (run with SCCL_DEBUG=WARN for details)";
case scclRemoteError: return "remote process exited or there was a network error";
case scclInProgress: return "SCCL operation in progress";
default: return "unknown result code";
}
}
////////////////////////////// SCCL and HIP //////////////////////////////
// Propagate errors up
#define SCCLCHECK(call) \
do { \
scclResult_t RES = call; \
if(RES != scclSuccess && RES != scclInProgress) { \
/* Print the back trace*/ \
INFO(SCCL_LOG_CODEALL, "check fail: %s", scclGetErrorString(RES)); \
return RES; \
} \
} while(0)
#define SCCLCHECKGOTO(call, RES, label) \
do { \
RES = call; \
if(RES != scclSuccess && RES != scclInProgress) { \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while(0)
#define HIPCHECK(cmd) \
do { \
hipError_t err = cmd; \
if(err != hipSuccess) { \
char hostname[1024]; \
gethostname(hostname, 1024); \
printf("%s: Test HIP failure %s:%d '%s'\n", hostname, __FILE__, __LINE__, hipGetErrorString(err)); \
return scclUnhandledHipError; \
} \
} while(0)
#define HIPCHECKGOTO(cmd, RES, label) \
do { \
hipError_t err = cmd; \
if(err != hipSuccess) { \
WARN("HIP failure '%s'", hipGetErrorString(err)); \
RES = scclUnhandledHipError; \
goto label; \
} \
} while(false)
////////////////////////////// Value checks //////////////////////////////
#define EQCHECK(statement, value) \
do { \
if((statement) == value) { \
/* Print the back trace*/ \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, scclSystemError, strerror(errno)); \
return scclSystemError; \
} \
} while(0)
#define EQCHECKGOTO(statement, value, RES, label) \
do { \
if((statement) == value) { \
/* Print the back trace*/ \
RES = scclSystemError; \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while(0)
////////////////////////////// SYS //////////////////////////////
// Check system calls
#define SYSCHECK(call, name) \
do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while(false)
#define SYSCHECKVAL(call, name, retval) \
do { \
SYSCHECKSYNC(call, name, retval); \
if(retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return scclSystemError; \
} \
} while(false)
#define SYSCHECKSYNC(call, name, retval) \
do { \
retval = call; \
if(retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(SCCL_LOG_CODEALL, "Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
#define SYSCHECKGOTO(statement, RES, label) \
do { \
if((statement) == -1) { \
/* Print the back trace*/ \
RES = scclSystemError; \
INFO(SCCL_LOG_CODEALL, "%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while(0)
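// Usage sketch (hypothetical function, for illustration only): SCCLCHECK
// propagates errors up the call chain, while the *GOTO variants jump to a
// local cleanup label so resources acquired earlier can be released.
// static scclResult_t exampleReadConfig(const char* path) {
//   scclResult_t res = scclSuccess;
//   int fd = -1;
//   SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
//   SYSCHECKGOTO(fcntl(fd, F_SETFL, O_NONBLOCK), res, cleanup);
// cleanup:
//   close(fd);
//   return res;
// }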
} // namespace sccl
#pragma once
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <stdarg.h>
#include <sys/syscall.h>
namespace sccl {
#define SCCL_NET_MAX_REQUESTS 8
typedef enum : uint8_t {
SCCL_LOG_NONE = 0,
SCCL_LOG_VERSION = 1,
SCCL_LOG_WARN = 2,
SCCL_LOG_INFO = 3,
SCCL_LOG_ABORT = 4
} scclDebugLogLevel_t;
typedef enum : uint8_t {
SCCL_LOG_CODEALL = 0,
SCCL_LOG_NET = 1,
SCCL_LOG_TOPO = 2,
SCCL_LOG_BOOTSTRAP = 3,
SCCL_LOG_TRANSPORT = 4,
SCCL_LOG_GRAPH = 5,
SCCL_LOG_CONNECT = 6,
SCCL_LOG_P2P = 7,
SCCL_LOG_COLLECTIVE = 8,
SCCL_LOG_ALLOC = 9
} scclDebugLogPos_t;
namespace debug {
static char scclLastError[1024] = ""; // Global string holding a human-readable copy of the last error
static char hostname[1024]; // Global string holding the host name
static pthread_mutex_t scclDebugLock = PTHREAD_MUTEX_INITIALIZER; // Mutex guarding debug state in multi-threaded use
static __thread int tid = -1; // Thread-local storage for the current thread ID, -1 when unset
static int pid = -1; // Current process ID, -1 when unset
static FILE* scclDebugFile = stdout; // Debug output stream, defaults to stdout
static int scclDebugLevel = -1; // -1 means not yet configured
// Debug position filter, configured together with the level
static int scclDebugPos = -1; // -1 means not yet configured
/**
* @brief Get the host name, truncated at a delimiter
*
* Fetches the current host name and truncates it at the first occurrence of
* the given delimiter. Falls back to "unknown" if the lookup fails.
*
* @param hostname Buffer receiving the host name
* @param maxlen Maximum buffer length
* @param delim Delimiter at which to truncate the host name
*/
static void getHostName(char* hostname, int maxlen, const char delim) {
if(gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
return;
}
int i = 0;
while((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
i++;
hostname[i] = '\0';
}
////////////////////////////// Debug initialization //////////////////////////////
/**
* @brief Initialize the SCCL debug system
*
* Responsibilities:
* 1. Read and set the debug level from the SCCL_DEBUG_LEVEL environment variable
* 2. Read and set the debug position filter from SCCL_DEBUG_POS
* 3. Cache the current PID and host name
* 4. Create the debug log file according to SCCL_DEBUG_FILE
*
* A mutex keeps initialization thread-safe, and the final level and position are
* published with atomic stores. The defaults are SCCL_LOG_INFO and SCCL_LOG_CODEALL.
*
* @note Thread-safe, but call it early in the program to avoid races
*/
static void scclDebugInit() {
pthread_mutex_lock(&scclDebugLock);
if(scclDebugLevel != -1) {
pthread_mutex_unlock(&scclDebugLock);
return;
}
//// Partition by debug level
int tempScclDebugLevel = -1;
{
const char* sccl_debug = getenv("SCCL_DEBUG_LEVEL");
if(sccl_debug == NULL) {
tempScclDebugLevel = SCCL_LOG_INFO;
} else if(strcasecmp(sccl_debug, "VERSION") == 0) {
tempScclDebugLevel = SCCL_LOG_VERSION;
} else if(strcasecmp(sccl_debug, "WARN") == 0) {
tempScclDebugLevel = SCCL_LOG_WARN;
} else if(strcasecmp(sccl_debug, "INFO") == 0) {
tempScclDebugLevel = SCCL_LOG_INFO;
} else if(strcasecmp(sccl_debug, "ABORT") == 0) {
tempScclDebugLevel = SCCL_LOG_ABORT;
}
}
//// Partition by code position
int tempScclDebugPos = -1;
{
const char* sccl_debug = getenv("SCCL_DEBUG_POS");
if(sccl_debug == NULL) {
tempScclDebugPos = SCCL_LOG_CODEALL;
} else if(strcasecmp(sccl_debug, "NET") == 0) {
tempScclDebugPos = SCCL_LOG_NET;
} else if(strcasecmp(sccl_debug, "TOPO") == 0) {
tempScclDebugPos = SCCL_LOG_TOPO;
} else if(strcasecmp(sccl_debug, "BOOTSTRAP") == 0) {
tempScclDebugPos = SCCL_LOG_BOOTSTRAP;
} else if(strcasecmp(sccl_debug, "TRANSPORT") == 0) {
tempScclDebugPos = SCCL_LOG_TRANSPORT;
} else if(strcasecmp(sccl_debug, "GRAPH") == 0) {
tempScclDebugPos = SCCL_LOG_GRAPH;
} else if(strcasecmp(sccl_debug, "CONNECT") == 0) {
tempScclDebugPos = SCCL_LOG_CONNECT;
} else if(strcasecmp(sccl_debug, "P2P") == 0) {
tempScclDebugPos = SCCL_LOG_P2P;
} else if(strcasecmp(sccl_debug, "COLLECTIVE") == 0) {
tempScclDebugPos = SCCL_LOG_COLLECTIVE;
} else if(strcasecmp(sccl_debug, "ALLOC") == 0) {
tempScclDebugPos = SCCL_LOG_ALLOC;
}
}
// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();
/* Parse and expand the SCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* SCCL_DEBUG level is > VERSION
*/
const char* scclDebugFileEnv = getenv("SCCL_DEBUG_FILE");
if(tempScclDebugLevel > SCCL_LOG_VERSION && scclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX + 1] = "";
char* dfn = debugFn;
while(scclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
if(scclDebugFileEnv[c++] != '%') {
*dfn++ = scclDebugFileEnv[c - 1];
continue;
}
switch(scclDebugFileEnv[c++]) {
case '%': // Double %
*dfn++ = '%';
break;
case 'h': // %h = hostname
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = scclDebugFileEnv[c - 1];
break;
}
}
*dfn = '\0';
if(debugFn[0] != '\0') {
FILE* file = fopen(debugFn, "w");
if(file != nullptr) {
setbuf(file, nullptr); // disable buffering
scclDebugFile = file;
}
}
}
__atomic_store_n(&scclDebugLevel, tempScclDebugLevel, __ATOMIC_RELEASE);
__atomic_store_n(&scclDebugPos, tempScclDebugPos, __ATOMIC_RELEASE);
pthread_mutex_unlock(&scclDebugLock);
}
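// Example invocation (a sketch using the variables parsed above):
//   SCCL_DEBUG_LEVEL=INFO SCCL_DEBUG_POS=NET SCCL_DEBUG_FILE=sccl.%h.%p.log ./app
// writes network-related INFO lines (and all warnings) to
// sccl.<hostname>.<pid>.log, with %h/%p expanded as implemented above.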
////////////////////////////// Printing debug messages //////////////////////////////
template <scclDebugLogLevel_t level>
void scclDebugLog(scclDebugLogPos_t pos_flags, const char* filepath, int line, const char* fmt, ...) {
if(__atomic_load_n(&scclDebugLevel, __ATOMIC_ACQUIRE) == -1)
scclDebugInit();
if constexpr(level == SCCL_LOG_WARN)
scclDebugPos = SCCL_LOG_CODEALL;
// Check the debug level and the position filter
bool isDebugLevelSufficient = (scclDebugLevel >= level);
bool isDebugPositionMatch = (scclDebugPos == SCCL_LOG_CODEALL || scclDebugPos == pos_flags);
// Skip the rest when the level is insufficient or the position does not match
if(!isDebugLevelSufficient || !isDebugPositionMatch) {
return;
}
// Save the last error (WARN) as a human readable string
if constexpr(level == SCCL_LOG_WARN) {
pthread_mutex_lock(&scclDebugLock);
va_list vargs;
va_start(vargs, fmt);
(void)vsnprintf(scclLastError, sizeof(scclLastError), fmt, vargs);
va_end(vargs);
pthread_mutex_unlock(&scclDebugLock);
}
if(tid == -1) {
tid = syscall(SYS_gettid);
}
char buffer[1024];
size_t len = 0;
if constexpr(level == SCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%d SCCL WARN ", hostname, pid, tid, filepath, line);
} else if constexpr(level == SCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%d SCCL INFO ", hostname, pid, tid, filepath, line);
}
if(len) {
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer + len, sizeof(buffer) - len, fmt, vargs);
va_end(vargs);
// vsnprintf returns the would-be length on truncation, so clamp before
// appending the newline to avoid writing past the buffer
if(len > sizeof(buffer) - 1)
len = sizeof(buffer) - 1;
buffer[len++] = '\n';
fwrite(buffer, 1, len, scclDebugFile);
}
}
} // namespace debug
#define WARN(...) debug::scclDebugLog<SCCL_LOG_WARN>(SCCL_LOG_CODEALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) debug::scclDebugLog<SCCL_LOG_INFO>((FLAGS), __FILE__, __LINE__, __VA_ARGS__)
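// Usage sketch: WARN ignores the position filter (it forces
// SCCL_LOG_CODEALL), while INFO is filtered by the flag passed first, e.g.
//   INFO(SCCL_LOG_NET, "listening socket ready, fd %d", fd);
//   WARN("socket creation failed : %d", errno);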
} // namespace sccl
#pragma once
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#define ROUNDUP(x, y) (DIVUP((x), (y)) * (y))
#define ALIGN_POWER(x, y) ((x) > (y) ? ROUNDUP(x, y) : ((y) / ((y) / (x))))
#define ALIGN_SIZE(size, align) size = ((size + (align) - 1) / (align)) * (align);
#if !__CUDA_ARCH__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif
template <typename X, typename Y, typename Z = decltype(X() + Y())>
/**
* @brief Integer division rounded up
* @tparam X Dividend type
* @tparam Y Divisor type
* @tparam Z Result type
* @param x Dividend
* @param y Divisor
* @return (x + y - 1) / y
* @note constexpr, so it can be evaluated at compile time
* @note Callable from both host and device code
*/
__host__ __device__ constexpr Z divUp(X x, Y y) {
return (x + y - 1) / y;
}
template <typename X, typename Y, typename Z = decltype(X() + Y())>
/**
* @brief Round x up to a multiple of y
*
* @tparam X Input value type
* @tparam Y Alignment base type
* @tparam Z Result type
* @param x Value to round
* @param y Alignment base
* @return constexpr Z The value rounded up
*
* @note Callable from both host (__host__) and device (__device__) code
* @note Implemented as (x + y - 1) - (x + y - 1) % y
*/
__host__ __device__ constexpr Z roundUp(X x, Y y) {
return (x + y - 1) - (x + y - 1) % y;
}
// assumes second argument is a power of 2
template <typename X, typename Z = decltype(X() + int())>
/**
* @brief Align a value up to the given boundary
*
* @tparam X Input value type
* @tparam Z Result type
* @param x Value to align
* @param a Alignment boundary (must be a power of two)
* @return constexpr Z The aligned value
*
* @note Callable from both host and device code
*/
__host__ __device__ constexpr Z alignUp(X x, int a) {
return (x + a - 1) & Z(-a);
}
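// Spot checks added for illustration; they compile away since the helpers
// are constexpr:
static_assert(divUp(10, 4) == 3, "divUp rounds the quotient up");
static_assert(roundUp(10, 4) == 12, "roundUp advances to the next multiple");
static_assert(alignUp(10, 8) == 16, "alignUp assumes a power-of-two boundary");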
#pragma once
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include "align.h"
#include "check.h"
namespace sccl {
namespace alloc {
template <typename T>
/**
* @brief Allocate and zero host memory through HIP (debug version)
*
* Allocates host memory with the HIP API and zero-initializes it. Handles the
* HIP stream-capture mode switch and records the call site (file/line) in the
* allocation log.
*
* @tparam T Element type to allocate
* @param[out] ptr Receives the allocated pointer
* @param[in] nelem Number of elements to allocate
* @param[in] filefunc File/function of the call site (for debugging)
* @param[in] line Line number of the call site (for debugging)
* @return scclResult_t Operation result (scclSuccess on success)
*
* @note Logs a warning on failure and an allocation record on success
*/
scclResult_t scclHipHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
HIPCHECKGOTO(hipHostMalloc(ptr, nelem * sizeof(T), hipHostMallocMapped), result, finish);
memset(*ptr, 0, nelem * sizeof(T));
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP host alloc %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
inline scclResult_t scclHipHostFree(void* ptr) {
HIPCHECK(hipHostFree(ptr));
return scclSuccess;
}
template <typename T>
/**
* @brief Allocate and zero memory for a number of elements (debug version)
*
* @tparam T Element type
* @param[out] ptr Receives the allocated pointer
* @param nelem Number of elements to allocate
* @param filefunc File/function of the call site (for debugging)
* @param line Line number of the call site (for debugging)
* @return scclResult_t Operation result, scclSuccess on success
*
* @note Logs the allocation and returns an error on failure
*/
scclResult_t scclCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
void* p = malloc(nelem * sizeof(T));
if(p == NULL) {
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
return scclSystemError;
}
INFO(SCCL_LOG_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p);
memset(p, 0, nelem * sizeof(T));
*ptr = (T*)p;
return scclSuccess;
}
template <typename T>
scclResult_t scclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
if(nelem < oldNelem)
return scclInternalError;
if(nelem == oldNelem)
return scclSuccess;
T* oldp = *ptr;
T* p = (T*)malloc(nelem * sizeof(T));
if(p == NULL) {
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
return scclSystemError;
}
memcpy(p, oldp, oldNelem * sizeof(T));
free(oldp);
memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T));
*ptr = (T*)p;
INFO(SCCL_LOG_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T), *ptr);
return scclSuccess;
}
struct __attribute__((aligned(64))) allocationTracker {
union {
struct {
uint64_t totalAlloc;
uint64_t totalAllocSize;
};
char align[64];
};
};
static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be size of 64 bytes");
static constexpr int MAX_ALLOC_TRACK_NGPU = 32;
extern struct allocationTracker allocTracker[];
static int scclCuMemEnable() { return 0; }
static inline scclResult_t scclCuMemAlloc(void** ptr, void* handlep, size_t size) {
WARN("CUMEM not supported prior to HIP 11.3");
return scclInternalError;
}
static inline scclResult_t scclCuMemFree(void* ptr) {
WARN("CUMEM not supported prior to HIP 11.3");
return scclInternalError;
}
template <typename T>
/**
* @brief Allocate device memory through HIP (debug version)
*
* @tparam T Element type
* @param filefunc File/function of the call site
* @param line Line number of the call site
* @param[out] ptr Receives the allocated pointer
* @param nelem Number of elements
* @param isFineGrain Whether to use fine-grained memory (defaults to false)
* @return scclResult_t Operation result code
*
* @note Logs the allocation size and pointer address.
* Chooses the fine-grained allocation flag based on the HIP_UNCACHED_MEMORY
* macro and handles the stream-capture mode switch automatically.
*/
scclResult_t scclHipMallocDebug(const char* filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocUncached), result, finish);
#else
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
HIPCHECKGOTO(hipMalloc(ptr, nelem * sizeof(T)), result, finish);
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP malloc %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
template <typename T>
/**
* @brief Allocate and zero device memory through HIP (debug version)
*
* @tparam T Element type
* @param filefunc File/function of the call site (for debugging)
* @param line Line number of the call site (for debugging)
* @param[out] ptr Receives the allocated device pointer
* @param nelem Number of elements
* @param sideStream Optional side stream (avoids disturbing graph capture)
* @param isFineGrain Whether to use fine-grained memory
* @return scclResult_t Operation result code
*
* @note 1. Updates the allocation tracker
* 2. Supports fine-grained allocation (requires HSA support)
* 3. Zeroes the memory asynchronously
* 4. Temporarily switches the stream-capture mode
*/
scclResult_t scclHipCallocDebug(const char* filefunc, int line, T** ptr, size_t nelem, hipStream_t sideStream = nullptr, bool isFineGrain = false) {
scclResult_t result = scclSuccess;
extern bool hsaFineGrainFlag;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
// Need a side stream so as not to interfere with graph capture.
hipStream_t stream = sideStream;
if(stream == nullptr)
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
if(isFineGrain && hsaFineGrainFlag) {
#if defined(HIP_UNCACHED_MEMORY)
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocUncached), result, finish);
#else
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
HIPCHECKGOTO(hipMalloc(ptr, nelem * sizeof(T)), result, finish);
HIPCHECKGOTO(hipMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
HIPCHECKGOTO(hipStreamSynchronize(stream), result, finish);
if(sideStream == nullptr)
HIPCHECKGOTO(hipStreamDestroy(stream), result, finish);
int dev;
HIPCHECK(hipGetDevice(&dev));
if(dev < MAX_ALLOC_TRACK_NGPU) {
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem * sizeof(T), __ATOMIC_RELAXED);
}
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP calloc %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
template <typename T>
/**
* @brief Asynchronously allocate and zero HIP device memory (debug version)
*
* Allocates device memory and zeroes it asynchronously on the given stream,
* with an optional fine-grained allocation mode. Also updates the allocation
* tracker and records the call site.
*
* @tparam T Element type
* @param filefunc File/function of the call site (for debugging)
* @param line Line number of the call site (for debugging)
* @param[out] ptr Receives the allocated pointer
* @param nelem Number of elements to allocate
* @param stream HIP stream for the asynchronous memset
* @param isFineGrain Whether to use fine-grained memory (defaults to false)
* @return scclResult_t Operation result (scclSuccess on success)
*
* @note Updates the global allocation tracker and logs the allocation
* @warning Logs a warning if the allocation fails
*/
scclResult_t scclHipCallocAsyncDebug(const char* filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocUncached), result, finish);
#else
HIPCHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem * sizeof(T), hipDeviceMallocFinegrained), result, finish);
#endif
} else
HIPCHECKGOTO(hipMalloc(ptr, nelem * sizeof(T)), result, finish);
HIPCHECKGOTO(hipMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
int dev;
HIPCHECK(hipGetDevice(&dev));
if(dev < MAX_ALLOC_TRACK_NGPU) {
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem * sizeof(T), __ATOMIC_RELAXED);
}
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(*ptr == nullptr)
WARN("Failed to HIP calloc async %ld bytes", nelem * sizeof(T));
INFO(SCCL_LOG_ALLOC, "%s:%d Hip Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
return result;
}
template <typename T>
/**
* Asynchronous HIP memory copy
*
* @tparam T Element type
* @param dst Destination address
* @param src Source address
* @param nelem Number of elements to copy
* @param stream HIP stream
* @return scclResult_t Operation result, scclSuccess on success
*
* @note Temporarily switches the stream-capture mode to
* hipStreamCaptureModeRelaxed and restores the original mode afterwards
*/
scclResult_t scclHipMemcpyAsync(T* dst, T* src, size_t nelem, hipStream_t stream) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
HIPCHECKGOTO(hipMemcpyAsync(dst, src, nelem * sizeof(T), hipMemcpyDefault, stream), result, finish);
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
return result;
}
template <typename T>
/**
* @brief Synchronous copy between host and device through HIP
*
* Creates a non-blocking side stream, issues the asynchronous copy on it and
* waits for completion. Uses hipStreamCaptureModeRelaxed to avoid disturbing
* graph capture.
*
* @tparam T Element type
* @param dst Destination address
* @param src Source address
* @param nelem Number of elements to copy
* @return scclResult_t Operation result code
*/
scclResult_t scclHipMemcpy(T* dst, T* src, size_t nelem) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
// Need a side stream so as not to interfere with graph capture.
hipStream_t stream;
HIPCHECKGOTO(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking), result, finish);
SCCLCHECKGOTO(scclHipMemcpyAsync(dst, src, nelem, stream), result, finish);
HIPCHECKGOTO(hipStreamSynchronize(stream), result, finish);
HIPCHECKGOTO(hipStreamDestroy(stream), result, finish);
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
return result;
}
template <typename T>
/**
* @brief Free HIP device memory
*
* Releases a device pointer allocated through HIP. Two paths:
* 1. When cuMem management is enabled, frees through scclCuMemFree
* 2. Otherwise frees directly through hipFree
*
* @tparam T Pointer type
* @param ptr Device pointer to free
* @return scclResult_t Operation result, scclSuccess on success
*
* @note Handles the HIP stream-capture mode switch around the call
*/
scclResult_t scclHipFree(T* ptr) {
scclResult_t result = scclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
INFO(SCCL_LOG_ALLOC, "Hip Free pointer %p", ptr);
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
if(scclCuMemEnable()) {
SCCLCHECKGOTO(scclCuMemFree((void*)ptr), result, finish);
} else {
HIPCHECKGOTO(hipFree(ptr), result, finish);
}
finish:
HIPCHECK(hipThreadExchangeStreamCaptureMode(&mode));
return result;
}
/**
* @brief Allocate page-aligned, zeroed memory (debug version)
*
* Allocates page-aligned memory through posix_memalign, zeroes it and records
* the allocation in the log.
*
* @param[out] ptr Receives the allocated pointer
* @param[in] size Requested size in bytes
* @param[in] filefunc File/function of the call site (for debugging)
* @param[in] line Line number of the call site (for debugging)
* @return scclResult_t Operation status (scclSuccess or scclSystemError)
*/
inline scclResult_t scclIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) {
size_t page_size = sysconf(_SC_PAGESIZE);
void* p;
int size_aligned = ROUNDUP(size, page_size);
int ret = posix_memalign(&p, page_size, size_aligned);
if(ret != 0)
return scclSystemError;
memset(p, 0, size);
*ptr = p;
INFO(SCCL_LOG_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
return scclSuccess;
}
} // namespace alloc
// scclHipHostCalloc: debug host allocation, appending file/line automatically
#define scclHipHostCalloc(...) alloc::scclHipHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// scclCalloc: debug heap allocation, appending file/line automatically
#define scclCalloc(...) alloc::scclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
// scclHipMalloc: debug HIP (Heterogeneous-Compute Interface for Portability) device allocation, prepending file/line automatically
#define scclHipMalloc(...) alloc::scclHipMallocDebug(__FILE__, __LINE__, __VA_ARGS__)
// scclHipCalloc: debug zeroed HIP device allocation, prepending file/line automatically
#define scclHipCalloc(...) alloc::scclHipCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
// scclHipCallocAsync: debug asynchronous zeroed HIP allocation, prepending file/line automatically
#define scclHipCallocAsync(...) alloc::scclHipCallocAsyncDebug(__FILE__, __LINE__, __VA_ARGS__)
// scclIbMalloc: debug InfiniBand buffer allocation, appending file/line automatically
#define scclIbMalloc(...) alloc::scclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
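// Usage sketch: call sites pass only their own arguments; the wrappers
// splice in __FILE__/__LINE__ for the allocation log, e.g.
//   int* counters = nullptr;
//   SCCLCHECK(scclCalloc(&counters, 128)); // 512 zeroed bytes, logged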
///////////////////////////////////////// Allocation and free wrappers /////////////////////////////////////////
inline scclResult_t scclHipHostFree(void* ptr) { return alloc::scclHipHostFree(ptr); } // host memory must go back through hipHostFree, not hipFree
template <typename T>
scclResult_t scclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
return alloc::scclRealloc(ptr, oldNelem, nelem);
}
template <typename T>
scclResult_t scclHipMemcpyAsync(T* dst, T* src, size_t nelem, hipStream_t stream) {
return alloc::scclHipMemcpyAsync(dst, src, nelem, stream);
}
template <typename T>
scclResult_t scclHipMemcpy(T* dst, T* src, size_t nelem) {
return alloc::scclHipMemcpy(dst, src, nelem);
}
template <typename T>
scclResult_t scclHipFree(T* ptr) {
return alloc::scclHipFree(ptr);
}
} // namespace sccl
#include "archinfo.h"
#include "check.h"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
namespace sccl {
void GcnArchNameFormat(char* gcnArchName, char* out) {
// this function parses the char array from the device properties into something easier to handle.
// as the gcnArchName attribute looks something like: "gfx900:xnack+:blah-:etc-"
char* gcnArchNameToken = strtok(gcnArchName, ":");
strcpy(out, gcnArchNameToken);
return;
}
void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName) {
if(strcmp(gcnArch, "906") == 0)
*gcnArchName = "gfx906";
else if(strcmp(gcnArch, "908") == 0)
*gcnArchName = "gfx908";
else if(strcmp(gcnArch, "910") == 0)
*gcnArchName = "gfx90a";
else if(strcmp(gcnArch, "940") == 0)
*gcnArchName = "gfx940";
else if(strcmp(gcnArch, "941") == 0)
*gcnArchName = "gfx941";
else if(strcmp(gcnArch, "942") == 0)
*gcnArchName = "gfx942";
else
*gcnArchName = gcnArch;
return;
}
int GetGcnArchName(int deviceId, char* out) {
hipDeviceProp_t devProp;
HIPCHECK(hipGetDeviceProperties(&devProp, deviceId));
GcnArchNameFormat(devProp.gcnArchName, out);
return 0;
}
double GetDeviceWallClockRateInKhz(int deviceId) {
char gcn[256];
GetGcnArchName(deviceId, gcn);
if(strncmp("gfx94", gcn, 5) == 0)
return 1.0E5;
else
return 2.5E4;
}
bool IsArchMatch(char const* arch, char const* target) { return (strncmp(arch, target, strlen(target)) == 0); }
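// Example: IsArchMatch("gfx942", "gfx94") is true (prefix match), while
// IsArchMatch("gfx90a", "gfx94") is false.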
} // namespace sccl
#ifndef ARCHINFO_H_
#define ARCHINFO_H_
#include <string.h>
/*
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
*/
namespace sccl {
// Normalize a GCN arch name into its short form (drops ":"-separated feature suffixes)
void GcnArchNameFormat(char* gcnArchName, char* out);
// Convert a GCN arch number to its arch name
void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName);
// Get the GCN arch name of the given device ID
int GetGcnArchName(int deviceId, char* out);
// Get the wall-clock rate of the given device in kHz
double GetDeviceWallClockRateInKhz(int deviceId);
// Check whether an arch name matches the given target prefix
bool IsArchMatch(char const* arch, char const* target);
} // namespace sccl
#endif // ARCHINFO_H_
#include <algorithm>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <pwd.h>
#include "param.h"
#include "debug.h"
namespace sccl {
/**
* Get the current user's home directory
*
* @return Pointer to the home directory path, or NULL if the lookup fails
*/
const char* userHomeDir() {
struct passwd* pwUser = getpwuid(getuid());
return pwUser == NULL ? NULL : pwUser->pw_dir;
}
/**
* @brief Read environment variables from a file and set them
*
* Parses a config file with one "VAR=VALUE" entry per line, skips lines that
* start with '#', and sets each parsed variable in the current process
* environment.
*
* @param fileName Path of the environment config file
*/
void setEnvFile(const char* fileName) {
FILE* file = fopen(fileName, "r");
if(file == NULL)
return;
char* line = NULL;
char envVar[1024];
char envValue[1024];
size_t n = 0;
ssize_t read;
while((read = getline(&line, &n, file)) != -1) {
if(line[0] == '#')
continue;
if(line[read - 1] == '\n')
line[read - 1] = '\0';
int s = 0; // Env Var Size
while(line[s] != '\0' && line[s] != '=')
s++;
if(line[s] == '\0')
continue;
strncpy(envVar, line, std::min(1023, s));
envVar[std::min(1023, s)] = '\0';
s++;
strncpy(envValue, line + s, 1023);
envValue[1023] = '\0';
setenv(envVar, envValue, 0);
// printf("%s : %s->%s\n", fileName, envVar, envValue);
}
if(line)
free(line);
fclose(file);
}
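// Example file in the format parsed by setEnvFile. SCCL_SET_THREAD_NAME is a
// parameter defined later in this code via SCCL_PARAM; since setenv is called
// with overwrite=0, values already in the process environment take precedence.
//
//   # lines beginning with '#' are ignored
//   SCCL_SET_THREAD_NAME=1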
/**
 * Load the SCCL configuration files.
 *
 * Lookup order:
 * 1. If the environment variable "SCCL_CONF_FILE" is set, load that file.
 * 2. Otherwise, load ".sccl.conf" from the user's home directory.
 * 3. In both cases, also attempt the system-wide "/etc/sccl.conf".
 *
 * Each file found is loaded via setEnvFile().
 */
static void initEnvFunc() {
char confFilePath[1024];
const char* userFile = getenv("SCCL_CONF_FILE");
if(userFile && strlen(userFile) > 0) {
snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
setEnvFile(confFilePath);
} else {
const char* userDir = userHomeDir();
if(userDir) {
snprintf(confFilePath, sizeof(confFilePath), "%s/.sccl.conf", userDir);
setEnvFile(confFilePath);
}
}
snprintf(confFilePath, sizeof(confFilePath), "/etc/sccl.conf");
setEnvFile(confFilePath);
return;
}
/**
 * Initialize environment configuration (thread-safe).
 *
 * Uses pthread_once so that initEnvFunc runs exactly once, even when
 * called concurrently from multiple threads.
 */
void initEnv() {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, initEnvFunc);
return;
}
/**
 * @brief Load an integer parameter from the environment and cache it.
 *
 * Reads an int64 value from the given environment variable and caches it so
 * the variable is parsed only once. If the variable is unset or fails to
 * parse, the default value is used.
 *
 * @param env           Environment variable name
 * @param deftVal       Default value
 * @param uninitialized Sentinel value marking the cache as uninitialized
 * @param cache         Pointer used to cache the parameter value
 *
 * @note Thread-safe: a mutex guards the cache update.
 */
void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&mutex);
if(__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
const char* str = scclGetEnv(env);
int64_t value = deftVal;
if(str && strlen(str) > 0) {
errno = 0;
value = strtoll(str, nullptr, 0);
if(errno) {
value = deftVal;
INFO(SCCL_LOG_CODEALL, "Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
} else {
INFO(SCCL_LOG_TRANSPORT, "%s set by environment to %lld.", env, (long long)value);
}
}
__atomic_store_n(cache, value, __ATOMIC_RELAXED);
}
pthread_mutex_unlock(&mutex);
return;
}
/**
 * Get the value of an environment variable.
 *
 * @param name Environment variable name
 * @return The variable's value, or NULL if not found
 *
 * @note Calls initEnv() first so configuration files are loaded.
 */
const char* scclGetEnv(const char* name) {
initEnv();
return getenv(name);
}
#define SCCL_THREAD_NAMELEN 16
SCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
/**
 * @brief Set the name of the given thread.
 *
 * Uses the GNU extension pthread_setname_np. The name is built from a
 * printf-style format string and truncated to SCCL_THREAD_NAMELEN bytes
 * (including the terminator). Only takes effect when _GNU_SOURCE is defined
 * and scclParamSetThreadName() returns 1.
 *
 * @param thread Thread whose name is set
 * @param fmt    printf-style format string for the name
 * @param ...    Format arguments
 */
void scclSetThreadName(pthread_t thread, const char* fmt, ...) {
// pthread_setname_np is a non-standard GNU extension and
// requires the following feature-test macro.
#ifdef _GNU_SOURCE
// Return immediately unless the thread-naming feature is enabled.
if(scclParamSetThreadName() != 1)
return;
// Build the thread name from the format string and arguments.
char threadName[SCCL_THREAD_NAMELEN];
va_list vargs;
va_start(vargs, fmt);
vsnprintf(threadName, SCCL_THREAD_NAMELEN, fmt, vargs);
va_end(vargs);
// Apply the name with pthread_setname_np.
pthread_setname_np(thread, threadName);
#endif
}
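// Illustrative call (only effective with _GNU_SOURCE and SCCL_SET_THREAD_NAME=1;
// `proxyThread` and `rank` are placeholder names):
//
//   scclSetThreadName(proxyThread, "SCCLProxy-%d", rank);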
} // namespace sccl
#pragma once
#include <pthread.h>
#include <stdint.h>
namespace sccl {
// Returns the current user's home directory path
const char* userHomeDir();
// Loads environment variables from the given file
void setEnvFile(const char* fileName);
// Initializes the environment configuration (thread-safe, runs once)
void initEnv();
// Gets the environment variable with the given name
const char* scclGetEnv(const char* name);
// Loads a parameter; if the environment variable is unset, the default value is used
void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
// Sets a thread's name using a printf-style format string
void scclSetThreadName(pthread_t thread, const char* fmt, ...);
/**
 * Define an SCCL parameter accessor.
 * This macro creates a function that returns the parameter's value.
 * The value is read from an environment variable; if the variable is unset,
 * the default value is used.
 *
 * @param name    Parameter name (suffix of the generated scclParam##name function)
 * @param env     Environment variable name, without the "SCCL_" prefix
 * @param deftVal Default value
 *
 * @return The parameter's value
 *
 * Implementation:
 * - A constexpr sentinel marks the value as uninitialized.
 * - static_assert ensures the default differs from the sentinel.
 * - A static cache variable holds the value, initially the sentinel.
 * - __builtin_expect and __atomic_load_n check whether the cache is still uninitialized.
 * - If so, scclLoadParam loads the value into the cache.
 * - The cached value is returned.
 */
#define SCCL_PARAM(name, env, deftVal) \
int64_t scclParam##name() { \
constexpr int64_t uninitialized = INT64_MIN; \
static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
static int64_t cache = uninitialized; \
if(__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
scclLoadParam("SCCL_" env, deftVal, uninitialized, &cache); \
} \
return cache; \
}
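// Usage sketch: the declaration below would generate int64_t scclParamFooBar(),
// which reads SCCL_FOO_BAR from the environment with a default of 42. "FooBar"
// and "FOO_BAR" are placeholder names for illustration.
//
//   SCCL_PARAM(FooBar, "FOO_BAR", 42);
//   ...
//   if(scclParamFooBar() > 0) { /* feature enabled */ }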
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "utils.h"
// #include "core.h"
// #include "nvmlwrap.h"
#include <dirent.h>
#include <fstream>
#include <stdlib.h>
namespace sccl {
// // Get current Compute Capability
// int scclCudaCompCap() {
// int cudaDev;
// if(cudaGetDevice(&cudaDev) != cudaSuccess)
// return 0;
// int ccMajor, ccMinor;
// if(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess)
// return 0;
// if(cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess)
// return 0;
// return ccMajor * 10 + ccMinor;
// }
// scclResult_t int64ToBusId(int64_t id, char* busId) {
// sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
// return scclSuccess;
// }
// scclResult_t busIdToInt64(const char* busId, int64_t* id) {
// char hexStr[17]; // Longest possible int64 hex string + null terminator.
// int hexOffset = 0;
// for(int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
// char c = busId[i];
// if(c == '.' || c == ':')
// continue;
// if((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
// hexStr[hexOffset++] = busId[i];
// } else
// break;
// }
// hexStr[hexOffset] = '\0';
// *id = strtol(hexStr, NULL, 16);
// return scclSuccess;
// }
// // Convert a logical cudaDev index to the NVML device minor number
// scclResult_t getBusId(int cudaDev, int64_t* busId) {
// // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
// // format. Still need to allocate proper space in case PCI domain goes
// // higher.
// char busIdStr[] = "00000000:00:00.0";
// CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
// NCCLCHECK(busIdToInt64(busIdStr, busId));
// return scclSuccess;
// }
// scclResult_t getHostName(char* hostname, int maxlen, const char delim) {
// if(gethostname(hostname, maxlen) != 0) {
// strncpy(hostname, "unknown", maxlen);
// return scclSystemError;
// }
// int i = 0;
// while((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
// i++;
// hostname[i] = '\0';
// return scclSuccess;
// }
// uint64_t getHash(const char* string, int n) {
// // Based on DJB2a, result = result * 33 ^ char
// uint64_t result = 5381;
// for(int c = 0; c < n; c++) {
// result = ((result << 5) + result) ^ string[c];
// }
// return result;
// }
// /* Generate a hash of the unique identifying string for this host
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
// *
// * This string can be overridden by using the NCCL_HOSTID env var.
// */
// #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
// uint64_t getHostHash(void) {
// char hostHash[1024];
// char* hostId;
// // Fall back is the full hostname if something fails
// (void)getHostName(hostHash, sizeof(hostHash), '\0');
// int offset = strlen(hostHash);
// if((hostId = getenv("NCCL_HOSTID")) != NULL) {
// INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
// strncpy(hostHash, hostId, sizeof(hostHash));
// } else {
// FILE* file = fopen(HOSTID_FILE, "r");
// if(file != NULL) {
// char* p;
// if(fscanf(file, "%ms", &p) == 1) {
// strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
// free(p);
// }
// }
// fclose(file);
// }
// // Make sure the string is terminated
// hostHash[sizeof(hostHash) - 1] = '\0';
// TRACE(NCCL_INIT, "unique hostname '%s'", hostHash);
// return getHash(hostHash, strlen(hostHash));
// }
// /* Generate a hash of the unique identifying string for this process
// * that will be unique for both bare-metal and container instances
// * Equivalent of a hash of;
// *
// * $$ $(readlink /proc/self/ns/pid)
// */
// uint64_t getPidHash(void) {
// char pname[1024];
// // Start off with our pid ($$)
// sprintf(pname, "%ld", (long)getpid());
// int plen = strlen(pname);
// int len = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen);
// if(len < 0)
// len = 0;
// pname[plen + len] = '\0';
// TRACE(NCCL_INIT, "unique PID '%s'", pname);
// return getHash(pname, strlen(pname));
// }
// int parseStringList(const char* string, struct netIf* ifList, int maxList) {
// if(!string)
// return 0;
// const char* ptr = string;
// int ifNum = 0;
// int ifC = 0;
// char c;
// do {
// c = *ptr;
// if(c == ':') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = atoi(ptr + 1);
// ifNum++;
// ifC = 0;
// }
// while(c != ',' && c != '\0')
// c = *(++ptr);
// } else if(c == ',' || c == '\0') {
// if(ifC > 0) {
// ifList[ifNum].prefix[ifC] = '\0';
// ifList[ifNum].port = -1;
// ifNum++;
// ifC = 0;
// }
// } else {
// ifList[ifNum].prefix[ifC] = c;
// ifC++;
// }
// ptr++;
// } while(ifNum < maxList && c);
// return ifNum;
// }
// static bool matchIf(const char* string, const char* ref, bool matchExact) {
// // Make sure to include '\0' in the exact case
// int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
// return strncmp(string, ref, matchLen) == 0;
// }
// static bool matchPort(const int port1, const int port2) {
// if(port1 == -1)
// return true;
// if(port2 == -1)
// return true;
// if(port1 == port2)
// return true;
// return false;
// }
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
// // Make an exception for the case where no user list is defined
// if(listSize == 0)
// return true;
// for(int i = 0; i < listSize; i++) {
// if(matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
// return true;
// }
// }
// return false;
// }
// __thread struct scclThreadSignal scclThreadSignalLocalInstance = scclThreadSignalStaticInitializer();
// void* scclMemoryStack::allocateSpilled(struct scclMemoryStack* me, size_t size, size_t align) {
// // `me->hunks` points to the top of the stack non-empty hunks. Hunks above
// // this (reachable via `->above`) are empty.
// struct Hunk* top = me->topFrame.hunk;
// size_t mallocSize = 0;
// // If we have lots of space left in hunk but that wasn't enough then we'll
// // allocate the object unhunked.
// if(me->topFrame.end - me->topFrame.bumper >= 8 << 10)
// goto unhunked;
// // If we have another hunk (which must be empty) waiting above this one and
// // the object fits then use that.
// if(top && top->above) {
// struct Hunk* top1 = top->above;
// uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align - 1) & -uintptr_t(align);
// if(uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
// me->topFrame.hunk = top1;
// me->topFrame.bumper = uobj + size;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// { // If the next hunk we're going to allocate wouldn't be big enough but the
// // Unhunk proxy fits in the current hunk then go allocate as unhunked.
// size_t nextSize = (top ? top->size : 0) + (64 << 10);
// constexpr size_t maxAlign = 64;
// if(nextSize < sizeof(struct Hunk) + maxAlign + size) {
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// if(uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
// goto unhunked;
// }
// // At this point we must need another hunk, either to fit the object
// // itself or its Unhunk proxy.
// mallocSize = nextSize;
// INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// struct Hunk* top1 = (struct Hunk*)malloc(mallocSize);
// if(top1 == nullptr)
// goto malloc_exhausted;
// top1->size = nextSize;
// top1->above = nullptr;
// if(top)
// top->above = top1;
// top = top1;
// me->topFrame.hunk = top;
// me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
// me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
// }
// { // Try to fit object in the new top hunk.
// uintptr_t uobj = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// if(uobj + size <= me->topFrame.end) {
// me->topFrame.bumper = uobj + size;
// return reinterpret_cast<void*>(uobj);
// }
// }
// unhunked: { // We need to allocate the object out-of-band and put an Unhunk proxy in-band
// // to keep track of it.
// uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk) - 1) & -uintptr_t(alignof(Unhunk));
// Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
// me->topFrame.bumper = uproxy + sizeof(Unhunk);
// proxy->next = me->topFrame.unhunks;
// me->topFrame.unhunks = proxy;
// mallocSize = size;
// proxy->obj = malloc(mallocSize);
// INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
// if(proxy->obj == nullptr)
// goto malloc_exhausted;
// return proxy->obj;
// }
// malloc_exhausted:
// WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
// abort();
// }
// void scclMemoryStackDestruct(struct scclMemoryStack* me) {
// // Free unhunks first because both the frames and unhunk proxies lie within the hunks.
// struct scclMemoryStack::Frame* f = &me->topFrame;
// while(f != nullptr) {
// struct scclMemoryStack::Unhunk* u = f->unhunks;
// while(u != nullptr) {
// free(u->obj);
// u = u->next;
// }
// f = f->below;
// }
// // Free hunks
// struct scclMemoryStack::Hunk* h = me->stub.above;
// while(h != nullptr) {
// struct scclMemoryStack::Hunk* h1 = h->above;
// free(h);
// h = h1;
// }
// }
// typedef struct {
// pid_t pid;
// pid_t ppid;
// char pcmdLine[4096];
// char cmdLine[4096];
// } appConfigOptimizeArg_t;
// static bool barrier_Flag;
// int maxGPUs = -1;
// int initInfo() {
// /* get barrier_Flag */
// uint32_t index = 0;
// appConfigOptimizeArg_t args = {0};
// args.pid = getpid();
// args.ppid = getppid();
// std::string cmdLinePath = "/proc/" + std::to_string(args.ppid) + "/cmdline";
// std::ifstream cmdLineFile;
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.pcmdLine, sizeof(args.pcmdLine));
// cmdLineFile.close();
// cmdLinePath = "/proc/" + std::to_string(args.pid) + "/cmdline";
// cmdLineFile.open(cmdLinePath.c_str());
// cmdLineFile.read(args.cmdLine, sizeof(args.cmdLine));
// cmdLineFile.close();
// if(memmem(args.cmdLine, sizeof(args.cmdLine), "sccl_context_test", strlen("sccl_context_test")) ||
// memmem(args.pcmdLine, sizeof(args.pcmdLine), "sccl_context_test", strlen("sccl_context_test"))) {
// barrier_Flag = true;
// } else {
// barrier_Flag = false;
// }
// INFO(NCCL_INIT, "Init config for sccl_context_test: %d", barrier_Flag);
// /* get maximum number of GPUs in all NUMA nodes */
// if(maxGPUs == -1) {
// int gpuCount[32] = {0}; // Assume MAX_NUMA_NODES=32
// int deviceCount;
// hipGetDeviceCount(&deviceCount);
// // Get numbers of GPUs in all NUMA nodes in system
// for(int i = 1; i <= deviceCount; ++i) {
// char path[256];
// snprintf(path, sizeof(path), "/sys/class/drm/card%d/device/numa_node", i);
// FILE* fp = fopen(path, "r");
// if(fp == NULL) {
// perror("Error opening NUMA node file");
// continue;
// }
// int numaNode;
// if(fscanf(fp, "%d", &numaNode) == 1 && numaNode >= 0 && numaNode < 32) {
// gpuCount[numaNode]++;
// }
// fclose(fp);
// }
// // Find maximum number of GPUs in all NUMA nodes
// for(int i = 0; i < 32; ++i) {
// if(gpuCount[i] > maxGPUs) {
// maxGPUs = gpuCount[i];
// }
// }
// INFO(NCCL_INIT, "Maximum number of GPUs in any NUMA node: %d\n", maxGPUs);
// }
// return 0;
// }
// bool getBarrierFlag() { return barrier_Flag; }
// int getNumaMaxGpus() { return maxGPUs; }
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_UTILS_H_
#define NCCL_UTILS_H_
#include "check.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <new>
namespace sccl {
// int ncclCudaCompCap();
// scclResult_t int64ToBusId(int64_t id, char* busId);
// scclResult_t busIdToInt64(const char* busId, int64_t* id);
// ncclResult_t getBusId(int cudaDev, int64_t* busId);
// ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
// uint64_t getHash(const char* string, int n);
// uint64_t getHostHash();
// uint64_t getPidHash();
// ncclResult_t getRandomData(void* buffer, size_t bytes);
// struct netIf {
// char prefix[64];
// int port;
// };
// int parseStringList(const char* string, struct netIf* ifList, int maxList);
// bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
// static long log2i(long n) {
// long l = 0;
// while(n >>= 1)
// l++;
// return l;
// }
// inline uint64_t clockNano() {
// struct timespec ts;
// clock_gettime(CLOCK_MONOTONIC, &ts);
// return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
// }
// /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
// * return -1 */
// inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
// ncclResult_t ret = ncclSuccess;
// if(bytes > 0) {
// const size_t one = 1UL;
// FILE* fp = fopen("/dev/urandom", "r");
// if(buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
// ret = ncclSystemError;
// if(fp)
// fclose(fp);
// }
// return ret;
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename Int>
// inline void ncclAtomicRefCountIncrement(Int* refs) {
// __atomic_fetch_add(refs, 1, __ATOMIC_RELAXED);
// }
// template <typename Int>
// inline Int ncclAtomicRefCountDecrement(Int* refs) {
// return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
// }
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
// * granularity of LIFO is not per object, instead frames containing many objects
// * are pushed and popped. Therefor deallocation is extremely cheap since its
// * done at the frame granularity.
// *
// * The initial state of the stack is with one frame, the "nil" frame, which
// * cannot be popped. Therefor objects allocated in the nil frame cannot be
// * deallocated sooner than stack destruction.
// */
// struct ncclMemoryStack;
// void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
// void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
// void ncclMemoryStackPush(struct ncclMemoryStack* me);
// void ncclMemoryStackPop(struct ncclMemoryStack* me);
// template <typename T>
// T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n = 1);
// int initInfo();
// bool getBarrierFlag();
// int getNumaMaxGpus();
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for
// * a pool instance to ever hold objects whose type have differing
// * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
// * a backing `ncclMemoryStack` passed during Alloc(). If memory
// * backing any currently held object is deallocated then it is an error to do
// * anything other than reconstruct it, after which it is a valid empty pool.
// */
// struct ncclMemoryPool;
// // Equivalent to zero-initialization
// void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
// template <typename T>
// T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
// template <typename T>
// void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
// void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
// * field is given via the `next` template argument.
// *
// * Example:
// * struct Foo {
// * struct Foo *next1, *next2; // can be a member of two lists at once
// * };
// * ncclIntruQueue<Foo, &Foo::next1> list1;
// * ncclIntruQueue<Foo, &Foo::next2> list2;
// */
// template <typename T, T* T::* next>
// struct ncclIntruQueue;
// template <typename T, T* T::* next>
// void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x);
// template <typename T, T* T::* next>
// T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me);
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* memPool);
// ////////////////////////////////////////////////////////////////////////////////
// /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
// * and "cond" fields are part of the public interface.
// */
// struct ncclThreadSignal {
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
// // returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();
// void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
// void ncclThreadSignalDestruct(struct ncclThreadSignal* me);
// // A convenience instance per-thread.
// extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc;
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me);
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me);
// // Enqueue element. Returns true if queue is not abandoned. Even if queue is
// // abandoned the element enqueued, so the caller needs to make arrangements for
// // the queue to be tended.
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T, next>* me, T* x);
// // Dequeue all elements at a glance. If there aren't any and `waitSome` is
// // true then this call will wait until it can return a non empty list.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T, next>* me, bool waitSome);
// // Dequeue all elements and set queue to abandoned state.
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T, next>* me);
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryStack {
// struct Hunk {
// struct Hunk* above; // reverse stack pointer
// size_t size; // size of this allocation (including this header struct)
// };
// struct Unhunk { // proxy header for objects allocated out-of-hunk
// struct Unhunk* next;
// void* obj;
// };
// struct Frame {
// struct Hunk* hunk; // top of non-empty hunks
// uintptr_t bumper, end; // points into top hunk
// struct Unhunk* unhunks;
// struct Frame* below;
// };
// static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
// static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);
// struct Hunk stub;
// struct Frame topFrame;
// };
// inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
// me->stub.above = nullptr;
// me->stub.size = 0;
// me->topFrame.hunk = &me->stub;
// me->topFrame.bumper = 0;
// me->topFrame.end = 0;
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = nullptr;
// }
// inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
// uintptr_t o = (me->topFrame.bumper + align - 1) & -uintptr_t(align);
// void* obj;
// if(__builtin_expect(o + size <= me->topFrame.end, true)) {
// me->topFrame.bumper = o + size;
// obj = reinterpret_cast<void*>(o);
// } else {
// obj = allocateSpilled(me, size, align);
// }
// return obj;
// }
// template <typename T>
// inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
// void* obj = ncclMemoryStack::allocate(me, n * sizeof(T), alignof(T));
// memset(obj, 0, n * sizeof(T));
// return (T*)obj;
// }
// inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
// using Frame = ncclMemoryStack::Frame;
// Frame tmp = me->topFrame;
// Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
// *snapshot = tmp; // C++ struct assignment
// me->topFrame.unhunks = nullptr;
// me->topFrame.below = snapshot;
// }
// inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
// ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
// while(un != nullptr) {
// free(un->obj);
// un = un->next;
// }
// me->topFrame = *me->topFrame.below; // C++ struct assignment
// }
// ////////////////////////////////////////////////////////////////////////////////
// struct ncclMemoryPool {
// struct Cell {
// Cell* next;
// };
// template <int Size, int Align>
// union CellSized {
// Cell cell;
// alignas(Align) char space[Size];
// };
// struct Cell* head;
// struct Cell* tail; // meaningful only when head != nullptr
// };
// inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { me->head = nullptr; }
// template <typename T>
// inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
// using Cell = ncclMemoryPool::Cell;
// using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
// Cell* cell;
// if(__builtin_expect(me->head != nullptr, true)) {
// cell = me->head;
// me->head = cell->next;
// } else {
// // Use the internal allocate() since it doesn't memset to 0 yet.
// cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
// }
// memset(cell, 0, sizeof(T));
// return reinterpret_cast<T*>(cell);
// }
// template <typename T>
// inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
// using Cell = ncclMemoryPool::Cell;
// Cell* cell = reinterpret_cast<Cell*>(obj);
// cell->next = me->head;
// if(me->head == nullptr)
// me->tail = cell;
// me->head = cell;
// }
// inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
// if(from->head != nullptr) {
// from->tail->next = me->head;
// if(me->head == nullptr)
// me->tail = from->tail;
// me->head = from->head;
// from->head = nullptr;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueue {
// T *head, *tail;
// };
// template <typename T, T* T::* next>
// inline void ncclIntruQueueConstruct(ncclIntruQueue<T, next>* me) {
// me->head = nullptr;
// me->tail = nullptr;
// }
// template <typename T, T* T::* next>
// inline bool ncclIntruQueueEmpty(ncclIntruQueue<T, next>* me) {
// return me->head == nullptr;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueHead(ncclIntruQueue<T, next>* me) {
// return me->head;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTail(ncclIntruQueue<T, next>* me) {
// return me->tail;
// }
// template <typename T, T* T::* next>
// inline void ncclIntruQueueEnqueue(ncclIntruQueue<T, next>* me, T* x) {
// x->*next = nullptr;
// (me->head ? me->tail->*next : me->head) = x;
// me->tail = x;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// return ans;
// }
// template <typename T, T* T::* next>
// inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T, next>* me) {
// T* ans = me->head;
// if(ans != nullptr) {
// me->head = ans->*next;
// if(me->head == nullptr)
// me->tail = nullptr;
// }
// return ans;
// }
// template <typename T, T* T::* next>
// void ncclIntruQueueFreeAll(ncclIntruQueue<T, next>* me, ncclMemoryPool* pool) {
// T* head = me->head;
// me->head = nullptr;
// me->tail = nullptr;
// while(head != nullptr) {
// T* tmp = head->*next;
// ncclMemoryPoolFree(pool, tmp);
// head = tmp;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; }
// inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
// pthread_mutex_init(&me->mutex, nullptr);
// pthread_cond_init(&me->cond, nullptr);
// }
// inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
// pthread_mutex_destroy(&me->mutex);
// pthread_cond_destroy(&me->cond);
// }
// ////////////////////////////////////////////////////////////////////////////////
// template <typename T, T* T::* next>
// struct ncclIntruQueueMpsc {
// T* head;
// uintptr_t tail;
// struct ncclThreadSignal* waiting;
// };
// template <typename T, T* T::* next>
// void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T, next>* me) {
// me->head = nullptr;
// me->tail = 0x0;
// me->waiting = nullptr;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T, next>* me) {
// return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
// }
// template <typename T, T* T::* next>
// bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T, next>* me, T* x) {
// __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
// T* prev = reinterpret_cast<T*>(utail);
// T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
// __atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
// if(utail == 0x1) { // waiting
// __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
// // This lock/unlock is essential to ensure we don't race ahead of the consumer
// // and signal the cond before they begin waiting on it.
// struct ncclThreadSignal* waiting = me->waiting;
// pthread_mutex_lock(&waiting->mutex);
// pthread_mutex_unlock(&waiting->mutex);
// pthread_cond_broadcast(&waiting->cond);
// }
// return utail != 0x2; // not abandoned
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T, next>* me, bool waitSome) {
// T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head == nullptr) {
// if(!waitSome)
// return nullptr;
// uint64_t t0 = clockNano();
// bool sleeping = false;
// do {
// if(clockNano() - t0 >= 10 * 1000) { // spin for first 10us
// struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
// pthread_mutex_lock(&waitSignal->mutex);
// uintptr_t expected = sleeping ? 0x1 : 0x0;
// uintptr_t desired = 0x1;
// me->waiting = waitSignal; // release done by successful compare exchange
// if(__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
// sleeping = true;
// pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
// }
// pthread_mutex_unlock(&waitSignal->mutex);
// }
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// } while(head == nullptr);
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// int spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// template <typename T, T* T::* next>
// T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T, next>* me) {
// uintptr_t expected = 0x0;
// if(__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// return nullptr;
// } else {
// int spins = 0;
// T* head;
// while(true) {
// head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
// if(head != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
// uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
// T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
// T* x = head;
// while(x != tail) {
// T* x1;
// spins = 0;
// while(true) {
// x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
// if(x1 != nullptr)
// break;
// if(++spins == 1024) {
// spins = 1024 - 1;
// sched_yield();
// }
// }
// x = x1;
// }
// return head;
// }
// }
// ////////////////////////////////////////////////////////////////////////////////
// static inline long get_now_ns(void) {
// struct timespec time;
// if(clock_gettime(CLOCK_MONOTONIC, &time) != 0) {
// return 0;
// }
// return time.tv_sec * 1000000000L + time.tv_nsec;
// }
// static inline void thread_bind_cpu(int coreid) {
// cpu_set_t cpuset;
// CPU_ZERO(&cpuset);
// CPU_SET(coreid, &cpuset);
// pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
// }
} // namespace sccl
#endif