Commit d9d23f34 authored by lishen's avatar lishen
Browse files

Initial Code for SCCL_v1

parent 57df3737
#pragma once
#include <stdint.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace net {
// One entry of a network interface list: a name prefix plus a port number.
struct netIf { // Network interface descriptor
char prefix[64]; // Network prefix
int port; // Port number
};
// Parse a string list and store the result in the network interface list.
int parseStringList(const char* string, struct netIf* ifList, int maxList);
// Match the given string and port against the interfaces in the network interface list.
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
// Run the one-time ROCm runtime library initialization and return its recorded result.
scclResult_t rocmLibraryInit(void);
////////////////////////////////// 用于定义网络设备 //////////////////////////////////
// Properties of one network device, filled in by scclNet_t::getProperties.
typedef struct {
char* name; // Used mainly for logging.
char* pciPath; // Path of the PCI device in /sys.
uint64_t guid; // Unique identifier of the NIC chip. Important for cards with multiple PCI functions (physical or virtual).
int ptrSupport; // [SCCL_PTR_HOST|SCCL_PTR_CUDA|SCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of communicators we can create
int maxRecvs; // Maximum number of grouped receives.
} scclNetProperties_t;
// Table of function pointers that a network backend provides.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
scclResult_t (*init)();
// Return the number of adapters.
scclResult_t (*devices)(int* ndev);
// Get various device properties.
scclResult_t (*getProperties)(int dev, scclNetProperties_t* props);
// Create a receiving object and provide a handle to connect to it. The handle can be up to SCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged between
// ranks to create a connection.
scclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established; it should instead return successfully with sendComm == NULL,
// expecting to be called again until sendComm != NULL.
scclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after the remote peer has called connect.
// This call must not block for the connection to be established; it should instead return successfully with recvComm == NULL,
// expecting to be called again until recvComm != NULL.
scclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either SCCL_PTR_HOST or SCCL_PTR_CUDA.
scclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
scclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
scclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
scclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous receive from a peer. May return request == NULL if the call cannot be performed (or would block)
scclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with SCCL_PTR_CUDA is visible to the GPU
scclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the number of bytes sent/received.
scclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
scclResult_t (*closeSend)(void* sendComm);
scclResult_t (*closeRecv)(void* recvComm);
scclResult_t (*closeListen)(void* listenComm);
} scclNet_t;
////////////////////////////////// 其他定义 //////////////////////////////////
// Pointer/memory kinds. The values are distinct bits; scclNetProperties_t::ptrSupport is documented as an OR-combination of them.
typedef enum sccl_ptr {
SCCL_PTR_HOST = 0x1,
SCCL_PTR_CUDA = 0x2,
SCCL_PTR_DMABUF = 0x4
} sccl_ptr_t;
// Upper bound, in bytes, on the connection handle exchanged through scclNet_t::listen / connect.
#define SCCL_NET_HANDLE_MAXSIZE 128
} // namespace net
} // namespace hardware
} // namespace sccl
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <dlfcn.h>
#include <sys/utsname.h>
#include <fstream>
#include "base.h"
#include "rocm_wrap.h"
namespace sccl {
namespace hardware {
namespace net {
namespace rocm_wrap {
// Defines the function pointer pfn_<symbol> of type PFN_<symbol>, initially null.
#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN(hsa_init);
DECLARE_ROCM_PFN(hsa_system_get_info);
DECLARE_ROCM_PFN(hsa_status_string);
// NOTE(review): presumably defines scclParamDmaBufEnable() (called in initOnceFunc) with default 0 — confirm against the SCCL_PARAM definition.
SCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; // Guards the single execution of initOnceFunc()
static scclResult_t initResult; // Outcome recorded by initOnceFunc(), returned by rocmLibraryInit()
static void* hsaLib; // dlopen() handle of the ROCr runtime library
static uint16_t version_major, version_minor; // ROCr version as reported by hsa_system_get_info()
bool scclCudaLaunchBlocking = false; // Set from the CUDA_LAUNCH_BLOCKING environment variable in initOnceFunc()
//////////////////////////////////////////////////////////////////////////////
/**
 * Check that the running kernel was built with the options DMA-BUF needs.
 *
 * Scans /boot/config-<uname release> for CONFIG_DMABUF_MOVE_NOTIFY=y and
 * CONFIG_PCI_P2PDMA=y.
 *
 * @return true only when both options are found; false when the kernel
 *         release cannot be determined, the config file cannot be opened,
 *         or at least one option is missing.
 */
static bool kernelConfigSupportsDmaBuf() {
    static const char kernel_opt1[] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
    static const char kernel_opt2[] = "CONFIG_PCI_P2PDMA=y";

    struct utsname utsname;
    if(uname(&utsname) == -1) {
        // Without the release string the config file name cannot be built.
        INFO(SCCL_LOG_NET, "Could not get kernel name");
        return false;
    }

    char kernel_conf_file[128];
    snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
    FILE* fp = fopen(kernel_conf_file, "r");
    if(fp == NULL) {
        INFO(SCCL_LOG_NET, "Could not open kernel conf file");
        return false;
    }

    char buf[256];
    bool found_opt1 = false;
    bool found_opt2 = false;
    while(fgets(buf, sizeof(buf), fp) != NULL) {
        if(strstr(buf, kernel_opt1) != NULL) {
            found_opt1 = true;
            INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
        }
        if(strstr(buf, kernel_opt2) != NULL) {
            found_opt2 = true;
            INFO(SCCL_LOG_NET, "CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
        }
    }
    fclose(fp);

    if(!found_opt1 || !found_opt2) {
        INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
        INFO(SCCL_LOG_NET, "DMA_BUF_SUPPORT Failed due to OS kernel support");
        return false;
    }
    return true;
}

/**
 * One-time initialization of the ROCr runtime wrapper (run through pthread_once).
 *
 * Loads libhsa-runtime64.so (optionally prefixed by RCCL_ROCR_PATH), resolves the
 * HSA entry points, initializes the runtime, logs its version and verifies that
 * DMA-BUF is enabled and supported by both the runtime and the kernel.
 *
 * The outcome is stored in initResult: scclSuccess when every step succeeds,
 * scclSystemError otherwise (including when DMA-BUF is disabled or unsupported).
 */
static void initOnceFunc() {
    {
        // Any non-empty value other than "0" turns blocking launches on.
        const char* val = getenv("CUDA_LAUNCH_BLOCKING");
        scclCudaLaunchBlocking = val != nullptr && val[0] != 0 && !(val[0] == '0' && val[1] == 0);
    }

    // All locals with initializers are declared before the first goto so that
    // no jump to "error" crosses an initialization.
    bool dmaBufSupport = false;
    hsa_status_t res;

    /*
     * Load ROCr driver library
     */
    char path[1024];
    char* scclCudaPath = getenv("RCCL_ROCR_PATH");
    if(scclCudaPath == NULL)
        snprintf(path, sizeof(path), "%s", "libhsa-runtime64.so");
    else
        snprintf(path, sizeof(path), "%s%s", scclCudaPath, "libhsa-runtime64.so");

    hsaLib = dlopen(path, RTLD_LAZY);
    if(hsaLib == NULL) {
        // Report the path actually tried; never hand a NULL pointer to %s.
        WARN("Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s)", path, scclCudaPath != NULL ? scclCudaPath : "(unset)");
        goto error;
    }

    /*
     * Load initial ROCr functions
     */
    pfn_hsa_init = (PFN_hsa_init)dlsym(hsaLib, "hsa_init");
    if(pfn_hsa_init == NULL) {
        WARN("Failed to load ROCr missing symbol hsa_init");
        goto error;
    }

    /*
     * Required to initialize the ROCr Driver.
     * Multiple calls of hsa_init() will return immediately
     * without making any relevant change
     */
    res = pfn_hsa_init();
    if(res != HSA_STATUS_SUCCESS) {
        WARN("hsa_init failed with %d", (int)res);
        goto error;
    }

    pfn_hsa_system_get_info = (PFN_hsa_system_get_info)dlsym(hsaLib, "hsa_system_get_info");
    if(pfn_hsa_system_get_info == NULL) {
        WARN("Failed to load ROCr missing symbol hsa_system_get_info");
        goto error;
    }

    pfn_hsa_status_string = (PFN_hsa_status_string)dlsym(hsaLib, "hsa_status_string");
    if(pfn_hsa_status_string == NULL) {
        WARN("Failed to load ROCr missing symbol hsa_status_string");
        goto error;
    }

    res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
    if(res != HSA_STATUS_SUCCESS) {
        WARN("pfn_hsa_system_get_info failed with %d", (int)res);
        goto error;
    }
    res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
    if(res != HSA_STATUS_SUCCESS) {
        WARN("pfn_hsa_system_get_info failed with %d", (int)res);
        goto error;
    }
    // The version is only logged; no minimum-version check is enforced.
    INFO(SCCL_LOG_NET, "ROCr version %d.%d", version_major, version_minor);

    /* DMA-BUF support */
    // NOTE(review): the parameter is declared with the name "DMABUF_ENABLE" while this
    // message mentions SCCL_ENABLE_DMABUF_SUPPORT — confirm which variable users must set.
    if(scclParamDmaBufEnable() == 0) {
        INFO(SCCL_LOG_NET, "Dmabuf feature disabled without SCCL_ENABLE_DMABUF_SUPPORT=1");
        goto error;
    }

    // NOTE(review): 0x204 is presumably HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED from
    // hsa_ext_amd.h — confirm before replacing the literal.
    res = pfn_hsa_system_get_info((hsa_system_info_t)0x204, &dmaBufSupport);
    if(res != HSA_STATUS_SUCCESS || !dmaBufSupport) {
        INFO(SCCL_LOG_NET, "Current version of ROCm does not support dmabuf feature.");
        goto error;
    }

    pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf)dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
    if(pfn_hsa_amd_portable_export_dmabuf == NULL) {
        WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
        goto error;
    }

    // check OS kernel support
    if(!kernelConfigSupportsDmaBuf())
        goto error;
    INFO(SCCL_LOG_NET, "DMA_BUF Support Enabled");

    initResult = scclSuccess;
    return; // do not fall through into the error path

error:
    initResult = scclSystemError;
    return;
}
} // namespace rocm_wrap
// Runs rocm_wrap::initOnceFunc() at most once per process (via pthread_once)
// and returns the result that initialization recorded in rocm_wrap::initResult.
scclResult_t rocmLibraryInit() {
pthread_once(&rocm_wrap::initOnceControl, rocm_wrap::initOnceFunc);
return rocm_wrap::initResult;
}
} // namespace net
} // namespace hardware
} // namespace sccl
#pragma once
#include <hsa/hsa.h>
namespace sccl {
namespace hardware {
namespace net {
namespace rocm_wrap {
// Function-pointer types for the HSA entry points that are resolved at run time.
typedef hsa_status_t (*PFN_hsa_init)();
typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char** status_string);
typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
// Declares (without defining) the function pointer pfn_<symbol> of type PFN_<symbol>.
#define DECLARE_ROCM_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
DECLARE_ROCM_PFN_EXTERN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN_EXTERN(hsa_init);
DECLARE_ROCM_PFN_EXTERN(hsa_system_get_info);
DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
} // namespace rocm_wrap
// Performs the one-time ROCr runtime initialization and returns its result.
scclResult_t rocmLibraryInit(void);
} // namespace net
} // namespace hardware
} // namespace sccl
# hardware功能
包括基于硬件的网络连接,以及通信相关的底层指令
/*************************************************************************
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_CPUSET_H_
#define SCCL_CPUSET_H_
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
/**
 * Convert a single hexadecimal digit to its integer value.
 *
 * @param c Character to convert ('0'-'9', 'a'-'f' or 'A'-'F').
 * @return The digit value (0-15), or -1 if c is not a hexadecimal digit.
 */
static int hexToInt(char c) {
    // Explicit range checks: arithmetic on the character code alone would
    // also accept the characters located just below 'a' in ASCII.
    if(c >= '0' && c <= '9')
        return c - '0';
    if(c >= 'a' && c <= 'f')
        return 10 + (c - 'a');
    if(c >= 'A' && c <= 'F')
        return 10 + (c - 'A');
    return -1;
}
#define CPU_SET_N_U32 (sizeof(cpu_set_t) / sizeof(uint32_t))
/**
 * Convert a comma-separated hexadecimal string (e.g. "0003ff,f0003fff") to a CPU set mask.
 *
 * @param str  Input string; each comma-separated group describes one 32-bit word,
 *             groups are ordered from the highest word to the lowest.
 * @param mask Output CPU set. It is cleared first, then the parsed words are stored
 *             starting at its lowest 32-bit word.
 * @return scclSuccess (always).
 *
 * @note Parsing stops at the first character that is neither a comma nor a
 *       hexadecimal digit (for instance a trailing newline).
 *       If the string holds more groups than a cpu_set_t has 32-bit words, the
 *       highest-order groups are discarded, since the mask cannot represent them.
 */
static scclResult_t scclStrToCpuset(const char* str, cpu_set_t* mask) {
    const int nWords = (int)CPU_SET_N_U32;
    uint32_t cpumasks[CPU_SET_N_U32];
    // Words are filled from the end of the array towards the start, so that
    // cpumasks[m..nWords-1] always reads from lowest to highest word.
    int m = nWords - 1;
    cpumasks[m] = 0;
    for(const char* p = str; *p != '\0'; p++) {
        if(*p == ',') {
            if(m > 0) {
                m--;
            } else {
                // Array is full: drop the highest word to make room instead of writing before the array.
                memmove(cpumasks + 1, cpumasks, (size_t)(nWords - 1) * sizeof(uint32_t));
            }
            cpumasks[m] = 0;
            continue;
        }
        int v = hexToInt(*p);
        if(v == -1)
            break;
        cpumasks[m] = (cpumasks[m] << 4) + (uint32_t)v;
    }
    // Clear the whole mask so that words not covered by the string are zero,
    // then copy the parsed words into the low end of the mask.
    memset(mask, 0, sizeof(cpu_set_t));
    memcpy(mask, cpumasks + m, (size_t)(nWords - m) * sizeof(uint32_t));
    return scclSuccess;
}
/**
 * Render a CPU set mask as a hexadecimal string.
 *
 * @param mask CPU set to convert.
 * @param str  Output buffer receiving the NUL-terminated result; the caller
 *             must provide enough room, no length is checked here.
 * @return scclSuccess (always).
 *
 * Bytes of the cpu_set_t are emitted from the highest to the lowest as two
 * lowercase hex digits each, leading all-zero bytes are skipped, and a comma
 * is written after every byte whose index is a non-zero multiple of four.
 */
static scclResult_t scclCpusetToStr(cpu_set_t* mask, char* str) {
    static const char hexDigits[] = "0123456789abcdef";
    const uint8_t* bytes = (const uint8_t*)mask;
    char* out = str;
    int idx = (int)sizeof(cpu_set_t);
    while(idx-- > 0) {
        const uint8_t value = bytes[idx];
        // Nothing written yet and this byte is zero: still inside the leading zeros.
        if(out == str && value == 0)
            continue;
        *out++ = hexDigits[value >> 4];
        *out++ = hexDigits[value & 0x0f];
        if(idx != 0 && (idx % 4) == 0)
            *out++ = ',';
    }
    *out = '\0';
    return scclSuccess;
}
/**
 * Render a CPU set mask as a range string such as "0-3,5,7-9".
 *
 * @param mask CPU set to convert.
 * @param str  Output buffer.
 * @param len  Size of the output buffer in bytes. When it is 0 nothing is
 *             written at all (not even a terminator).
 * @return str.
 *
 * The result is truncated when the buffer is too small; conversion stops as
 * soon as the buffer is full. An empty set yields an empty string.
 */
static char* scclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
    if(len == 0)
        return str; // no room for anything; also keeps "len - 1" below from wrapping around

    size_t used = 0; // characters produced so far (excluding the terminator)
    int start  = -1; // first CPU of the range being scanned, -1 when outside a range
    str[0]     = '\0';

    // One extra iteration past the last CPU closes a range that reaches the end of the set.
    for(int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
        const int isSet = (cpu < CPU_SETSIZE) && CPU_ISSET(cpu, mask);
        if(isSet) {
            if(start == -1)
                start = cpu;
            continue;
        }
        if(start == -1)
            continue;

        // A range just ended at cpu - 1: append it, comma-separated from the previous one.
        const char* sep = used ? "," : "";
        int n;
        if(cpu - 1 == start)
            n = snprintf(str + used, len - used, "%s%d", sep, start);
        else
            n = snprintf(str + used, len - used, "%s%d-%d", sep, start, cpu - 1);
        if(n < 0) {
            str[used] = '\0';
            break;
        }
        used += (size_t)n;
        if(used >= len - 1)
            break; // buffer full (output possibly truncated)
        start = -1;
    }
    return str;
}
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nvmlwrap.h"
#include "base.h"
#include <initializer_list>
#include <memory>
#include <mutex>
namespace sccl {
namespace hardware {
namespace topology {
// Global tables filled by scclNvmlEnsureInitialized(): device count, per-device info and per-pair P2P status.
int scclNvmlDeviceCount = 0;
scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];
// SCCL_NVML_FN(name, rettype, arglist) defines the function pointer pfn_<name>:
// - direct mode: a constexpr pointer bound to the NVML function itself;
// - otherwise: a pointer starting as nullptr, later assigned from dlsym().
#if SCCL_NVML_DIRECT
#define SCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name) arglist = name;
#else
#include <dlfcn.h>
#define SCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name) arglist = nullptr;
#endif
// File-local state: one function pointer per wrapped NVML entry point, plus the initialization bookkeeping.
namespace {
SCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
SCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
SCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive))
SCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci))
SCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult))
SCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
SCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
SCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values))
std::mutex lock; // NVML has had some thread safety bugs
bool initialized = false; // Set once the first initialization attempt has started (read/written under lock)
thread_local bool threadInitialized = false; // Per-thread flag letting scclNvmlEnsureInitialized() skip the lock on later calls
scclResult_t initResult; // Result of the initialization attempt, returned to every caller
} // namespace
/**
 * Initialize NVML once and fill the global scclNvml tables.
 *
 * On the first call (process-wide) this loads libnvidia-ml.so.1 and resolves the
 * NVML symbols (unless SCCL_NVML_DIRECT is set), initializes NVML, then records
 * for every device its handle and compute capability, and for every device pair
 * its read and write P2P status. Later calls return the recorded result.
 *
 * @return scclSuccess on success;
 *         scclSystemError when the library or a required symbol is missing or an NVML call fails;
 *         scclInternalError when NVML reports more devices than scclNvmlMaxDevices.
 */
scclResult_t scclNvmlEnsureInitialized() {
    // Optimization to avoid repeatedly grabbing the lock when we only want to
    // read from the global tables.
    if(threadInitialized)
        return initResult;
    threadInitialized = true;

    std::lock_guard<std::mutex> locked(lock);
    if(initialized)
        return initResult;
    initialized = true;

#if !SCCL_NVML_DIRECT
    if(pfn_nvmlInit == nullptr) {
        void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
        if(libhandle == nullptr) {
            WARN("Failed to open libnvidia-ml.so.1");
            initResult = scclSystemError;
            return initResult;
        }
        struct Symbol {
            void** ppfn;
            char const* name;
        };
        std::initializer_list<Symbol> symbols = {{(void**)&pfn_nvmlInit, "nvmlInit"},
                                                 {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
                                                 {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
                                                 {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
                                                 {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
                                                 {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
                                                 {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
                                                 {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
                                                 {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
                                                 {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
                                                 {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
                                                 {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}};
        for(Symbol sym : symbols) {
            *sym.ppfn = dlsym(libhandle, sym.name);
        }
    }
#endif

#if SCCL_NVML_DIRECT
    bool have_v2 = true;
#else
    bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the SCCL_NVML_DIRECT=1 case then GCC warns about it never being null

    // Everything below calls these entry points unconditionally, so a symbol that
    // dlsym() could not resolve must be reported here rather than called through a null pointer.
    const bool haveInitPair = have_v2 ? (pfn_nvmlDeviceGetCount_v2 != nullptr) : (pfn_nvmlInit != nullptr && pfn_nvmlDeviceGetCount != nullptr);
    if(!haveInitPair || pfn_nvmlErrorString == nullptr || pfn_nvmlDeviceGetHandleByIndex == nullptr || pfn_nvmlDeviceGetCudaComputeCapability == nullptr ||
       pfn_nvmlDeviceGetP2PStatus == nullptr) {
        WARN("libnvidia-ml.so.1 is missing one or more required NVML symbols");
        initResult = scclSystemError;
        return initResult;
    }
#endif

    nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
    if(res1 != NVML_SUCCESS) {
        WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
        initResult = scclSystemError;
        return initResult;
    }

    unsigned int ndev;
    res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
    if(res1 != NVML_SUCCESS) {
        WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
        initResult = scclSystemError;
        return initResult;
    }

    scclNvmlDeviceCount = int(ndev);
    if(scclNvmlMaxDevices < scclNvmlDeviceCount) {
        WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (scclNvmlMaxDevices=%d)", scclNvmlDeviceCount, scclNvmlMaxDevices);
        initResult = scclInternalError;
        return initResult;
    }

    // Per-device table: handle and compute capability.
    for(int a = 0; a < scclNvmlDeviceCount; a++) {
        res1 = pfn_nvmlDeviceGetHandleByIndex(a, &scclNvmlDevices[a].handle);
        if(res1 != NVML_SUCCESS) {
            WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
            initResult = scclSystemError;
            return initResult;
        }

        res1 = pfn_nvmlDeviceGetCudaComputeCapability(
            scclNvmlDevices[a].handle, &scclNvmlDevices[a].computeCapabilityMajor, &scclNvmlDevices[a].computeCapabilityMinor);
        if(res1 != NVML_SUCCESS) {
            WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
            initResult = scclSystemError;
            return initResult;
        }
    }

    // Per-pair table: read and write P2P status.
    for(int a = 0; a < scclNvmlDeviceCount; a++) {
        for(int b = 0; b < scclNvmlDeviceCount; b++) {
            nvmlDevice_t da = scclNvmlDevices[a].handle;
            nvmlDevice_t db = scclNvmlDevices[b].handle;

            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &scclNvmlDevicePairs[a][b].p2pStatusRead);
            if(res1 != NVML_SUCCESS) {
                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
                initResult = scclSystemError;
                return initResult;
            }

            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &scclNvmlDevicePairs[a][b].p2pStatusWrite);
            if(res1 != NVML_SUCCESS) {
                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1));
                initResult = scclSystemError;
                return initResult;
            }
        }
    }

    initResult = scclSuccess;
    return initResult;
}
// Calls pfn_<name>(args...). On any result other than NVML_SUCCESS, logs a warning
// with the NVML error string and makes the enclosing function return scclSystemError.
#define NVMLCHECK(name, ...) \
do { \
nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
if(e44241808 != NVML_SUCCESS) { \
WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
return scclSystemError; \
} \
} while(0)
// Like NVMLCHECK but quieter: when not in direct mode and pfn_<name> is null, the enclosing
// function returns scclInternalError without logging; a failing call returns scclSystemError
// and is logged at INFO level only, and not at all for NVML_ERROR_NOT_SUPPORTED.
#define NVMLTRY(name, ...) \
do { \
if(!SCCL_NVML_DIRECT && pfn_##name == nullptr) \
return scclInternalError; /* missing symbol is not a warned error */ \
nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
if(e44241808 != NVML_SUCCESS) { \
if(e44241808 != NVML_ERROR_NOT_SUPPORTED) \
INFO(SCCL_LOG_TOPO, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
return scclSystemError; \
} \
} while(0)
// Looks up the NVML device handle for a PCI bus id by calling NVML under the global lock.
// Returns scclSuccess, the initialization error, or scclSystemError if the NVML call fails.
scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
return scclSuccess;
}
/**
 * Return the cached NVML handle of the device with the given index.
 *
 * @param index  Device index, must be lower than scclNvmlDeviceCount.
 * @param device Receives the handle recorded during initialization.
 * @return scclSuccess on success, the initialization error if NVML could not be
 *         initialized, or scclInvalidArgument when index is out of range.
 */
scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    // Entries at or beyond scclNvmlDeviceCount were never filled (or lie outside the table).
    if(index >= (unsigned int)scclNvmlDeviceCount)
        return scclInvalidArgument;
    *device = scclNvmlDevices[index].handle;
    return scclSuccess;
}
/**
 * Find the index of a device handle in the cached device table.
 *
 * @param device Handle to look for.
 * @param index  Receives the position of the first matching entry.
 * @return scclSuccess when found, scclInvalidArgument when the handle is not in
 *         the table, or the initialization error.
 */
scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    int pos = 0;
    while(pos < scclNvmlDeviceCount && !(scclNvmlDevices[pos].handle == device))
        pos++;
    if(pos >= scclNvmlDeviceCount)
        return scclInvalidArgument;
    *index = pos;
    return scclSuccess;
}
// Forwards to nvmlDeviceGetNvLinkState under the global lock, using NVMLTRY
// (a missing symbol yields scclInternalError, a failing call scclSystemError).
scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
return scclSuccess;
}
// Forwards to nvmlDeviceGetNvLinkRemotePciInfo under the global lock, using NVMLTRY
// (a missing symbol yields scclInternalError, a failing call scclSystemError).
scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
return scclSuccess;
}
// Forwards to nvmlDeviceGetNvLinkCapability under the global lock, using NVMLTRY
// (a missing symbol yields scclInternalError, a failing call scclSystemError).
scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
return scclSuccess;
}
/**
 * Return the compute capability cached for a device handle.
 *
 * @param device Handle to look for in the cached device table.
 * @param major  Receives the cached major version.
 * @param minor  Receives the cached minor version.
 * @return scclSuccess when the handle is known, scclInvalidArgument otherwise,
 *         or the initialization error.
 */
scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
    SCCLCHECK(scclNvmlEnsureInitialized());
    const scclNvmlDeviceInfo* const tableEnd = scclNvmlDevices + scclNvmlDeviceCount;
    for(const scclNvmlDeviceInfo* entry = scclNvmlDevices; entry < tableEnd; ++entry) {
        if(!(device == entry->handle))
            continue;
        *major = entry->computeCapabilityMajor;
        *minor = entry->computeCapabilityMinor;
        return scclSuccess;
    }
    return scclInvalidArgument;
}
/**
 * Return the P2P status between two devices.
 *
 * READ and WRITE capabilities are answered from the table filled during
 * initialization; any other capability index is queried from NVML under the
 * global lock.
 *
 * @return scclSuccess on success, scclInvalidArgument when a handle is not in
 *         the cached table (READ/WRITE only), scclSystemError when the NVML
 *         call fails, or the initialization error.
 */
scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    const bool wantsRead  = (p2pIndex == NVML_P2P_CAPS_INDEX_READ);
    const bool wantsWrite = (p2pIndex == NVML_P2P_CAPS_INDEX_WRITE);
    if(!wantsRead && !wantsWrite) {
        // Not cached: ask NVML directly.
        std::lock_guard<std::mutex> locked(lock);
        NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
        return scclSuccess;
    }

    // Scan the whole table; when a handle appears more than once the last match is kept.
    int first = -1, second = -1;
    for(int d = 0; d < scclNvmlDeviceCount; d++) {
        const nvmlDevice_t candidate = scclNvmlDevices[d].handle;
        if(device1 == candidate)
            first = d;
        if(device2 == candidate)
            second = d;
    }
    if(first == -1 || second == -1)
        return scclInvalidArgument;

    const scclNvmlDevicePairInfo& pairInfo = scclNvmlDevicePairs[first][second];
    *p2pStatus = wantsRead ? pairInfo.p2pStatusRead : pairInfo.p2pStatusWrite;
    return scclSuccess;
}
// Forwards to nvmlDeviceGetFieldValues under the global lock, using NVMLTRY
// (a missing symbol yields scclInternalError, a failing call scclSystemError).
scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) {
SCCLCHECK(scclNvmlEnsureInitialized());
std::lock_guard<std::mutex> locked(lock);
NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
return scclSuccess;
}
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_NVMLWRAP_H_
#define SCCL_NVMLWRAP_H_
#include "check.h"
namespace sccl {
namespace hardware {
namespace topology {
// SCCL_NVML_DIRECT selects how NVML is reached: when non-zero, nvml.h is included;
// when zero (the default set here), the declarations extracted below are used instead.
// #define SCCL_NVML_DIRECT 1
#ifndef SCCL_NVML_DIRECT
#define SCCL_NVML_DIRECT 0
#endif
#if SCCL_NVML_DIRECT
#include "nvml.h"
#else
// Dynamically handle dependencies on NVML
/* Extracted from nvml.h */
// Opaque device handle: a pointer to a struct that is only declared here, never defined.
typedef struct nvmlDevice_st* nvmlDevice_t;
// Size of the busId character array in nvmlPciInfo_t.
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
// Enabled/disabled state of a feature (used for the isActive output of scclNvmlDeviceGetNvLinkState).
typedef enum nvmlEnableState_enum {
NVML_FEATURE_DISABLED = 0, //!< Feature disabled
NVML_FEATURE_ENABLED = 1 //!< Feature enabled
} nvmlEnableState_t;
// NVLink capabilities that can be queried per link (see scclNvmlDeviceGetNvLinkCapability).
typedef enum nvmlNvLinkCapability_enum {
NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
NVML_NVLINK_CAP_SYSMEM_ATOMICS = 3, // System memory atomics are supported
NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
// should be last
NVML_NVLINK_CAP_COUNT
} nvmlNvLinkCapability_t;
// Return codes of the NVML entry points; NVML_SUCCESS (0) is the only success value.
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0, //!< The operation was successful
NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
} nvmlReturn_t;
// PCI information about a device (output of scclNvmlDeviceGetNvLinkRemotePciInfo).
typedef struct nvmlPciInfo_st {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff
unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
unsigned int device; //!< The device's id on the bus, 0 to 31
unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
// Added in NVML 2.285 API
unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
// NVIDIA reserved for internal use only
unsigned int reserved0;
unsigned int reserved1;
unsigned int reserved2;
unsigned int reserved3;
} nvmlPciInfo_t;
/* P2P Capability Index Status*/
typedef enum nvmlGpuP2PStatus_enum {
NVML_P2P_STATUS_OK = 0,
NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
NVML_P2P_STATUS_DISABLED_BY_REGKEY,
NVML_P2P_STATUS_NOT_SUPPORTED,
NVML_P2P_STATUS_UNKNOWN
} nvmlGpuP2PStatus_t;
/* P2P Capability Index: selects which capability a P2P status query is about. */
typedef enum nvmlGpuP2PCapsIndex_enum {
NVML_P2P_CAPS_INDEX_READ = 0,
NVML_P2P_CAPS_INDEX_WRITE,
NVML_P2P_CAPS_INDEX_NVLINK,
NVML_P2P_CAPS_INDEX_ATOMICS,
NVML_P2P_CAPS_INDEX_PROP,
NVML_P2P_CAPS_INDEX_UNKNOWN
} nvmlGpuP2PCapsIndex_t;
/**
 * Represents the type for sample value returned.
 * Stored in nvmlFieldValue_t::valueType to describe the nvmlValue_t held in the same struct.
 */
typedef enum nvmlValueType_enum {
NVML_VALUE_TYPE_DOUBLE = 0,
NVML_VALUE_TYPE_UNSIGNED_INT = 1,
NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
// Keep this last
NVML_VALUE_TYPE_COUNT
} nvmlValueType_t;
/**
 * Union to represent different types of Value.
 * One member per nvmlValueType_t constant (other than the trailing count).
 */
typedef union nvmlValue_st {
double dVal; //!< If the value is double
unsigned int uiVal; //!< If the value is unsigned int
unsigned long ulVal; //!< If the value is unsigned long
unsigned long long ullVal; //!< If the value is unsigned long long
signed long long sllVal; //!< If the value is signed long long
} nvmlValue_t;
/**
 * Field Identifiers.
 *
 * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
 * Only the subset of identifiers listed here is declared.
 */
/* NVLink Speed */
#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links
#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device
/**
 * Remote device NVLink ID
 *
 * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
 */
#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID
/**
 * NVSwitch: connected NVLink count
 */
#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch
/* NOTE(review): presumably the NVLink speed / state / version field IDs — confirm against nvml.h. */
#define NVML_FI_DEV_NVLINK_GET_SPEED 164
#define NVML_FI_DEV_NVLINK_GET_STATE 165
#define NVML_FI_DEV_NVLINK_GET_VERSION 166
#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
/**
 * Information for a Field Value Sample.
 * Element type of the array passed to scclNvmlDeviceGetFieldValues.
 */
typedef struct nvmlFieldValue_st {
unsigned int
fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId
//!< can represent linkId.
long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970
long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by
//!< the same driver call.
nvmlValueType_t valueType; //!< Type of the value stored in value
nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn !=
//!< NVML_SUCCESS
nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
} nvmlFieldValue_t;
/* End of nvml.h */
#endif // SCCL_NVML_DIRECT
// Capacity of the global device tables below.
constexpr int scclNvmlMaxDevices = 32;
// Per-device data cached at initialization: NVML handle and compute capability.
struct scclNvmlDeviceInfo {
nvmlDevice_t handle;
int computeCapabilityMajor, computeCapabilityMinor;
};
// Per-device-pair data cached at initialization: read and write P2P status.
struct scclNvmlDevicePairInfo {
nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite;
};
// Global tables; entries at index >= scclNvmlDeviceCount are not filled by initialization.
extern int scclNvmlDeviceCount;
extern scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
extern scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];
// All scclNvmlFoo() functions call scclNvmlEnsureInitialized() implicitly.
// Outsiders need only call it if they want to inspect the scclNvml global
// tables above.
scclResult_t scclNvmlEnsureInitialized();
// Wrappers around the NVML entry points of the same name (without the "scclNvml" prefix);
// each returns a scclResult_t instead of an nvmlReturn_t.
scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device);
scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive);
scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci);
scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult);
scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values);
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif // End include guard
#include "rocm_smi_wrap.h"
namespace sccl {
// Evaluates a ROCm SMI call; on any status other than RSMI_STATUS_SUCCESS it logs the
// status string and makes the enclosing function return scclInternalError.
// Note: the log text says "init failure" whichever call is wrapped, and err is not
// initialized before rsmi_status_string() is asked to fill it.
#define ROCMSMICHECK(cmd) \
do { \
rsmi_status_t ret = cmd; \
if(ret != RSMI_STATUS_SUCCESS) { \
const char* err; \
rsmi_status_string(ret, &err); \
WARN("ROCm SMI init failure %s", err); \
return scclInternalError; \
} \
} while(false)
/**
 * Initialize the ROCm SMI library and log its version.
 *
 * @return scclSuccess on success; scclInternalError if rsmi_init() or
 *         rsmi_version_get() fails (through ROCMSMICHECK).
 * @note The library version is written to the log at INFO level.
 */
scclResult_t rocm_smi_init() {
ROCMSMICHECK(rsmi_init(0));
rsmi_version_t version;
ROCMSMICHECK(rsmi_version_get(&version));
INFO(SCCL_LOG_TOPO, "rocm_smi_lib: version %d.%d.%d.%s", version.major, version.minor, version.patch, version.build);
return scclSuccess;
}
/**
 * Get the number of devices reported by rsmi_num_monitor_devices().
 *
 * @param num_devs Output parameter receiving the device count.
 * @return scclSuccess on success; scclInternalError if the ROCm SMI call fails.
 */
scclResult_t rocm_smi_getNumDevice(uint32_t* num_devs) {
ROCMSMICHECK(rsmi_num_monitor_devices(num_devs));
return scclSuccess;
}
/**
 * Format the PCI bus id of a device as "dddd:bb:dd.f" (hexadecimal).
 *
 * @param deviceIndex ROCm SMI device index.
 * @param busId       Output buffer receiving the NUL-terminated string.
 * @param len         Size of busId in bytes; the output is truncated to fit.
 * @return scclSuccess on success; scclInternalError if the ROCm SMI query fails.
 */
scclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* busId, size_t len) {
    uint64_t id;
    ROCMSMICHECK(rsmi_dev_pci_id_get(deviceIndex, &id));
    /** rocm_smi's bus ID format
     *  | Name     | Field   |
     *  ---------- | ------- |
     *  | Domain   | [64:32] |
     *  | Reserved | [31:16] |
     *  | Bus      | [15: 8] |
     *  | Device   | [ 7: 3] |
     *  | Function | [ 2: 0] |
     **/
    // Write into the caller's buffer (bounded by len); the arguments are cast so
    // that they match the %llx conversions on every platform.
    snprintf(busId,
             len,
             "%04llx:%02llx:%02llx.%01llx",
             (unsigned long long)(id >> 32),
             (unsigned long long)((id & 0xff00) >> 8),
             (unsigned long long)((id & 0xf8) >> 3),
             (unsigned long long)(id & 0x7));
    return scclSuccess;
}
/**
 * Find the ROCm SMI device index that corresponds to a PCI bus ID string.
 *
 * @param pciBusId    PCI bus ID string, parsed with busIdToInt64.
 * @param deviceIndex [out] receives the matching device index.
 * @return scclSuccess when a device with the same bus ID is found; the error
 *         from busIdToInt64 if the string cannot be parsed; scclInternalError
 *         when a ROCm SMI query fails or no device matches.
 *
 * Fix: the result of busIdToInt64 used to be ignored, so a parse failure
 * compared devices against an uninitialized value.
 */
scclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) {
    int64_t busid = 0;
    scclResult_t parseRes = busIdToInt64(pciBusId, &busid);
    if(parseRes != scclSuccess) {
        WARN("rocm_smi_lib: could not parse PCI bus id %s", pciBusId);
        return parseRes;
    }
    /** convert to rocm_smi's bus ID format
     * | Name     | Field   |
     * ---------- | ------- |
     * | Domain   | [64:32] |
     * | Reserved | [31:16] |
     * | Bus      | [15: 8] |
     * | Device   | [ 7: 3] |
     * | Function | [ 2: 0] |
     **/
    busid = ((busid & 0xffff00000L) << 12) + ((busid & 0xff000L) >> 4) + ((busid & 0xff0L) >> 1) + (busid & 0x7L);
    // rsmi_dev_pci_id_get reports an unsigned value; compare like with like.
    const uint64_t wanted = (uint64_t)busid;

    uint32_t num_devs = 0;
    ROCMSMICHECK(rsmi_num_monitor_devices(&num_devs));
    for(uint32_t i = 0; i < num_devs; i++) {
        uint64_t bdfid;
        ROCMSMICHECK(rsmi_dev_pci_id_get(i, &bdfid));
        if(bdfid == wanted) {
            *deviceIndex = i;
            return scclSuccess;
        }
    }

    WARN("rocm_smi_lib: %s device index not found", pciBusId);
    return scclInternalError;
}
/**
 * Query the link between two ROCm devices.
 *
 * @param srcIndex  source device index
 * @param dstIndex  destination device index
 * @param rsmi_type [out] link type reported by rsmi_topo_get_link_type
 * @param hops      [out] 2 by default; 1 for an XGMI link whose weight is 15
 * @param count     [out] 1 by default; for such an XGMI link, max_bw / min_bw
 *                  when both bandwidths are non-zero
 * @return scclSuccess on success; scclInternalError (through ROCMSMICHECK)
 *         when a ROCm SMI call fails.
 *
 * @note The bandwidth query is only issued when the ROCm SMI major version
 *       is at least 2.
 */
scclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE* rsmi_type, int* hops, int* count) {
    uint64_t linkHops, linkWeight;
    ROCMSMICHECK(rsmi_topo_get_link_type(srcIndex, dstIndex, &linkHops, rsmi_type));
    ROCMSMICHECK(rsmi_topo_get_link_weight(srcIndex, dstIndex, &linkWeight));

    // Defaults for everything that is not an XGMI link of weight 15.
    *hops  = 2;
    *count = 1;

    const bool xgmiWeight15 = (*rsmi_type == RSMI_IOLINK_TYPE_XGMI) && (linkWeight == 15);
    if(!xgmiWeight15) {
        return scclSuccess;
    }

    *hops = 1;

    uint64_t min_bw = 0, max_bw = 0;
    rsmi_version_t smiVersion;
    ROCMSMICHECK(rsmi_version_get(&smiVersion));
    if(smiVersion.major >= 2) {
        ROCMSMICHECK(rsmi_minmax_bandwidth_get(srcIndex, dstIndex, &min_bw, &max_bw));
    }
    if(max_bw && min_bw) {
        *count = max_bw / min_bw;
    }
    INFO(SCCL_LOG_GRAPH, "rocm smi srcIndex:%d dstIndex:%d min_bw:%ld max_bw:%ld count:%d", srcIndex, dstIndex, min_bw, max_bw, *count);

    return scclSuccess;
}
} // namespace sccl
/*
Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ROCM_SMI_WRAP_H_
#define ROCM_SMI_WRAP_H_
#include "rocm_smi/rocm_smi.h"
#ifdef USE_ROCM_SMI64CONFIG
#include "rocm_smi/rocm_smi64Config.h"
#endif
#include "base.h"
namespace sccl {
// Initialize the ROCm SMI library.
scclResult_t rocm_smi_init();
// Get the number of devices.
scclResult_t rocm_smi_getNumDevice(uint32_t* num_devs);
// Get a device's PCI bus ID string from its device index.
scclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
// Get the device index from a PCI bus ID string.
scclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
// Get the link information between two devices: link type, hop count and link count.
scclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int* hops, int* count);
} // namespace sccl
#endif
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <string.h>
#include <unistd.h>
#include <algorithm>
#include "topo.h"
#include "utils.h"
#include "cpuset.h"
#include "nvmlwrap.h"
// #include "net.h"
// #include "graph.h"
// #include "comm.h"
// #include "net.h"
// #include "coll_net.h"
// #include "cpuset.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
// Display names for node types; scclTopoPrintRec indexes this table with node->type.
const char* topoNodeTypeStr[] = {"GPU", "PCI", "NVS", "CPU", "NIC", "NET"};
// Display names for link types; scclTopoPrintRec indexes this table with link->type.
// Empty strings are slots without a name.
const char* topoLinkTypeStr[] = {"LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET"};
// Display names for path types.
// NOTE(review): presumably indexed by the PATH_* constants; not used in this part of the file — confirm.
const char* topoPathTypeStr[] = {"LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS"};
namespace topo_basic {
// Maps the "class" / "tclass" strings found in the topology XML to node types
// (looked up with kvConvertToInt). The NULL-keyed entry ends the table and
// carries the fallback value.
struct kvDict kvDictPciClass[] = {{"0x060400", PCI},
{"0x068000", NVS},
{"0x068001", CPU},
{"0x03", GPU},
{"0x02", NIC},
{"0x120000", GPU},
{"0x0b4000", GPU},
{NULL, PCI /* Default fallback value */}};
// Maps the "link_speed" strings found in the topology XML to a per-lane speed
// (looked up with kvConvertToInt). Both the short and the "... PCIe" spellings
// are listed; the NULL-keyed entry ends the table and carries the fallback value.
struct kvDict kvDictPciGen[] = {{"2.5 GT/s", 15},
{"5 GT/s", 30},
{"8 GT/s", 60},
{"16 GT/s", 120},
{"32 GT/s", 240}, /* Kernel 5.6 and earlier */
{"2.5 GT/s PCIe", 15},
{"5.0 GT/s PCIe", 30},
{"8.0 GT/s PCIe", 60},
{"16.0 GT/s PCIe", 120},
{"32.0 GT/s PCIe", 240},
{"64.0 GT/s PCIe", 480},
{NULL, 60 /* Default fallback */}}; // x100 Mbps per lane
// Parameter TopoDumpFileRank ("TOPO_DUMP_FILE_RANK"): rank used for the topology dump file, default 0.
SCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
// Parameter IgnoreCpuAffinity ("IGNORE_CPU_AFFINITY"): whether to ignore the CPU affinity, default 0 (do not ignore).
SCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
//////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Create a NET node from a "net" XML element and link it to its NIC.
 *
 * Reads the attributes dev, guid, speed, latency, port, gdr and coll. The node
 * id is the "dev" value; when no guid is present the asic field falls back to
 * "dev" as well. A speed of 0 or less is replaced by 10000 Mbps.
 *
 * @param xmlNet XML element describing the network device
 * @param system topology the node is added to
 * @param nic    NIC node the new NET node is connected to (both directions)
 * @param busId  bus id stored in the NET node
 * @return scclSuccess, or the first error reported by a checked call
 */
scclResult_t scclTopoAddNet(struct scclXmlNode* xmlNet, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
    int devId;
    SCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &devId));

    struct scclTopoNode* netNode;
    SCCLCHECK(scclTopoCreateNode(system, &netNode, NET, devId));

    const char* guidStr;
    SCCLCHECK(xmlGetAttr(xmlNet, "guid", &guidStr));
    if(guidStr == NULL) {
        netNode->net.asic = devId;
    } else {
        sscanf(guidStr, "0x%lx", &netNode->net.asic);
    }

    int speedMbps;
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &speedMbps, 0));
    if(speedMbps <= 0) {
        // Some NICs define speed = -1
        speedMbps = 10000;
    }
    netNode->net.bw = speedMbps / 8000.0;

    // A missing/invalid latency attribute is not an error; default to 0.
    if(xmlGetAttrFloat(xmlNet, "latency", &netNode->net.latency) != scclSuccess) {
        netNode->net.latency = 0;
    }

    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &netNode->net.port, 0));
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &netNode->net.gdrSupport, 0));
    // The "maxconn" attribute (net.maxChannels) is currently not read.
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &netNode->net.collSupport, 0));
    netNode->net.busId = busId;

    SCCLCHECK(scclTopoConnectNodes(nic, netNode, LINK_NET, netNode->net.bw));
    SCCLCHECK(scclTopoConnectNodes(netNode, nic, LINK_NET, netNode->net.bw));
    return scclSuccess;
}
/**
 * Add every "net" child of a NIC XML element to the topology.
 *
 * Children with another name, or without a "dev" attribute (attribute index
 * -1), are skipped.
 *
 * @param xmlNic XML element of the NIC
 * @param system topology being built
 * @param nic    NIC node the NET nodes are attached to
 * @param busId  bus id forwarded to scclTopoAddNet
 * @return scclSuccess, or the first error reported by a checked call
 */
scclResult_t scclTopoAddNic(struct scclXmlNode* xmlNic, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
    for(int subIdx = 0; subIdx < xmlNic->nSubs; subIdx++) {
        struct scclXmlNode* child = xmlNic->subs[subIdx];
        if(strcmp(child->name, "net") == 0) {
            int devAttrIndex;
            SCCLCHECK(xmlGetAttrIndex(child, "dev", &devAttrIndex));
            if(devAttrIndex != -1) {
                SCCLCHECK(scclTopoAddNet(child, system, nic, busId));
            }
        }
    }
    return scclSuccess;
}
/**
 * @brief Fill a GPU topology node from its XML element.
 *
 * Attributes read from the XML element:
 * - "sm"   -> gpu.cudaCompCap
 * - "gcn"  -> converted with convertGcnArchToGcnArchName, then duplicated into gpu.gcn
 * - "arch" -> gpu.arch (copied out of a scclHipDeviceArch_t)
 * - "rank" -> gpu.rank
 * - "dev"  -> gpu.dev
 * - "gdr"  -> gpu.gdrSupport
 *
 * @param xmlGpu XML element holding the GPU attributes
 * @param system topology the GPU belongs to (not used by this function)
 * @param gpu    GPU node to fill
 * @return scclResult_t scclSuccess, or the first error reported by a checked call
 *
 * @note Only the basic attributes are handled here; the GPU-to-GPU links are
 *       added in a second pass.
 */
scclResult_t scclTopoAddGpu(struct scclXmlNode* xmlGpu, struct scclTopoSystem* system, struct scclTopoNode* gpu) {
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));

    const char* rawGcn;
    SCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &rawGcn));
    const char* gcnName;
    convertGcnArchToGcnArchName(rawGcn, &gcnName);
    gpu->gpu.gcn = strdup(gcnName);

    scclHipDeviceArch_t archBits;
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &archBits.value));
    memcpy(&gpu->gpu.arch, &archBits.arch, sizeof(hipDeviceArch_t));

    SCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));

    // Stop here: the links between GPUs are added in a second pass.
    return scclSuccess;
}
/**
 * @brief Add a PCI device to the topology.
 *
 * Parses a "pci" XML element and creates the matching topology node:
 * - with a "gpu" child: a GPU node filled by scclTopoAddGpu. If that child has
 *   no "rank" attribute the function returns scclSuccess without adding anything.
 * - with a "nic" child: a NIC node. The low 4 bits of the bus id are cleared
 *   so that multi-port NICs merge into one node; an already existing NIC node
 *   is reused and not connected to the parent again.
 * - otherwise, when the class maps to PCI: a PCI node whose pci.device packs
 *   vendor / device / subsystem_vendor / subsystem_device into one 64-bit
 *   value (16 bits each, vendor highest); child elements are added recursively.
 *
 * A newly created node is connected to @p parent in both directions with a
 * LINK_PCI link whose bandwidth is link_width * link_speed / 80.0.
 *
 * Fix: the ID fields are now shifted as uint64_t. Shifting the signed result
 * of strtol left by 48 overflows for vendor IDs >= 0x8000 (e.g. 0x8086).
 *
 * @param xmlPci XML element describing the PCI device
 * @param system topology being built
 * @param parent node the new node is attached to
 * @return scclResult_t scclSuccess, or the first error reported by a checked call
 */
scclResult_t scclTopoAddPci(struct scclXmlNode* xmlPci, struct scclTopoSystem* system, struct scclTopoNode* parent) {
    const char* str;
    int type;
    SCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
    SCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));

    int64_t busId;
    SCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
    SCCLCHECK(busIdToInt64(str, &busId));

    struct scclTopoNode* node = NULL;

    struct scclXmlNode* xmlGpu = NULL;
    SCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
    if(xmlGpu != NULL) {
        type = GPU;
        int index;
        SCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
        if(index == -1)
            return scclSuccess;
        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
        SCCLCHECK(scclTopoAddGpu(xmlGpu, system, node));
    }

    struct scclXmlNode* xmlNic = NULL;
    SCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
    if(xmlNic != NULL) {
        type = NIC;
        // Ignore sub device ID and merge multi-port NICs into one PCI device.
        busId &= 0xfffffffffffffff0;
        struct scclTopoNode* nicNode = NULL;
        SCCLCHECK(scclTopoGetNode(system, &nicNode, type, busId));
        if(nicNode == NULL) {
            SCCLCHECK(scclTopoCreateNode(system, &nicNode, type, busId));
            node = nicNode; // Connect it to parent later on
        }
        SCCLCHECK(scclTopoAddNic(xmlNic, system, nicNode, busId));
    } else if(type == PCI) {
        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
        // Pack the four 16-bit IDs into pci.device; convert to uint64_t before
        // shifting so that large IDs cannot overflow a signed long.
        SCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
        if(str)
            node->pci.device += static_cast<uint64_t>(strtol(str, NULL, 0)) << 48;
        SCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
        if(str)
            node->pci.device += static_cast<uint64_t>(strtol(str, NULL, 0)) << 32;
        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
        if(str)
            node->pci.device += static_cast<uint64_t>(strtol(str, NULL, 0)) << 16;
        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
        if(str)
            node->pci.device += static_cast<uint64_t>(strtol(str, NULL, 0));

        for(int s = 0; s < xmlPci->nSubs; s++) {
            struct scclXmlNode* xmlSubPci = xmlPci->subs[s];
            SCCLCHECK(scclTopoAddPci(xmlSubPci, system, node));
        }
    }

    if(node) {
        int width, speed;
        SCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
        SCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
        // Manage cases where speed was not indicated in /sys
        if(width == 0)
            width = 16;
        SCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
        SCCLCHECK(scclTopoConnectNodes(node, parent, LINK_PCI, width * speed / 80.0));
        SCCLCHECK(scclTopoConnectNodes(parent, node, LINK_PCI, width * speed / 80.0));
    }
    return scclSuccess;
}
// Maps the CPU "arch" attribute of the topology XML to SCCL_TOPO_CPU_ARCH_* values
// (looked up with kvConvertToInt in scclTopoAddCpu). The NULL-keyed entry ends the table.
struct kvDict kvDictCpuArch[] = {{"x86_64", SCCL_TOPO_CPU_ARCH_X86}, {"arm64", SCCL_TOPO_CPU_ARCH_ARM}, {"ppc64", SCCL_TOPO_CPU_ARCH_POWER}, {NULL, 0}};
// Maps the CPU "vendor" attribute of the topology XML to SCCL_TOPO_CPU_VENDOR_* values.
// Two different vendor strings map to ZHAOXIN. The NULL-keyed entry ends the table.
struct kvDict kvDictCpuVendor[] = {{"GenuineIntel", SCCL_TOPO_CPU_VENDOR_INTEL},
{"AuthenticAMD", SCCL_TOPO_CPU_VENDOR_AMD},
{"CentaurHauls", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
{" Shanghai ", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
{NULL, 0}};
/**
 * @brief Add a CPU (NUMA node) and everything below it to the topology.
 *
 * Creates a CPU node whose id is the "numaid" attribute, then reads the
 * optional "affinity" and the "arch" attribute. For x86 CPUs the vendor is
 * read as well, and for Intel, Zhaoxin and AMD the model is derived from
 * "familyid" / "modelid". Finally every "pci" child is added through
 * scclTopoAddPci and every "nic" child is attached to a NIC node with id 0,
 * which is created and linked to the CPU on first use.
 *
 * @param xmlCpu XML element describing the CPU
 * @param system topology being built
 * @return scclResult_t scclSuccess, or the first error reported by a checked call
 */
scclResult_t scclTopoAddCpu(struct scclXmlNode* xmlCpu, struct scclTopoSystem* system) {
    // The NUMA id doubles as the node id.
    int numaId;
    SCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
    struct scclTopoNode* cpu;
    SCCLCHECK(scclTopoCreateNode(system, &cpu, CPU, numaId));

    // Optional affinity mask.
    const char* attr;
    SCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &attr));
    if(attr != NULL) {
        SCCLCHECK(scclStrToCpuset(attr, &cpu->cpu.affinity));
    }

    // Architecture; vendor and model are only resolved for x86.
    SCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &attr));
    SCCLCHECK(kvConvertToInt(attr, &cpu->cpu.arch, kvDictCpuArch));
    if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86) {
        SCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &attr));
        SCCLCHECK(kvConvertToInt(attr, &cpu->cpu.vendor, kvDictCpuVendor));

        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
            int family, model;
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
            const bool isSkylakeOrNewer = (family == 6 && model >= 0x55);
            if(isSkylakeOrNewer) {
                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_SKL;
            } else {
                cpu->cpu.model = SCCL_TOPO_CPU_INTEL_BDW;
            }
        } else if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
            int family, model;
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
            if(family == 7 && model == 0x5B) {
                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_YONGFENG;
            }
        }

        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_AMD) {
            int family, model;
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &family));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &model));
            // Treat "Milan" also as "Rome"
            const bool isRomeClass = (family == 143 && model >= 49) || family == 175;
            if(isRomeClass) {
                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_ROME;
            } else {
                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_ZEN;
            }
        }
    }

    // Walk the children of the CPU element.
    for(int subIdx = 0; subIdx < xmlCpu->nSubs; subIdx++) {
        struct scclXmlNode* child = xmlCpu->subs[subIdx];

        if(strcmp(child->name, "pci") == 0) {
            SCCLCHECK(scclTopoAddPci(child, system, cpu));
        }

        if(strcmp(child->name, "nic") == 0) {
            // NICs listed directly under the CPU share one NIC node with id 0.
            struct scclTopoNode* nicNode = NULL;
            SCCLCHECK(scclTopoGetNode(system, &nicNode, NIC, 0));
            if(nicNode == NULL) {
                SCCLCHECK(scclTopoCreateNode(system, &nicNode, NIC, 0));
                SCCLCHECK(scclTopoConnectNodes(cpu, nicNode, LINK_PCI, LOC_BW));
                SCCLCHECK(scclTopoConnectNodes(nicNode, cpu, LINK_PCI, LOC_BW));
            }
            SCCLCHECK(scclTopoAddNic(child, system, nicNode, 0));
        }
    }
    return scclSuccess;
}
// scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
// char* str = path + offset;
// // Remove trailing "/"
// if(*str == '/')
// str--;
// // Find next /
// while(*str != '/')
// str--;
// str++;
// int64_t numid;
// SCCLCHECK(busIdToInt64(str, &numid));
// // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
// numid -= numid & 0xf;
// *id = numid;
// return scclSuccess;
// }
/**
 * Recursive worker for findLocalCpu.
 *
 * Follows LINK_PCI links depth-first starting at @p node, never stepping back
 * to @p from (the node the search just came from).
 *
 * @param node node to search from
 * @param from node the search arrived from, or NULL at the starting node
 * @param cpu  [out] first CPU node reached, or NULL if none was found
 * @return scclSuccess, or an error propagated from a nested call
 */
static scclResult_t findLocalCpuFrom(struct scclTopoNode* node, struct scclTopoNode* from, struct scclTopoNode** cpu) {
    *cpu = NULL;
    if(node->type == CPU) {
        *cpu = node;
        return scclSuccess;
    }
    for(int l = 0; l < node->nlinks; l++) {
        struct scclTopoLink* link = node->links + l;
        // PCI links exist in both directions; skip the one leading back to
        // where we came from, otherwise two neighbours could call each other
        // forever.
        if(link->type != LINK_PCI || link->remNode == from)
            continue;
        SCCLCHECK(findLocalCpuFrom(link->remNode, node, cpu));
        if(*cpu != NULL)
            return scclSuccess;
    }
    return scclSuccess;
}

/**
 * Find the CPU node reachable from @p node through PCI links.
 *
 * Fix: the search no longer walks a PCI link straight back to the node it
 * came from, which could recurse without bound when that back link was listed
 * before the uplink.
 *
 * @param node starting node
 * @param cpu  [out] the CPU node found, or NULL when there is none
 * @return scclSuccess, or an error propagated from the recursive search
 */
static scclResult_t findLocalCpu(struct scclTopoNode* node, struct scclTopoNode** cpu) {
    return findLocalCpuFrom(node, NULL, cpu);
}
/**
 * Select the bandwidth used for links between CPU nodes.
 *
 * POWER uses P9_BW and ARM uses ARM_BW. On x86, Intel uses SKL_QPI_BW for the
 * SKL model and QPI_BW otherwise; Zhaoxin uses YONGFENG_ZPI_BW for the
 * YONGFENG model and ZPI_BW otherwise. Everything else keeps LOC_BW.
 *
 * @param cpu CPU node whose arch / vendor / model are inspected
 * @param bw  [out] selected bandwidth
 * @return always scclSuccess
 */
static scclResult_t scclTopoGetInterCpuBw(struct scclTopoNode* cpu, float* bw) {
    const int cpuArch = cpu->cpu.arch;

    *bw = LOC_BW;

    if(cpuArch == SCCL_TOPO_CPU_ARCH_POWER) {
        *bw = P9_BW;
        return scclSuccess;
    }
    if(cpuArch == SCCL_TOPO_CPU_ARCH_ARM) {
        *bw = ARM_BW;
        return scclSuccess;
    }

    if(cpuArch == SCCL_TOPO_CPU_ARCH_X86) {
        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
            if(cpu->cpu.model == SCCL_TOPO_CPU_TYPE_SKL) {
                *bw = SKL_QPI_BW;
            } else {
                *bw = QPI_BW;
            }
        }
        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
            if(cpu->cpu.model == SCCL_TOPO_CPU_TYPE_YONGFENG) {
                *bw = YONGFENG_ZPI_BW;
            } else {
                *bw = ZPI_BW;
            }
        }
    }
    return scclSuccess;
}
// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch as this extra level can break the search and make
// SCCL take wrong topology decisions.
//
// For every PCI node whose packed device id (see scclTopoAddPci) matches
// 0x1000c0101000a with the low 12 bits ignored, the sub switches carrying the
// same device id are merged into it: the parent's links to them are dropped,
// their other links are moved to the parent, the remote ends are re-pointed
// to the parent, and the sub switch nodes are removed from the system.
//
// NOTE(review): assuming SCCLCHECK returns on error, the early returns inside
// the matching branch skip free(subSwIds) — confirm whether that leak matters.
// NOTE(review): the inner loop variable `s` shadows the outer one, and the
// restart `s = 0` is followed by the outer loop's s++, so scanning resumes at
// index 1 — confirm this is intended.
// NOTE(review): pciSwitch is computed before nodes are removed; if a removed
// node sits at a lower index, the array shift would leave it pointing at a
// different node — confirm this cannot happen.
scclResult_t scclTopoFlattenBcmSwitches(struct scclTopoSystem* system) {
for(int s = 0; s < system->nodes[PCI].count; s++) {
struct scclTopoNode* pciSwitch = system->nodes[PCI].nodes + s;
uint64_t device = pciSwitch->pci.device;
// Only flatten PEX Gen 4 switches in base mode
if((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
// Find sub switches with the same device ID.
int64_t* subSwIds;
SCCLCHECK(scclCalloc(&subSwIds, pciSwitch->nlinks));
int subs = 0;
for(int l = 0; l < pciSwitch->nlinks; l++) {
struct scclTopoNode* sub = pciSwitch->links[l].remNode;
// Only fuse sub switches with the same device ID.
if(sub->type != PCI || sub->pci.device != device)
continue;
// Save sub switch for later
subSwIds[subs++] = sub->id;
// Remove link to that sub switch
memmove(pciSwitch->links + l, pciSwitch->links + l + 1, (pciSwitch->nlinks - l - 1) * (sizeof(struct scclTopoLink)));
pciSwitch->nlinks--;
// Don't increase l for the next iteration as we just shifted all links by one.
l--;
}
for(int s = 0; s < subs; s++) {
// Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
int index;
SCCLCHECK(scclTopoIdToIndex(system, PCI, subSwIds[s], &index));
struct scclTopoNode* sub = system->nodes[PCI].nodes + index;
// Connect all sub PCI devices to the parent switch
for(int l = 0; l < sub->nlinks; l++) {
struct scclTopoNode* remNode = sub->links[l].remNode;
if(remNode == pciSwitch)
continue;
// Add link from parent PCI switch -> PCI device
memcpy(pciSwitch->links + pciSwitch->nlinks, sub->links + l, sizeof(struct scclTopoLink));
pciSwitch->nlinks++;
// Update link from PCI device -> parent PCI switch
for(int rl = 0; rl < remNode->nlinks; rl++) {
if(remNode->links[rl].remNode == sub) {
remNode->links[rl].remNode = pciSwitch;
break;
}
}
}
SCCLCHECK(scclTopoRemoveNode(system, PCI, index));
}
// Set subdevice to 0x0000 to make sure we don't merge this switch again.
pciSwitch->pci.device = 0x1000c01010000000;
free(subSwIds);
// Restart, as system->nodes[PCI].nodes has changed.
s = 0;
}
}
return scclSuccess;
}
/**
 * Connect every CPU node to every other CPU node with a LINK_SYS link.
 *
 * The bandwidth of each link is taken from scclTopoGetInterCpuBw for the
 * source CPU.
 *
 * @param system topology whose CPU nodes are connected
 * @return scclSuccess, or the first error reported by a checked call
 */
scclResult_t scclTopoConnectCpus(struct scclTopoSystem* system) {
    for(int src = 0; src < system->nodes[CPU].count; src++) {
        struct scclTopoNode* srcCpu = system->nodes[CPU].nodes + src;
        for(int dst = 0; dst < system->nodes[CPU].count; dst++) {
            if(src != dst) {
                float linkBw;
                SCCLCHECK(scclTopoGetInterCpuBw(srcCpu, &linkBw));
                SCCLCHECK(scclTopoConnectNodes(srcCpu, system->nodes[CPU].nodes + dst, LINK_SYS, linkBw));
            }
        }
    }
    return scclSuccess;
}
// Reorder the links of `node` so that the link towards `upNode` (the node the
// traversal came from) is the last one, then recurse into every PCI link that
// does not lead back to upNode. With upNode == NULL nothing is moved and only
// the recursion happens.
// NOTE(review): the first while loop assumes a link to upNode exists, and the
// second relies on the slot after the last link having remNode == NULL —
// confirm both hold for all callers.
static scclResult_t scclTopoSort(struct scclTopoNode* node, struct scclTopoNode* upNode) {
// If there is an upper node, reorder this node's links so that the link to the upper node comes last
if(upNode) {
int l = 0;
// Find the link pointing to upNode
while(node->links[l].remNode != upNode)
l++;
struct scclTopoLink upLink;
// Save the link that was found in upLink
memcpy(&upLink, node->links + l, sizeof(struct scclTopoLink));
// Shift the following links one slot to the left until the end of the link list is reached
while(node->links[l + 1].remNode) {
memcpy(node->links + l, node->links + l + 1, sizeof(struct scclTopoLink));
l++;
}
// Put upLink at the end of the link list
memcpy(node->links + l, &upLink, sizeof(struct scclTopoLink));
}
// Recursively sort the PCI tree
for(int l = 0; l < node->nlinks; l++) {
struct scclTopoLink* link = node->links + l;
// Recurse when the link is a PCI link and the remote node is not the upper node
if(link->type == LINK_PCI && link->remNode != upNode)
SCCLCHECK(scclTopoSort(link->remNode, node));
}
return scclSuccess;
}
// We want the graph to be organized to ease/accelerate traversal :
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
// 4. SYS (already the case)
//
// Runs scclTopoSort on every CPU node, using each one as the root of its PCI
// tree (no upper node). Returns scclSuccess or the first error reported.
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system) {
    for(int cpuIdx = 0; cpuIdx < system->nodes[CPU].count; cpuIdx++) {
        struct scclTopoNode* root = system->nodes[CPU].nodes + cpuIdx;
        SCCLCHECK(scclTopoSort(root, NULL));
    }
    return scclSuccess;
}
/**
 * Pick the per-link XGMI bandwidth constant for a GPU architecture name.
 *
 * @param gcn architecture name, matched with IsArchMatch
 * @return MI200_XGMI_WIDTH for "gfx90a", GFX94X_XGMI_WIDTH for "gfx94",
 *         VEGA_XGMI_WIDTH for everything else
 */
float scclTopoXGMISpeed(const char* gcn) {
    if(IsArchMatch(gcn, "gfx90a")) {
        return MI200_XGMI_WIDTH;
    }
    if(IsArchMatch(gcn, "gfx94")) {
        return GFX94X_XGMI_WIDTH;
    }
    return VEGA_XGMI_WIDTH;
}
/**
 * @brief Add the XGMI links described in the XML tree to the topology.
 *
 * For an "xgmi" element, the GPU identified by @p parentBusId is connected
 * with a LINK_NVL link to the target named by the element: another GPU
 * ("target" bus id), the local CPU (found with findLocalCpu) or the NVS node,
 * which is created on first use. For any other element the function recurses
 * into the children, passing the element's own "busid" when it has one and
 * @p parentBusId otherwise.
 *
 * @param node        XML element to process
 * @param system      topology holding the nodes
 * @param parentBusId PCI bus id string of the closest enclosing element
 *
 * @return scclResult_t
 *         - scclSuccess on success
 *         - scclInternalError when the GPU for parentBusId cannot be found
 *         - any error propagated from a checked call
 *
 * @note 1. The link bandwidth is "count" times scclTopoXGMISpeed of the GPU.
 *       2. The reverse link is only added when the remote node is not a GPU.
 *       3. No link is added when the remote node could not be resolved.
 */
scclResult_t scclTopoAddXGMI(struct scclXmlNode* node, struct scclTopoSystem* system, const char* parentBusId) {
    if(strcmp(node->name, "xgmi") != 0) {
        // Not a link element: descend, carrying along the closest bus id.
        const char* ownBusId;
        SCCLCHECK(xmlGetAttr(node, "busid", &ownBusId));
        const char* busIdForSubs = ownBusId ? ownBusId : parentBusId;
        for(int subIdx = 0; subIdx < node->nSubs; subIdx++) {
            SCCLCHECK(scclTopoAddXGMI(node->subs[subIdx], system, busIdForSubs));
        }
        return scclSuccess;
    }

    // Resolve the GPU owning this link.
    int64_t gpuBusId;
    SCCLCHECK(busIdToInt64(parentBusId, &gpuBusId));
    struct scclTopoNode* gpu = NULL;
    SCCLCHECK(scclTopoGetNode(system, &gpu, GPU, gpuBusId));
    if(gpu == NULL) {
        WARN("Add XGMI error : could not find GPU %lx\n", gpuBusId);
        return scclInternalError;
    }

    int linkCount;
    SCCLCHECK(xmlGetAttrInt(node, "count", &linkCount));
    const char* targetClass;
    SCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
    int targetType;
    SCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));

    // Resolve the remote end of the link.
    struct scclTopoNode* remote = NULL;
    if(targetType == GPU) {
        // P2P connection to another GPU
        const char* targetBusIdStr;
        SCCLCHECK(xmlGetAttrStr(node, "target", &targetBusIdStr));
        int64_t targetBusId;
        SCCLCHECK(busIdToInt64(targetBusIdStr, &targetBusId));
        SCCLCHECK(scclTopoGetNode(system, &remote, GPU, targetBusId));
    } else if(targetType == CPU) {
        // Connection to the local CPU
        SCCLCHECK(findLocalCpu(gpu, &remote));
    } else if(system->nodes[NVS].count == 0) {
        SCCLCHECK(scclTopoCreateNode(system, &remote, NVS, 0));
    } else {
        remote = system->nodes[NVS].nodes;
    }

    if(remote == NULL) {
        return scclSuccess;
    }

    const float perLinkBw = scclTopoXGMISpeed(gpu->gpu.gcn);
    SCCLCHECK(scclTopoConnectNodes(gpu, remote, LINK_NVL, linkCount * perLinkBw));
    if(remote->type != GPU) {
        SCCLCHECK(scclTopoConnectNodes(remote, gpu, LINK_NVL, linkCount * perLinkBw));
    }
    return scclSuccess;
}
/**
 * @brief Compute the bit mask of the NET nodes that are local to a GPU.
 *
 * Scans all NET nodes and keeps those whose path to GPU index @p g has the
 * highest bandwidth and, among equal bandwidths, the lowest path type. The
 * ids of the retained NET nodes are returned as bits of a 64-bit mask.
 *
 * Fix: the temporary id array is now released on the error path as well
 * (it used to leak when an id was >= 64), and negative ids are rejected
 * instead of being used as a shift count.
 *
 * @param system       topology to inspect
 * @param g            index of the GPU in system->nodes[GPU]
 * @param localNetMask [out] mask with one bit per retained NET id
 * @param type         [out] optional; receives the best path type, written
 *                     only when a better path than the initial state is found
 * @return scclResult_t scclSuccess, scclInternalError when a retained NET id
 *         does not fit into the mask, or an allocation error
 */
static scclResult_t getLocalNetMask(struct scclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
    int minType = PATH_DIS;
    float maxBw = 0;
    int count   = 0;
    int* nets;
    SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
    for(int n = 0; n < system->nodes[NET].count; n++) {
        struct scclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU] + g;
        // A strictly better path restarts the list of retained nets.
        if(path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
            maxBw   = path->bw;
            minType = path->type;
            if(type)
                *type = minType;
            count = 0;
        }
        if(path->bw == maxBw && path->type == minType)
            nets[count++] = system->nodes[NET].nodes[n].id;
    }

    scclResult_t result = scclSuccess;
    *localNetMask       = 0ULL;
    for(int n = 0; n < count; n++) {
        // Only ids 0..63 can be represented in the mask.
        if(nets[n] < 0 || nets[n] >= 64) {
            result = scclInternalError;
            break;
        }
        *localNetMask |= 1ULL << nets[n];
    }
    free(nets);
    return result;
}
// Log `node` and, recursively, the PCI subtree below it.
// `line` is a shared text buffer and `offset` the column at which this node's
// description starts. After the node itself is logged, the first `offset`
// characters are blanked so that the following lines appear indented.
// LINK_LOC links are skipped, as is the PCI link leading back to prevNode.
// PCI links are followed recursively; for other links only the remote node is
// printed (NET nodes with asic/port/bw).
// NOTE(review): all writes use sprintf without a bound; the caller in this
// file passes a 1024-byte buffer — confirm that is enough for deep trees.
static scclResult_t scclTopoPrintRec(struct scclTopoNode* node, struct scclTopoNode* prevNode, char* line, int offset) {
if(node->type == GPU) {
sprintf(line + offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
} else if(node->type == CPU) {
sprintf(line + offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else if(node->type == PCI) {
sprintf(line + offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
} else {
sprintf(line + offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
}
INFO(SCCL_LOG_TOPO, "%s", line);
// Blank the prefix so that child lines are indented under this node.
for(int i = 0; i < offset; i++)
line[i] = ' ';
for(int l = 0; l < node->nlinks; l++) {
struct scclTopoLink* link = node->links + l;
if(link->type == LINK_LOC)
continue;
// Print every link except the PCI link going back up to prevNode.
if(link->type != LINK_PCI || link->remNode != prevNode) {
sprintf(line + offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
int nextOffset = strlen(line);
if(link->type == LINK_PCI) {
SCCLCHECK(scclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
if(link->remNode->type == NET) {
sprintf(line + nextOffset,
"%s/%lX (%lx/%d/%f)",
topoNodeTypeStr[link->remNode->type],
link->remNode->id,
link->remNode->net.asic,
link->remNode->net.port,
link->remNode->net.bw);
} else {
sprintf(line + nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
}
INFO(SCCL_LOG_TOPO, "%s", line);
}
}
}
return scclSuccess;
}
} // namespace topo_basic
////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Check whether /sys/bus/pci/drivers contains an entry whose name starts
 * with "hsw".
 *
 * @return true when such an entry exists; false when none exists or the
 *         directory cannot be opened
 */
bool isHswDriverExist() {
    DIR* driversDir = opendir("/sys/bus/pci/drivers");
    if(driversDir == nullptr) {
        return false;
    }

    bool hswFound = false;
    struct dirent* dirEntry;
    // Stop reading as soon as a matching entry has been seen. "." and ".."
    // can never match the three-character prefix.
    while(!hswFound && (dirEntry = readdir(driversDir)) != nullptr) {
        hswFound = (strncmp(dirEntry->d_name, "hsw", 3) == 0);
    }

    closedir(driversDir);
    return hswFound;
}
/**
 * Count the entries of /sys/class/infiniband whose name starts with "mlx5".
 *
 * @return number of matching entries; 0 when the directory cannot be opened
 */
int getIBNum() {
    DIR* ibDir = opendir("/sys/class/infiniband");
    if(ibDir == nullptr) {
        return 0;
    }

    int mlx5Count = 0;
    for(struct dirent* dirEntry = readdir(ibDir); dirEntry != nullptr; dirEntry = readdir(ibDir)) {
        const char* devName   = dirEntry->d_name;
        const bool isDotEntry = (strcmp(devName, ".") == 0) || (strcmp(devName, "..") == 0);
        if(!isDotEntry && strncmp(devName, "mlx5", 4) == 0) {
            mlx5Count++;
        }
    }

    closedir(ibDir);
    return mlx5Count;
}
/**
 * Select the NET id a rank should use for a channel.
 *
 * The local-net mask is computed for every GPU. The GPUs sharing the mask of
 * @p rank take turns over the nets of that mask: walking the set bits in
 * increasing order (wrapping at 64), each bit is handed to the next GPU of
 * the group, and the channel counter advances each time the group has been
 * served once. The bit reached for (this GPU, @p channelId) is returned.
 *
 * @param system    topology to inspect
 * @param rank      rank of the GPU
 * @param channelId channel to select a net for
 * @param id        [out] selected NET id
 * @return scclSuccess; scclInternalError when the GPU has no local net; or
 *         the first error reported by a checked call
 */
scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id) {
    uint64_t* masks;
    int gpuCount = system->nodes[GPU].count;
    SCCLCHECK(scclCalloc(&masks, gpuCount));

    // Local-net mask of every GPU.
    for(int gIdx = 0; gIdx < gpuCount; gIdx++) {
        SCCLCHECK(topo_basic::getLocalNetMask(system, gIdx, masks + gIdx, NULL));
    }

    // GPUs with the same mask as ours share the same local nets; remember our
    // position among them.
    int myGpu;
    SCCLCHECK(scclTopoRankToIndex(system, rank, &myGpu));
    int groupSize = 0, myPosition = 0;
    for(int gIdx = 0; gIdx < gpuCount; gIdx++) {
        if(masks[gIdx] != masks[myGpu]) {
            continue;
        }
        if(gIdx == myGpu) {
            myPosition = groupSize;
        }
        groupSize++;
    }

    const uint64_t myMask = masks[myGpu];
    free(masks);
    if(myMask == 0) {
        return scclInternalError;
    }

    // Round robin on GPUs and channels
    int position = 0, channel = 0;
    for(int bit = 0;; bit = (bit + 1) % 64) {
        if(!((1ULL << bit) & myMask)) {
            continue;
        }
        if(position == myPosition && channel == channelId) {
            *id = bit;
            return scclSuccess;
        }
        position++;
        if(position == groupSize) {
            position = 0;
            channel++;
        }
    }
}
/**
 * Select the GPU that is paired with a given NET id.
 *
 * Among the GPUs whose local-net mask contains @p net, those with the lowest
 * path type are kept; they must all share the same mask. Walking the set bits
 * of that mask in increasing order, the kept GPUs are assigned in turn and
 * the one assigned to @p net is returned.
 *
 * @param system   topology to inspect
 * @param net      NET id
 * @param gpuIndex [out] index of the selected GPU, or -1 when no GPU has
 *                 @p net in its local-net mask
 * @return scclSuccess; scclInternalError when GPUs of the same path type have
 *         different masks; or the first error reported by a checked call
 */
scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex) {
    int gpuCount = system->nodes[GPU].count;
    int* candidates;
    SCCLCHECK(scclCalloc(&candidates, gpuCount));

    // Find localNetMask which includes net with the most local GPUs.
    int numCandidates = 0, bestType = PATH_DIS;
    uint64_t bestMask = 0ULL;
    for(int gIdx = 0; gIdx < gpuCount; gIdx++) {
        int pathType = PATH_DIS;
        uint64_t gpuMask;
        SCCLCHECK(topo_basic::getLocalNetMask(system, gIdx, &gpuMask, &pathType));
        if(!((1ULL << net) & gpuMask)) {
            continue;
        }

        // A closer GPU restarts the candidate list.
        if(pathType < bestType) {
            bestMask      = gpuMask;
            numCandidates = 0;
            bestType      = pathType;
        }
        if(pathType != bestType) {
            continue;
        }

        if(bestMask && gpuMask != bestMask) {
            WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n",
                 gIdx,
                 candidates[numCandidates - 1],
                 bestType,
                 net,
                 gpuMask,
                 bestMask);
            free(candidates);
            return scclInternalError;
        }
        candidates[numCandidates] = gIdx;
        numCandidates++;
    }

    if(bestMask == 0ULL) {
        *gpuIndex = -1;
        free(candidates);
        return scclSuccess;
    }

    // Round robin on GPUs and channels
    int position = 0;
    for(int bit = 0;; bit = (bit + 1) % 64) {
        if(!((1ULL << bit) & bestMask)) {
            continue;
        }
        if(bit == net) {
            *gpuIndex = candidates[position];
            free(candidates);
            return scclSuccess;
        }
        position++;
        if(position == numCandidates) {
            position = 0;
        }
    }
}
/**
 * Report architecture, vendor and model of the first CPU node.
 *
 * @param system topology to inspect
 * @param arch   [out] cpu.arch of CPU node 0
 * @param vendor [out] cpu.vendor of CPU node 0
 * @param model  [out] cpu.model of CPU node 0
 * @return always scclSuccess
 */
scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model) {
    struct scclTopoNode* firstCpu = &system->nodes[CPU].nodes[0];

    *arch   = firstCpu->cpu.arch;
    *vendor = firstCpu->cpu.vendor;
    *model  = firstCpu->cpu.model;
    return scclSuccess;
}
/**
 * Compute the CPU affinity set to use for the GPU of a rank.
 *
 * Finds the GPU with the given rank and the CPU node with the fewest hops to
 * it. The result is that CPU node's affinity set, intersected with the
 * affinity of the calling process unless the IgnoreCpuAffinity parameter is
 * set. The chosen set is logged only when it is not empty.
 *
 * @param system   topology to inspect
 * @param rank     rank of the GPU
 * @param affinity [out] resulting CPU set
 * @return scclSuccess; scclInternalError when no GPU/CPU is found for the
 *         rank; or an error from a checked call
 */
scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity) {
    struct scclTopoNode* gpuNode = NULL;
    struct scclTopoNode* cpuNode = NULL;

    for(int gIdx = 0; gIdx < system->nodes[GPU].count; gIdx++) {
        struct scclTopoNode* candidate = system->nodes[GPU].nodes + gIdx;
        if(candidate->gpu.rank != rank) {
            continue;
        }
        gpuNode = candidate;

        // Find closer CPU
        int closestCpu = -1, fewestHops = 0;
        for(int cIdx = 0; cIdx < system->nodes[CPU].count; cIdx++) {
            int hopCount = candidate->paths[CPU][cIdx].count;
            if(closestCpu == -1 || hopCount < fewestHops) {
                closestCpu = cIdx;
                fewestHops = hopCount;
            }
        }
        cpuNode = system->nodes[CPU].nodes + closestCpu;
    }

    if(cpuNode == NULL) {
        WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
        return scclInternalError;
    }

    // Query the CPU affinity set we were provided
    cpu_set_t processMask;
    SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &processMask), "sched_getaffinity");

    // Get the affinity of the CPU close to our GPU.
    cpu_set_t nearCpuMask = cpuNode->cpu.affinity;

    cpu_set_t chosenMask;
    if(topo_basic::scclParamIgnoreCpuAffinity()) {
        // Ignore the CPU affinity set and use the GPU one instead
        chosenMask = nearCpuMask;
    } else {
        // Use a subset of the GPU affinity set
        CPU_AND(&chosenMask, &processMask, &nearCpuMask);
    }
    memcpy(affinity, &chosenMask, sizeof(cpu_set_t));

    // If there is a non empty set, use it to set affinity
    if(CPU_COUNT(&chosenMask)) {
        char affinityStr[sizeof(cpu_set_t) * 2];
        SCCLCHECK(scclCpusetToStr(&chosenMask, affinityStr));
        INFO(SCCL_LOG_TOPO, "Setting affinity for GPU %d to %s", gpuNode->gpu.dev, affinityStr);
    }
    return scclSuccess;
}
/**
 * Report the number of GPU nodes in the topology.
 *
 * @param system topology to inspect
 * @param count  [out] number of GPU nodes
 * @return always scclSuccess
 */
scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count) {
    const int gpuNodes = system->nodes[GPU].count;
    *count             = gpuNodes;
    return scclSuccess;
}
/**
 * Report the number of NET nodes in the topology.
 *
 * @param system topology to inspect
 * @param count  [out] number of NET nodes
 * @return always scclSuccess
 */
scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count) {
    const int netNodes = system->nodes[NET].count;
    *count             = netNodes;
    return scclSuccess;
}
/**
 * Report the number of NVS nodes in the topology.
 *
 * @param system topology to inspect
 * @param count  [out] number of NVS nodes
 * @return always scclSuccess
 */
scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count) {
    const int nvsNodes = system->nodes[NVS].count;
    *count             = nvsNodes;
    return scclSuccess;
}
/**
 * Translate a rank into the index of its GPU node.
 *
 * @param system    topology to inspect
 * @param rank      rank to look up
 * @param localRank [out] index of the first GPU node whose gpu.rank equals
 *                  @p rank; left untouched when none matches
 * @return scclSuccess when found, scclInternalError otherwise
 */
scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank) {
    int gIdx = 0;
    while(gIdx < system->nodes[GPU].count) {
        if(system->nodes[GPU].nodes[gIdx].gpu.rank == rank) {
            *localRank = gIdx;
            return scclSuccess;
        }
        gIdx++;
    }

    WARN("Could not find local GPU with rank %d", rank);
    return scclInternalError;
}
/**
 * Log the whole topology, one tree per CPU node.
 *
 * A header with maxBw and totalBw is logged first, then every CPU node is
 * printed through scclTopoPrintRec using a 1024-byte line buffer, followed by
 * a closing separator.
 *
 * @param s topology to print
 * @return scclSuccess, or the first error reported while printing
 */
scclResult_t scclTopoPrint(struct scclTopoSystem* s) {
    INFO(SCCL_LOG_TOPO, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw);

    char line[1024];
    for(int cpuIdx = 0; cpuIdx < s->nodes[CPU].count; cpuIdx++) {
        struct scclTopoNode* root = s->nodes[CPU].nodes + cpuIdx;
        SCCLCHECK(topo_basic::scclTopoPrintRec(root, NULL, line, 0));
    }

    INFO(SCCL_LOG_TOPO, "==========================================");
    return scclSuccess;
}
/**
 * Look up a node by type and id.
 *
 * @param system topology to search
 * @param node   [out] set to the first node of @p type whose id equals @p id;
 *               left untouched when there is none, so callers initialise it
 *               to NULL beforehand
 * @param type   node type to search in
 * @param id     node id to look for
 * @return always scclSuccess
 */
scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
    const int nodeCount = system->nodes[type].count;
    int idx             = 0;
    while(idx < nodeCount && system->nodes[type].nodes[idx].id != id) {
        idx++;
    }
    if(idx < nodeCount) {
        *node = system->nodes[type].nodes + idx;
    }
    return scclSuccess;
}
/**
 * Append a node of the given type to the topology and initialise it.
 *
 * GPU nodes get a LINK_LOC link to themselves and undefined dev / rank /
 * cudaCompCap; CPU nodes get undefined arch / vendor / model; NET nodes get
 * zeroed asic / bw / latency and an undefined port.
 *
 * @param system topology to extend
 * @param node   [out] the new node; untouched on failure
 * @param type   node type
 * @param id     node id
 * @return scclSuccess, or scclInternalError when SCCL_TOPO_MAX_NODES nodes of
 *         this type already exist
 */
scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
    const int usedSlots = system->nodes[type].count;
    if(usedSlots == SCCL_TOPO_MAX_NODES) {
        WARN("Error : tried to create too many nodes of type %d", type);
        return scclInternalError;
    }

    // Take the next free slot.
    struct scclTopoNode* created = system->nodes[type].nodes + usedSlots;
    system->nodes[type].count    = usedSlots + 1;
    created->type                = type;
    created->id                  = id;

    if(type == GPU) {
        // Create link to itself (used in some corner cases)
        struct scclTopoLink* selfLink = created->links;
        created->nlinks               = 1;
        selfLink->type                = LINK_LOC;
        selfLink->remNode             = created;
        selfLink->bw                  = LOC_BW;
        created->gpu.dev              = SCCL_TOPO_UNDEF;
        created->gpu.rank             = SCCL_TOPO_UNDEF;
        created->gpu.cudaCompCap      = SCCL_TOPO_UNDEF;
    } else if(type == CPU) {
        created->cpu.arch   = SCCL_TOPO_UNDEF;
        created->cpu.vendor = SCCL_TOPO_UNDEF;
        created->cpu.model  = SCCL_TOPO_UNDEF;
    } else if(type == NET) {
        created->net.asic    = 0ULL;
        created->net.port    = SCCL_TOPO_UNDEF;
        created->net.bw      = 0.0;
        created->net.latency = 0.0;
    }

    *node = created;
    return scclSuccess;
}
/**
 * Remove the node of the given type at the given index from the topology.
 *
 * @param system topology to modify
 * @param type   type of the node to remove
 * @param index  index of the node within system->nodes[type]
 * @return scclResult_t always scclSuccess
 *
 * The function:
 * 1. frees the path arrays of the removed node
 * 2. drops every link of other nodes that points to the removed node, and
 *    moves link pointers to later nodes of the same type back by one slot
 * 3. shifts the remaining nodes of that type down by one slot
 * 4. decrements the node count of that type
 */
scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int index) {
struct scclTopoNode* delNode = system->nodes[type].nodes + index;
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
free(delNode->paths[t]);
for(int n = 0; n < system->nodes[t].count; n++) {
struct scclTopoNode* node = system->nodes[t].nodes + n;
if(node == delNode)
continue;
for(int l = 0; l < node->nlinks; l++) {
// Remove (possibly several consecutive) links that point to the deleted node.
while(l < node->nlinks && node->links[l].remNode == delNode) {
memmove(node->links + l, node->links + l + 1, (node->nlinks - l - 1) * sizeof(struct scclTopoLink));
node->nlinks--;
}
// Nodes stored after delNode move down one slot in the memmove below,
// so pointers to them are adjusted here.
if(l < node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
node->links[l].remNode--;
}
}
}
}
// Close the gap left by the removed node.
memmove(delNode, delNode + 1, (system->nodes[type].count - index - 1) * sizeof(struct scclTopoNode));
system->nodes[type].count--;
return scclSuccess;
}
/**
 * Add a link from node to remNode, or add bandwidth to an existing one.
 *
 * Links with the same remote node and the same type are aggregated into a
 * single entry whose bandwidth is the sum of all contributions. After the
 * update the entry is moved towards the front so that the link array stays
 * sorted by descending bandwidth.
 *
 * @param node    Node that owns the link array being updated
 * @param remNode Remote end of the link
 * @param type    Link type
 * @param bw      Bandwidth to add to the link
 * @return scclSuccess, or scclInternalError when a new link is needed but
 *         every slot of node->links is already in use
 */
scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw) {
    const int maxLinks = (int)(sizeof(node->links) / sizeof(node->links[0]));
    // Find the matching link, or the first free slot (remNode == NULL).
    // The scan is bounded so that a full link array is never overrun.
    struct scclTopoLink* link;
    for(link = node->links; (int)(link - node->links) != maxLinks && link->remNode; link++) {
        if(link->remNode == remNode && link->type == type)
            break;
    }
    if((int)(link - node->links) == maxLinks) {
        WARN("Error : too many topo links (max %d)", maxLinks);
        return scclInternalError;
    }
    if(link->remNode == NULL)
        node->nlinks++;
    link->type    = type;
    link->remNode = remNode;
    link->bw += bw;
    // Sort links in BW descending order: shift lower-bandwidth predecessors
    // back by one slot and drop the updated link into the gap.
    struct scclTopoLink linkSave;
    memcpy(&linkSave, link, sizeof(struct scclTopoLink));
    while(link != node->links) {
        if((link - 1)->bw >= linkSave.bw)
            break;
        memcpy(link, link - 1, sizeof(struct scclTopoLink));
        link--;
    }
    memcpy(link, &linkSave, sizeof(struct scclTopoLink));
    return scclSuccess;
}
/**
 * Build a topology system from a parsed XML description.
 *
 * @param xml        Parsed XML; must contain a "system" tag
 * @param topoSystem Output; receives a newly allocated topology system
 * @return scclSuccess on success. scclInternalError when the XML has no
 *         "system" tag (the allocation is released and *topoSystem is set to
 *         NULL in that case). Errors from the helper calls are propagated.
 */
scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem) {
    SCCLCHECK(scclCalloc(topoSystem, 1));
    struct scclXmlNode* topNode;
    SCCLCHECK(xmlFindTag(xml, "system", &topNode));
    // xmlFindTag reports success with a NULL node when the tag is missing.
    if(topNode == NULL) {
        WARN("Topology XML does not contain a <system> tag");
        free(*topoSystem);
        *topoSystem = NULL;
        return scclInternalError;
    }
    // Diagnostic output goes through the logger rather than stdout.
    INFO(SCCL_LOG_TOPO, "topNode->nSubs=%d", topNode->nSubs);
    // Every "cpu" child of the top tag is added to the system.
    for(int s = 0; s < topNode->nSubs; s++) {
        struct scclXmlNode* node = topNode->subs[s];
        if(strcmp(node->name, "cpu") == 0)
            SCCLCHECK(topo_basic::scclTopoAddCpu(node, *topoSystem));
    }
    SCCLCHECK(topo_basic::scclTopoAddXGMI(topNode, *topoSystem, NULL));
    SCCLCHECK(topo_basic::scclTopoFlattenBcmSwitches(*topoSystem));
    SCCLCHECK(topo_basic::scclTopoConnectCpus(*topoSystem));
    SCCLCHECK(topo_basic::scclTopoSortSystem(*topoSystem));
    return scclSuccess;
}
/**
 * Report the range of compute capabilities over all GPU nodes.
 *
 * @param system Topology system to inspect
 * @param ccMin  Optional output; receives the smallest cudaCompCap (may be NULL)
 * @param ccMax  Optional output; receives the largest cudaCompCap (may be NULL)
 * @return scclSuccess, or scclInternalError when the system has no GPU node
 */
scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax) {
    if(system->nodes[GPU].count == 0)
        return scclInternalError;
    int lowest  = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
    int highest = lowest;
    for(int g = 1; g < system->nodes[GPU].count; g++) {
        const int cc = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
        if(cc < lowest)
            lowest = cc;
        if(cc > highest)
            highest = cc;
    }
    if(ccMin != NULL)
        *ccMin = lowest;
    if(ccMax != NULL)
        *ccMax = highest;
    return scclSuccess;
}
/**
 * Translate a node id into its position in the node array of its type.
 *
 * @param system Topology system to search
 * @param type   Node type, used as index into system->nodes
 * @param id     Id to look for
 * @param index  Output; position of the first matching node, or -1
 * @return scclSuccess when a node matches, scclInternalError otherwise
 */
scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index) {
    *index  = -1;
    int pos = 0;
    while(pos < system->nodes[type].count) {
        if(system->nodes[type].nodes[pos].id == id)
            break;
        pos++;
    }
    if(pos >= system->nodes[type].count)
        return scclInternalError;
    *index = pos;
    return scclSuccess;
}
/**
 * Translate a rank into the position of its GPU node.
 *
 * @param system Topology system to search
 * @param rank   Rank compared against gpu.rank of every GPU node
 * @param index  Output; position of the first matching GPU node, or -1
 * @return scclSuccess when a GPU matches, scclInternalError otherwise
 */
scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index) {
    *index  = -1;
    int pos = 0;
    while(pos < system->nodes[GPU].count) {
        if(system->nodes[GPU].nodes[pos].gpu.rank == rank)
            break;
        pos++;
    }
    if(pos >= system->nodes[GPU].count)
        return scclInternalError;
    *index = pos;
    return scclSuccess;
}
/**
 * Translate a device number into the rank stored on its GPU node.
 *
 * @param system Topology system to search
 * @param dev    Device number compared against gpu.dev of every GPU node
 * @param rank   Output; gpu.rank of the first matching GPU node, or -1
 * @return scclSuccess when a GPU matches, scclInternalError otherwise
 */
scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank) {
    *rank   = -1;
    int pos = 0;
    while(pos < system->nodes[GPU].count) {
        if(system->nodes[GPU].nodes[pos].gpu.dev == dev)
            break;
        pos++;
    }
    if(pos >= system->nodes[GPU].count)
        return scclInternalError;
    *rank = system->nodes[GPU].nodes[pos].gpu.rank;
    return scclSuccess;
}
/**
* @brief 获取系统拓扑结构
*
* 该函数用于获取系统的拓扑结构信息,包括GPU和NIC设备。
* 首先尝试从环境变量SCCL_TOPO_FILE指定的XML文件加载拓扑,
* 若未指定则尝试加载默认拓扑文件(根据IB设备数量选择不同文件)。
* 自动检测本地GPU和NIC设备信息并填充到拓扑结构中。
*
* @param comm 通信上下文指针
* @param system 输出参数,返回创建的拓扑系统指针
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
// scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system) {
// struct scclXml* xml;
// SCCLCHECK(scclCalloc(&xml, 1));
// char* xmlTopoFile = getenv("SCCL_TOPO_FILE");
// if(xmlTopoFile) {
// INFO(SCCL_LOG_TOPO, "SCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
// SCCLCHECK(scclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
// } else {
// bool useDefaultTopo = true;
// bool HswExist = topo_basic::isHswDriverExist();
// if(HswExist == true) {
// char* rocmPath = getenv("ROCM_PATH");
// if(rocmPath != NULL) {
// ::std::string xmlPath;
// int IBNum = topo_basic::getIBNum();
// if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
// xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-BW-topo-input.xml";
// if(access(xmlPath.c_str(), F_OK) == 0) {
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// useDefaultTopo = false;
// }
// } else if(IBNum == 4 || IBNum == 5 || IBNum == 6) {
// xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-508-topo-input.xml";
// if(access(xmlPath.c_str(), F_OK) == 0) {
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// useDefaultTopo = false;
// }
// }
// }
// }
// if(useDefaultTopo) {
// INFO(SCCL_LOG_TOPO, "No default topo for now, please provide your own topo xml file");
// }
// }
// if(xml->maxIndex == 0) {
// // Create top tag
// struct scclXmlNode* top;
// SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
// SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
// }
// // Auto-detect GPUs if needed
// for(int r = 0; r < comm->nRanks; r++) {
// if(comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
// char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
// SCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
// struct scclXmlNode* node;
// SCCLCHECK(scclTopoFillGpu(xml, busId, &node));
// if(node == NULL)
// continue;
// SCCLCHECK(xmlSetAttrInt(node, "keep", 1));
// SCCLCHECK(xmlSetAttrInt(node, "rank", r));
// SCCLCHECK(topo_basic::xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
// }
// }
// // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// // so we start with collnet so that it has precedence.
// int netDevCount = 0;
// if(netDevCount == 0) {
// SCCLCHECK(comm->scclNet->devices(&netDevCount));
// }
// for(int n = 0; n < netDevCount; n++) {
// sccl::hardware::net::scclNetProperties_t props;
// SCCLCHECK(comm->scclNet->getProperties(n, &props));
// struct scclXmlNode* netNode;
// SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
// SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
// SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "speed", props.speed));
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "port", props.port));
// SCCLCHECK(topo_basic::xmlInitAttrFloat(netNode, "latency", props.latency));
// SCCLCHECK(topo_basic::xmlInitAttrUint64(netNode, "guid", props.guid));
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "maxconn", props.maxComms));
// bool gdrSupport =
// (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF));
// INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "gdr", gdrSupport));
// }
// // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
// SCCLCHECK(scclTopoTrimXml(xml));
// xmlTopoFile = getenv("SCCL_TOPO_DUMP_FILE");
// if(xmlTopoFile && comm->rank == topo_basic::scclParamTopoDumpFileRank()) {
// INFO(SCCL_LOG_TOPO, "SCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
// SCCLCHECK(scclTopoDumpXmlToFile(xmlTopoFile, xml));
// }
// SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
// free(xml);
// return scclSuccess;
// }
/**
 * Fill `xml` with the local topology description and convert it into a
 * topology system. Split out of scclTopoGetSystem so that the caller can
 * release `xml` on every exit path.
 */
static scclResult_t scclTopoGetSystemImpl(struct scclXml* xml, struct scclTopoSystem** system) {
    using namespace sccl;
    // With the Hsw driver present and 8, 9 or 10 IB devices, start from the built-in topology file.
    if(isHswDriverExist()) {
        const int IBNum = getIBNum();
        if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
            ::std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
            SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
        }
    }
    if(xml->maxIndex == 0) {
        // Nothing was loaded: create the top tag.
        struct scclXmlNode* top;
        SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
        SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
    }
    // GPU auto-detection is not performed here: the earlier implementation
    // needed per-rank peer information from a communicator, which this entry
    // point does not receive.

    // Auto-detect NICs through the IB network backend.
    auto scclNet = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB);
    if(scclNet == nullptr) {
        WARN("Error : failed to initialize the IB network backend");
        return scclInternalError;
    }
    int netDevCount = 0;
    SCCLCHECK(scclNet->devices(&netDevCount));
    for(int n = 0; n < netDevCount; n++) {
        sccl::hardware::net::scclNetProperties_t props;
        SCCLCHECK(scclNet->getProperties(n, &props));
        struct scclXmlNode* netNode;
        SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
        SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
        SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
        // The xmlInitAttr* calls only set attributes that are not present yet.
        SCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
        SCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
        SCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
        SCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
        SCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
        // NOTE(review): GDR is reported only when BOTH the CUDA and DMABUF pointer types are
        // supported; the earlier communicator-based version accepted CUDA alone. Confirm intent.
        bool gdrSupport = (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF);
        INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
        SCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
    }
    // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
    SCCLCHECK(scclTopoTrimXml(xml));
    SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
    return scclSuccess;
}

/**
 * Detect the local topology and build a topology system from it.
 *
 * A temporary XML description is allocated, filled from the built-in topology
 * file (when applicable) and from the NICs reported by the IB backend, trimmed
 * and converted. The temporary XML is released on success and on failure.
 *
 * @param system Output; receives the created topology system
 * @return scclSuccess on success, otherwise the first error reported by a helper
 */
scclResult_t scclTopoGetSystem(struct scclTopoSystem** system) {
    struct scclXml* xml;
    SCCLCHECK(scclCalloc(&xml, 1));
    const scclResult_t result = scclTopoGetSystemImpl(xml, system);
    free(xml);
    return result;
}
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef SCCL_TOPO_H_
#define SCCL_TOPO_H_
#include <string.h>
#include "base.h"
#include "archinfo.h"
#include "xml.h"
#include "net.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
// Sizing constants for the topology data structures.
// Number of node types (matches the number of topoNodeType values below).
#define SCCL_TOPO_NODE_TYPES 6
// Maximum number of nodes stored per node type.
static constexpr int SCCL_TOPO_MAX_NODES = 256;
// Maximum number of links stored per node.
#define SCCL_TOPO_MAX_LINKS 32
// Maximum number of hops in one path list.
#define SCCL_TOPO_MAX_HOPS (SCCL_TOPO_MAX_NODES * SCCL_TOPO_NODE_TYPES)
// Hardware topology node types; the values index scclTopoSystem::nodes.
typedef enum topoNodeType {
GPU = 0, // Graphics processing unit
PCI = 1, // Peripheral component interconnect
NVS = 2, // Original comment: non-volatile storage. NOTE(review): upstream NCCL uses NVS for NVSwitch - confirm
CPU = 3, // Central processing unit; actually a NUMA domain
NIC = 4, // Network interface controller
NET = 5 // Network
} topoNodeType_t;
// String table for node types; defined in another translation unit.
extern const char* topoNodeTypeStr[];
// Link types. The numeric values are chosen to line up with the topoPathType
// values wherever a corresponding path type exists.
typedef enum topoLinkType {
LINK_LOC = 0, // Local link
LINK_NVL = 1, // NVLink link
// Value 2 is reserved for path type PATH_NVB; no link type is defined
LINK_PCI = 3, // PCI link
// Value 4 is reserved for path type PATH_PXB; no link type is defined
// Value 5 is reserved for path type PATH_PXN; no link type is defined
// Value 6 is reserved for path type PATH_PHB; no link type is defined
LINK_SYS = 7, // System link
LINK_NET = 8 // Network link
} topoLinkType_t;
// String table for link types; defined in another translation unit.
extern const char* topoLinkTypeStr[];
// Path types describing how two nodes are connected.
enum topoPathType {
PATH_LOC = 0, // Local path
PATH_NVL = 1, // Connected through NVLink
PATH_NVB = 2, // Connected through NVLink via an intermediate GPU
PATH_PIX = 3, // Connected through at most one PCIe bridge
PATH_PXB = 4, // Connected through multiple PCIe bridges (without crossing the PCIe host bridge)
PATH_PXN = 5, // GPU and NIC connected through an intermediate GPU
PATH_PHB = 6, // Connected through PCIe and a PCIe host bridge
PATH_SYS = 7, // Connected through PCIe and the SMP interconnect between NUMA nodes
PATH_NET = 8, // Connected through the network
PATH_DIS = 9 // Disconnected
};
////////////////////////////////////////////////////////////////////////////////////////////////
// Forward declaration: links point at nodes and nodes embed links.
struct scclTopoNode;
// One directed link from the owning node to a remote node.
struct scclTopoLink {
int type; // Link type (topoLinkType value)
float bw; // Link bandwidth
struct scclTopoNode* remNode; // Remote end of the link
};
// A sequence of links with an overall type and bandwidth.
struct scclTopoLinkList {
int type; // Type of the list as a whole (presumably a topoPathType value - confirm)
float bw; // Bandwidth of the list as a whole
int count; // Number of entries used in list
struct scclTopoLink* list[SCCL_TOPO_MAX_HOPS]; // Links that make up the sequence
};
// One node of the topology graph.
struct scclTopoNode {
int type; // Node type
int64_t id; // Node id
// Type-specific data
union {
struct {
int dev; // NVML device number
int rank; // Rank
int cudaCompCap; // CUDA compute capability
int gdrSupport; // GDR support
const char* gcn; // GCN architecture name
hipDeviceArch_t arch; // HIP device architecture
} gpu; // GPU node
struct {
uint64_t asic; // ASIC identifier
int port; // Port number
float bw; // Bandwidth
float latency; // Latency
int gdrSupport; // GDR support
int collSupport; // Collective operation support
int maxChannels; // Maximum number of channels
int64_t busId; // Bus id
} net; // Network node
struct {
int arch; // Architecture
int vendor; // Vendor
int model; // Model
cpu_set_t affinity; // CPU affinity
} cpu; // CPU node
struct {
uint64_t device; // PCI device
} pci; // PCI node
};
int nlinks; // Number of links
struct scclTopoLink links[SCCL_TOPO_MAX_LINKS]; // Link array
// Pre-computed paths to GPUs and NICs, one list pointer per node type
struct scclTopoLinkList* paths[SCCL_TOPO_NODE_TYPES];
// Used during search
uint64_t used;
};
// All nodes of one type.
struct scclTopoNodeSet {
int count; // Number of nodes in use
struct scclTopoNode nodes[SCCL_TOPO_MAX_NODES]; // Node array; capacity is SCCL_TOPO_MAX_NODES
};
// The complete topology of one system.
struct scclTopoSystem {
struct scclTopoNodeSet nodes[SCCL_TOPO_NODE_TYPES]; // Node sets, one per node type
float maxBw; // Maximum bandwidth of the system
float baseBw; // Base bandwidth
float totalBw; // Total bandwidth of the system
int type; // System type
int nRanks; // Number of ranks in the system
int netGdrLevel; // Network GDR level
int tuning; // Tuning parameter
int pivotA2ANumBiRings; // Number of bidirectional rings in pivot A2A mode
bool pivotA2AEnabled; // Whether the pivot A2A communication mode is enabled
bool treeDefined; // Whether a tree structure is defined
bool ll128Enabled; // Whether LL128 mode is enabled
bool mscclEnabled; // Whether MSCCL mode is enabled
};
// Bandwidth constants used when building the topology.
// Units are not stated in this file; NET_BW pairs 12.0 with "100Gbit", which suggests GB/s - confirm.
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.0
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
#define ARM_BW 6.0
#define NET_BW 12.0 // 100Gbit
// XGMI width constants per GPU family.
#define VEGA_XGMI_WIDTH 24.0
#define MI200_XGMI_WIDTH 36.0
#define GFX94X_XGMI_WIDTH 48.0
// Intel CPUs convert GPU P2P traffic into 64-byte PCI TLPs, so GPU-to-GPU
// traffic consumes more PCI bandwidth. The argument is parenthesized so that
// compound expressions such as (a + b) are scaled as a whole.
#define INTEL_P2P_OVERHEAD(bw) ((bw) * 6 / 5)
// CPU architecture identifiers.
enum topoCpuArch {
SCCL_TOPO_CPU_ARCH_X86 = 1,
SCCL_TOPO_CPU_ARCH_POWER = 2,
SCCL_TOPO_CPU_ARCH_ARM = 3
};
// CPU vendor identifiers.
enum topoCpuVendor {
SCCL_TOPO_CPU_VENDOR_INTEL = 1,
SCCL_TOPO_CPU_VENDOR_AMD = 2,
SCCL_TOPO_CPU_VENDOR_ZHAOXIN = 3
};
// CPU model identifiers.
enum topoCpuType {
SCCL_TOPO_CPU_TYPE_BDW = 1,
SCCL_TOPO_CPU_TYPE_SKL = 2,
SCCL_TOPO_CPU_TYPE_ZEN = 3,
SCCL_TOPO_CPU_TYPE_ROME = 4,
SCCL_TOPO_CPU_TYPE_YONGFENG = 5
};
// Pattern identifiers.
// NOTE(review): despite the "Cpu" in the enum name, the values name tree/ring/NVLS
// patterns rather than CPU properties - confirm the intended name.
enum topoCpuPattern {
SCCL_TOPO_PATTERN_BALANCED_TREE = 1,
SCCL_TOPO_PATTERN_SPLIT_TREE = 2,
SCCL_TOPO_PATTERN_TREE = 3,
SCCL_TOPO_PATTERN_RING = 4,
SCCL_TOPO_PATTERN_NVLS = 5
};
// NOTE(review): this macro duplicates the constexpr SCCL_TOPO_MAX_NODES declared near the
// top of this header. From here on the preprocessor replaces the identifier with 256, so a
// later declaration of a variable with the same name would no longer compile.
#define SCCL_TOPO_MAX_NODES 256
// String table for path types; defined in another translation unit.
extern const char* topoPathTypeStr[];
// Intel CPU model identifiers; the values equal SCCL_TOPO_CPU_TYPE_BDW and SCCL_TOPO_CPU_TYPE_SKL.
#define SCCL_TOPO_CPU_INTEL_BDW 1
#define SCCL_TOPO_CPU_INTEL_SKL 2
// System type values. Apart from SCCL_TOPO_UNDEF the values are distinct powers of two,
// so they can be combined as bit flags (presumably in scclTopoSystem::type - confirm).
enum topoSysType {
SCCL_TOPO_UNDEF = -1,
SCCL_TOPO_CR8G = 1,
SCCL_TOPO_4P2H_ROME = 2,
SCCL_TOPO_GDR_ALL = 4,
SCCL_TOPO_16P1H = 8,
SCCL_TOPO_FORCE_INTRA = 16,
SCCL_TOPO_XGMI_ALL = 32
};
// struct scclTopoComm {
// int type;
// int id;
// int rank;
// int nRanks;
// int node;
// int nNodes;
// int localRank;
// int localRanks;
// bool dmaBufSupport;
// struct scclPeerInfo* peerInfo;
// sccl::hardware::net::scclNet_t* scclNet;
// };
////////////////////////////////////////////////////////////////////////////////////////////////
// Check whether the Hsw driver is present
bool isHswDriverExist();
// Get the number of InfiniBand (IB) devices
int getIBNum();
// Look up a topology node by type and id
scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id);
// Create a topology node
scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id);
// Remove a topology node
scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int id);
// Connect two topology nodes
scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw);
// Build the system topology from XML
scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem);
// Print the system topology
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Get the compute capability range
scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax);
// Convert an id into an index
scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index);
// Convert a rank into an index
scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index);
// Convert a device id into a rank
scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank);
// Get the XGMI speed
float scclTopoXGMISpeed(const char* gcn);
// Get local network information
scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// Get local GPU information
scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// Get CPU type information
scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// Look up the CPU affinity
scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// Get the number of GPUs
scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// Get the number of network interfaces
scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count);
// Get the number of NVS nodes (original comment: non-volatile storage; NOTE(review): upstream NCCL uses NVS for NVSwitch - confirm)
scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// Get the local rank
scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // Get the system topology
// scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system);
scclResult_t scclTopoGetSystem(struct scclTopoSystem** system);
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#pragma once
#include <string.h>
#include "base.h"
#include "archinfo.h"
#include "xml.h"
// #include "net.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
// Sizing constants for the topology data structures.
// Number of node types (matches the number of topoNodeType values below).
#define SCCL_TOPO_NODE_TYPES 6
// Maximum number of nodes stored per node type.
static constexpr int SCCL_TOPO_MAX_NODES = 256;
// Maximum number of links stored per node.
#define SCCL_TOPO_MAX_LINKS 32
// Maximum number of hops in one path list.
#define SCCL_TOPO_MAX_HOPS (SCCL_TOPO_MAX_NODES * SCCL_TOPO_NODE_TYPES)
// Hardware topology node types; the values index scclTopoSystem::nodes.
typedef enum topoNodeType {
GPU = 0, // Graphics processing unit
PCI = 1, // Peripheral component interconnect
NVS = 2, // Original comment: non-volatile storage. NOTE(review): upstream NCCL uses NVS for NVSwitch - confirm
CPU = 3, // Central processing unit; actually a NUMA domain
NIC = 4, // Network interface controller
NET = 5 // Network
} topoNodeType_t;
// String table for node types; defined in another translation unit.
extern const char* topoNodeTypeStr[];
// Link types. The numeric values are chosen to line up with the topoPathType
// values wherever a corresponding path type exists.
typedef enum topoLinkType {
LINK_LOC = 0, // Local link
LINK_NVL = 1, // NVLink link
// Value 2 is reserved for path type PATH_NVB; no link type is defined
LINK_PCI = 3, // PCI link
// Value 4 is reserved for path type PATH_PXB; no link type is defined
// Value 5 is reserved for path type PATH_PXN; no link type is defined
// Value 6 is reserved for path type PATH_PHB; no link type is defined
LINK_SYS = 7, // System link
LINK_NET = 8 // Network link
} topoLinkType_t;
// String table for link types; defined in another translation unit.
extern const char* topoLinkTypeStr[];
// Path types describing how two nodes are connected.
enum topoPathType {
PATH_LOC = 0, // Local path
PATH_NVL = 1, // Connected through NVLink
PATH_NVB = 2, // Connected through NVLink via an intermediate GPU
PATH_PIX = 3, // Connected through at most one PCIe bridge
PATH_PXB = 4, // Connected through multiple PCIe bridges (without crossing the PCIe host bridge)
PATH_PXN = 5, // GPU and NIC connected through an intermediate GPU
PATH_PHB = 6, // Connected through PCIe and a PCIe host bridge
PATH_SYS = 7, // Connected through PCIe and the SMP interconnect between NUMA nodes
PATH_NET = 8, // Connected through the network
PATH_DIS = 9 // Disconnected
};
////////////////////////////////////////////////////////////////////////////////////////////////
// Forward declaration: links point at nodes and nodes embed links.
struct scclTopoNode;
// One directed link from the owning node to a remote node.
struct scclTopoLink {
int type; // Link type (topoLinkType value)
float bw; // Link bandwidth
struct scclTopoNode* remNode; // Remote end of the link
};
// A sequence of links with an overall type and bandwidth.
struct scclTopoLinkList {
int type; // Type of the list as a whole (presumably a topoPathType value - confirm)
float bw; // Bandwidth of the list as a whole
int count; // Number of entries used in list
struct scclTopoLink* list[SCCL_TOPO_MAX_HOPS]; // Links that make up the sequence
};
// One node of the topology graph.
struct scclTopoNode {
int type; // Node type
int id; // Node id. NOTE(review): the lookup/creation functions take 64-bit ids - confirm that int is wide enough
// Type-specific data
union {
struct {
int dev; // NVML device number
int rank; // Rank
int cudaCompCap; // CUDA compute capability
int gdrSupport; // GDR support
const char* gcn; // GCN architecture name
hipDeviceArch_t arch; // HIP device architecture
} gpu; // GPU node
struct {
uint64_t asic; // ASIC identifier
int port; // Port number
float bw; // Bandwidth
float latency; // Latency
int gdrSupport; // GDR support
int collSupport; // Collective operation support
int maxChannels; // Maximum number of channels
int64_t busId; // Bus id
} net; // Network node
struct {
int arch; // Architecture
int vendor; // Vendor
int model; // Model
cpu_set_t affinity; // CPU affinity
} cpu; // CPU node
struct {
uint64_t device; // PCI device
} pci; // PCI node
};
int nlinks; // Number of links
struct scclTopoLink links[SCCL_TOPO_MAX_LINKS]; // Link array
// Pre-computed paths to GPUs and NICs, one list pointer per node type
struct scclTopoLinkList* paths[SCCL_TOPO_NODE_TYPES];
// Used during search
uint64_t used;
};
// All nodes of one type.
struct scclTopoNodeSet {
int count; // Number of nodes in use
struct scclTopoNode nodes[SCCL_TOPO_MAX_NODES]; // Node array; capacity is SCCL_TOPO_MAX_NODES
};
// The complete topology of one system.
struct scclTopoSystem {
struct scclTopoNodeSet nodes[SCCL_TOPO_NODE_TYPES]; // Node sets, one per node type
float maxBw; // Maximum bandwidth of the system
float baseBw; // Base bandwidth
float totalBw; // Total bandwidth of the system
int type; // System type
int nRanks; // Number of ranks in the system
int netGdrLevel; // Network GDR level
int tuning; // Tuning parameter
int pivotA2ANumBiRings; // Number of bidirectional rings in pivot A2A mode
bool pivotA2AEnabled; // Whether the pivot A2A communication mode is enabled
bool treeDefined; // Whether a tree structure is defined
bool ll128Enabled; // Whether LL128 mode is enabled
bool mscclEnabled; // Whether MSCCL mode is enabled
};
// Bandwidth constants used when building the topology.
// Units are not stated in this file; NET_BW pairs 12.0 with "100Gbit", which suggests GB/s - confirm.
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.0
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
#define ARM_BW 6.0
#define NET_BW 12.0 // 100Gbit
// XGMI width constants per GPU family.
#define VEGA_XGMI_WIDTH 24.0
#define MI200_XGMI_WIDTH 36.0
#define GFX94X_XGMI_WIDTH 48.0
// Intel CPUs convert GPU P2P traffic into 64-byte PCI TLPs, so GPU-to-GPU
// traffic consumes more PCI bandwidth. The argument is parenthesized so that
// compound expressions such as (a + b) are scaled as a whole.
#define INTEL_P2P_OVERHEAD(bw) ((bw) * 6 / 5)
// CPU architecture identifiers.
enum topoCpuArch {
SCCL_TOPO_CPU_ARCH_X86 = 1,
SCCL_TOPO_CPU_ARCH_POWER = 2,
SCCL_TOPO_CPU_ARCH_ARM = 3
};
// CPU vendor identifiers.
enum topoCpuVendor {
SCCL_TOPO_CPU_VENDOR_INTEL = 1,
SCCL_TOPO_CPU_VENDOR_AMD = 2,
SCCL_TOPO_CPU_VENDOR_ZHAOXIN = 3
};
// CPU model identifiers.
enum topoCpuType {
SCCL_TOPO_CPU_TYPE_BDW = 1,
SCCL_TOPO_CPU_TYPE_SKL = 2,
SCCL_TOPO_CPU_TYPE_ZEN = 3,
SCCL_TOPO_CPU_TYPE_ROME = 4,
SCCL_TOPO_CPU_TYPE_YONGFENG = 5
};
// Pattern identifiers.
// NOTE(review): despite the "Cpu" in the enum name, the values name tree/ring/NVLS
// patterns rather than CPU properties - confirm the intended name.
enum topoCpuPattern {
SCCL_TOPO_PATTERN_BALANCED_TREE = 1,
SCCL_TOPO_PATTERN_SPLIT_TREE = 2,
SCCL_TOPO_PATTERN_TREE = 3,
SCCL_TOPO_PATTERN_RING = 4,
SCCL_TOPO_PATTERN_NVLS = 5
};
// NOTE(review): this macro duplicates the constexpr SCCL_TOPO_MAX_NODES declared near the
// top of this header. From here on the preprocessor replaces the identifier with 256, so a
// later declaration of a variable with the same name would no longer compile.
#define SCCL_TOPO_MAX_NODES 256
// String table for path types; defined in another translation unit.
extern const char* topoPathTypeStr[];
// Intel CPU model identifiers; the values equal SCCL_TOPO_CPU_TYPE_BDW and SCCL_TOPO_CPU_TYPE_SKL.
#define SCCL_TOPO_CPU_INTEL_BDW 1
#define SCCL_TOPO_CPU_INTEL_SKL 2
// System type values. Apart from SCCL_TOPO_UNDEF the values are distinct powers of two,
// so they can be combined as bit flags (presumably in scclTopoSystem::type - confirm).
enum topoSysType {
SCCL_TOPO_UNDEF = -1,
SCCL_TOPO_CR8G = 1,
SCCL_TOPO_4P2H_ROME = 2,
SCCL_TOPO_GDR_ALL = 4,
SCCL_TOPO_16P1H = 8,
SCCL_TOPO_FORCE_INTRA = 16,
SCCL_TOPO_XGMI_ALL = 32
};
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <ctype.h>
#include "check.h"
#include "nvmlwrap.h"
#include "xml.h"
#include "rocm_smi_wrap.h"
#include "archinfo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
/**************/
/* XML Struct */
/* Functions */
/**************/
/**
 * Find the position of an attribute on an XML node.
 *
 * @param node     XML node to search
 * @param attrName Attribute name; at most MAX_STR_LEN characters are compared
 * @param index    Output; position of the first matching attribute, or -1
 * @return Always scclSuccess, including when the attribute is absent
 */
scclResult_t xmlGetAttrIndex(struct scclXmlNode* node, const char* attrName, int* index) {
    *index          = -1;
    const int total = node->nAttrs;
    int pos         = 0;
    while(pos < total) {
        if(strncmp(node->attrs[pos].key, attrName, MAX_STR_LEN) == 0) {
            *index = pos;
            break;
        }
        pos++;
    }
    return scclSuccess;
}
/**
 * Fetch the string value of an attribute without treating absence as an error.
 *
 * @param node     XML node to query
 * @param attrName Attribute name
 * @param value    Output; points at the stored value, or NULL when the
 *                 attribute does not exist
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlGetAttr(struct scclXmlNode* node, const char* attrName, const char** value) {
    int pos;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &pos));
    if(pos == -1) {
        *value = NULL;
    } else {
        *value = node->attrs[pos].value;
    }
    return scclSuccess;
}
/**
 * Fetch the string value of an attribute that must exist.
 *
 * @param node     XML node to query
 * @param attrName Attribute name
 * @param value    Output; points at the stored value (NULL on failure)
 * @return scclSuccess when the attribute exists, scclInternalError (after a
 *         warning) when it does not
 */
scclResult_t xmlGetAttrStr(struct scclXmlNode* node, const char* attrName, const char** value) {
    SCCLCHECK(xmlGetAttr(node, attrName, value));
    if(*value != NULL)
        return scclSuccess;
    WARN("Attribute %s of node %s not found", attrName, node->name);
    return scclInternalError;
}
/**
 * Read an attribute that must exist and parse it as an integer.
 *
 * @param node     XML node to query
 * @param attrName Attribute name
 * @param value    Output; the parsed integer
 * @return scclSuccess on success, or the error from the attribute lookup
 *
 * @note Parsing uses strtol with base 0, so decimal, octal ("0...") and
 *       hexadecimal ("0x...") notations are accepted. Parse errors are not
 *       reported.
 */
scclResult_t xmlGetAttrInt(struct scclXmlNode* node, const char* attrName, int* value) {
    const char* text;
    SCCLCHECK(xmlGetAttrStr(node, attrName, &text));
    const long parsed = strtol(text, NULL, 0);
    *value            = parsed;
    return scclSuccess;
}
/**
 * Read an integer attribute, falling back to a default when it is absent.
 *
 * @param node         XML node to query
 * @param attrName     Attribute name
 * @param value        Output; the parsed integer, or defaultValue
 * @param defaultValue Value returned when the attribute does not exist
 * @return scclResult_t scclSuccess unless the attribute lookup fails
 */
scclResult_t xmlGetAttrIntDefault(struct scclXmlNode* node, const char* attrName, int* value, int defaultValue) {
    const char* text;
    SCCLCHECK(xmlGetAttr(node, attrName, &text));
    if(text == NULL) {
        *value = defaultValue;
    } else {
        *value = strtol(text, NULL, 0);
    }
    return scclSuccess;
}
// Only set values if not already set
/**
 * @brief Initialise an integer attribute of an XML node.
 *
 * Creates the attribute and stores the value when it does not exist yet; an
 * existing attribute is left unchanged.
 *
 * @param node     XML node to update
 * @param attrName Attribute name (truncated to MAX_STR_LEN characters)
 * @param value    Integer value, stored in decimal
 * @return scclResult_t scclSuccess unless the index lookup fails
 */
scclResult_t xmlInitAttrInt(struct scclXmlNode* node, const char* attrName, const int value) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
    if(index == -1) {
        index = node->nAttrs++;
        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
        // strncpy does not terminate when attrName fills the buffer; terminate
        // explicitly, as the xmlSetAttr* functions do.
        node->attrs[index].key[MAX_STR_LEN] = '\0';
        snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
    }
    return scclSuccess;
}
/**
 * Initialise a uint64 attribute of an XML node.
 *
 * Creates the attribute and stores the value, formatted as "0x" followed by
 * lowercase hexadecimal digits, when it does not exist yet. An existing
 * attribute is left unchanged (it is NOT overwritten).
 *
 * @param node     XML node to update
 * @param attrName Attribute name (truncated to MAX_STR_LEN characters)
 * @param value    Value to store
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlInitAttrUint64(struct scclXmlNode* node, const char* attrName, const uint64_t value) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
    if(index == -1) {
        index = node->nAttrs++;
        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
        // strncpy does not terminate when attrName fills the buffer; terminate
        // explicitly, as the xmlSetAttr* functions do.
        node->attrs[index].key[MAX_STR_LEN] = '\0';
        // Cast so the conversion specifier matches the argument type on every ABI.
        snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%llx", (unsigned long long)value);
    }
    return scclSuccess;
}
/**
 * Read an attribute that must exist and parse it as a float.
 *
 * @param node     XML node to query
 * @param attrName Attribute name
 * @param value    Output; the parsed value (strtof; parse errors are not reported)
 * @return scclSuccess on success, or the error from the attribute lookup
 */
scclResult_t xmlGetAttrFloat(struct scclXmlNode* node, const char* attrName, float* value) {
    const char* text;
    SCCLCHECK(xmlGetAttrStr(node, attrName, &text));
    const float parsed = strtof(text, NULL);
    *value             = parsed;
    return scclSuccess;
}
/**
 * Initialise a float attribute of an XML node.
 *
 * Creates the attribute and stores the value (formatted with "%f") when it
 * does not exist yet; an existing attribute is left unchanged.
 *
 * @param node     XML node to update
 * @param attrName Attribute name (truncated to MAX_STR_LEN characters)
 * @param value    Value to store
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlInitAttrFloat(struct scclXmlNode* node, const char* attrName, const float value) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
    if(index == -1) {
        index = node->nAttrs++;
        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
        // strncpy does not terminate when attrName fills the buffer; terminate
        // explicitly, as the xmlSetAttr* functions do.
        node->attrs[index].key[MAX_STR_LEN] = '\0';
        snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value);
    }
    return scclSuccess;
}
/**
 * Find the first XML node with the given tag name.
 *
 * @param xml     XML document to search (nodes 0 .. maxIndex-1)
 * @param tagName Tag name to match exactly
 * @param node    Output; the first matching node, or NULL when none matches
 * @return Always scclSuccess, including when no node matches
 */
scclResult_t xmlFindTag(struct scclXml* xml, const char* tagName, struct scclXmlNode** node) {
    *node   = NULL;
    int pos = 0;
    while(pos < xml->maxIndex) {
        struct scclXmlNode* candidate = &xml->nodes[pos];
        if(strcmp(candidate->name, tagName) == 0) {
            *node = candidate;
            break;
        }
        pos++;
    }
    return scclSuccess;
}
/**
 * Find the first XML node with the given tag name and attribute value.
 *
 * @param xml       XML document to search (nodes 0 .. maxIndex-1)
 * @param tagName   Tag name to match exactly
 * @param node      Output; the first matching node, or NULL when none matches
 * @param attrName  Attribute that must be present on the node
 * @param attrValue Value the attribute must equal
 * @return scclSuccess (also when nothing matches) unless an attribute lookup fails
 */
scclResult_t xmlFindTagKv(struct scclXml* xml, const char* tagName, struct scclXmlNode** node, const char* attrName, const char* attrValue) {
    *node = NULL;
    for(int pos = 0; pos < xml->maxIndex; pos++) {
        struct scclXmlNode* candidate = &xml->nodes[pos];
        if(strcmp(candidate->name, tagName) != 0)
            continue;
        const char* stored;
        SCCLCHECK(xmlGetAttr(candidate, attrName, &stored));
        if(stored == NULL)
            continue;
        if(strcmp(stored, attrValue) == 0) {
            *node = candidate;
            return scclSuccess;
        }
    }
    return scclSuccess;
}
/**
 * Set a string attribute, creating it when it does not exist.
 *
 * @param node     XML node to update
 * @param attrName Attribute name (truncated to MAX_STR_LEN characters)
 * @param value    New value (truncated to MAX_STR_LEN characters)
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlSetAttr(struct scclXmlNode* node, const char* attrName, const char* value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot == -1) {
        // Append a new attribute and record its name.
        slot         = node->nAttrs;
        node->nAttrs = slot + 1;
        strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
        node->attrs[slot].key[MAX_STR_LEN] = '\0';
    }
    strncpy(node->attrs[slot].value, value, MAX_STR_LEN);
    node->attrs[slot].value[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
/**
 * Set a string attribute only when it does not exist yet.
 *
 * @param node     XML node to update
 * @param attrName Attribute name (truncated to MAX_STR_LEN characters)
 * @param value    Value stored for a newly created attribute (truncated to
 *                 MAX_STR_LEN characters)
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlSetAttrIfUnset(struct scclXmlNode* node, const char* attrName, const char* value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot == -1) {
        slot         = node->nAttrs;
        node->nAttrs = slot + 1;
        strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
        node->attrs[slot].key[MAX_STR_LEN] = '\0';
        strncpy(node->attrs[slot].value, value, MAX_STR_LEN);
        node->attrs[slot].value[MAX_STR_LEN] = '\0';
    }
    return scclSuccess;
}
/**
 * Set an integer attribute, creating it when it does not exist.
 *
 * @param node     XML node to update
 * @param attrName Attribute name (truncated to MAX_STR_LEN characters)
 * @param value    New value, stored in decimal
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlSetAttrInt(struct scclXmlNode* node, const char* attrName, const int value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot == -1) {
        // Append a new attribute and record its name.
        slot         = node->nAttrs;
        node->nAttrs = slot + 1;
        strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
        node->attrs[slot].key[MAX_STR_LEN] = '\0';
    }
    snprintf(node->attrs[slot].value, MAX_STR_LEN, "%d", value);
    node->attrs[slot].value[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
/**
 * Set a float attribute, creating it when it does not exist.
 *
 * @param node     XML node to update
 * @param attrName Attribute name (truncated to MAX_STR_LEN characters)
 * @param value    New value, formatted with "%g"
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlSetAttrFloat(struct scclXmlNode* node, const char* attrName, const float value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot == -1) {
        // Append a new attribute and record its name.
        slot         = node->nAttrs;
        node->nAttrs = slot + 1;
        strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
        node->attrs[slot].key[MAX_STR_LEN] = '\0';
    }
    snprintf(node->attrs[slot].value, MAX_STR_LEN, "%g", value);
    node->attrs[slot].value[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
/**
 * Remove an attribute from an XML node.
 *
 * The attributes stored after the removed one are each copied one slot
 * forward, then the attribute count is decremented. Nothing happens when the
 * attribute does not exist.
 *
 * @param node     XML node to update
 * @param attrName Name of the attribute to remove
 * @return scclSuccess unless the index lookup fails
 */
scclResult_t xmlUnsetAttr(struct scclXmlNode* node, const char* attrName) {
    int removed;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &removed));
    if(removed != -1) {
        for(int dst = removed; dst + 1 < node->nAttrs; dst++) {
            strcpy(node->attrs[dst].key, node->attrs[dst + 1].key);
            strcpy(node->attrs[dst].value, node->attrs[dst + 1].value);
        }
        node->nAttrs--;
    }
    return scclSuccess;
}
/**
 * Find the first direct child of a node with the given tag name.
 *
 * @param node    Parent XML node
 * @param subName Tag name to match exactly
 * @param sub     Output; the first matching child, or NULL when none matches
 * @return Always scclSuccess, including when no child matches
 */
scclResult_t xmlGetSub(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub) {
    *sub    = NULL;
    int pos = 0;
    while(pos < node->nSubs) {
        struct scclXmlNode* child = node->subs[pos];
        if(strcmp(child->name, subName) == 0) {
            *sub = child;
            break;
        }
        pos++;
    }
    return scclSuccess;
}
/**
 * Find the first direct child with the given tag name and attribute value.
 *
 * @param node      Parent XML node
 * @param subName   Tag name to match exactly
 * @param sub       Output; the first matching child, or NULL when none matches
 * @param attrName  Attribute that must be present on the child
 * @param attrValue Value the attribute must equal
 * @return scclSuccess (also when nothing matches) unless an attribute lookup fails
 */
scclResult_t xmlGetSubKv(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const char* attrValue) {
    *sub = NULL;
    for(int pos = 0; pos < node->nSubs; pos++) {
        struct scclXmlNode* child = node->subs[pos];
        if(strcmp(child->name, subName) != 0)
            continue;
        const char* stored;
        SCCLCHECK(xmlGetAttr(child, attrName, &stored));
        if(stored == NULL)
            continue;
        if(strcmp(stored, attrValue) == 0) {
            *sub = child;
            return scclSuccess;
        }
    }
    return scclSuccess;
}
/**
 * Find the first direct child with the given tag name whose attribute equals
 * the decimal text of an integer.
 *
 * @param node      Parent XML node
 * @param subName   Tag name to match exactly
 * @param sub       Output; the first matching child, or NULL when none matches
 * @param attrName  Attribute that must be present on the child
 * @param attrValue Integer whose decimal representation the attribute must equal
 * @return scclSuccess (also when nothing matches) unless the lookup fails
 */
scclResult_t xmlGetSubKvInt(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const int attrValue) {
    // Sized for any int: a 32-bit int needs up to 11 characters plus the
    // terminator, so the previous 10-byte buffer truncated large values.
    char strValue[16];
    snprintf(strValue, sizeof(strValue), "%d", attrValue);
    SCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue));
    return scclSuccess;
}
/**
 * Allocate the next free node of an XML document and attach it to a parent.
 *
 * @param xml     XML document that owns the node storage
 * @param parent  Parent node, or NULL for a node without parent
 * @param subName Tag name of the new node (truncated to MAX_STR_LEN characters)
 * @param sub     Output; the new node
 * @return scclSuccess, or scclInternalError when all MAX_NODES nodes are in use
 *
 * The parent's child list is not bounds-checked here.
 */
scclResult_t xmlAddNode(struct scclXml* xml, struct scclXmlNode* parent, const char* subName, struct scclXmlNode** sub) {
    if(xml->maxIndex == MAX_NODES) {
        WARN("Error : too many XML nodes (max %d)", MAX_NODES);
        return scclInternalError;
    }
    struct scclXmlNode* fresh = &xml->nodes[xml->maxIndex];
    xml->maxIndex += 1;
    fresh->nSubs  = 0;
    fresh->nAttrs = 0;
    *sub          = fresh;
    fresh->parent = parent;
    if(parent != NULL) {
        parent->subs[parent->nSubs] = fresh;
        parent->nSubs += 1;
    }
    strncpy(fresh->name, subName, MAX_STR_LEN);
    fresh->name[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
/**
 * Detach a node from its parent and mark it as unused.
 *
 * The node's type is set to NODE_TYPE_NONE. When the node has a parent, it is
 * removed from the parent's child list and the later children are shifted
 * forward by one slot.
 *
 * @param node XML node to remove
 * @return Always scclSuccess
 */
scclResult_t xmlRemoveNode(struct scclXmlNode* node) {
    node->type                 = NODE_TYPE_NONE;
    struct scclXmlNode* parent = node->parent;
    if(parent == NULL)
        return scclSuccess;
    int shift = 0;
    for(int s = 0; s < parent->nSubs; s++) {
        if(parent->subs[s] == node)
            shift = 1;
        else if(shift)
            parent->subs[s - 1] = parent->subs[s];
    }
    // Only shrink the list when the node was actually found; otherwise an
    // unrelated child (the last one) would be dropped, e.g. on a repeated removal.
    if(shift)
        parent->nSubs--;
    return scclSuccess;
}
/**
 * Translate a string into an integer through a dictionary.
 *
 * The dictionary ends with an entry whose str is NULL; that entry's value is
 * the fallback. An entry matches when str begins with the entry's string
 * (prefix comparison over the length of the entry's string).
 *
 * @param str   String to translate
 * @param value Output; value of the first matching entry, or the fallback
 * @param dict  Dictionary, terminated by an entry with str == NULL
 * @return Always scclSuccess
 */
scclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) {
    struct kvDict* entry = dict;
    for(; entry->str != NULL; entry++) {
        if(strncmp(str, entry->str, strlen(entry->str)) != 0)
            continue;
        *value = entry->value;
        return scclSuccess;
    }
    // entry now points at the terminating entry, which carries the fallback value.
    INFO(SCCL_LOG_GRAPH, "KV Convert to int : could not find value of '%s' in dictionary, falling back to %d", str, entry->value);
    *value = entry->value;
    return scclSuccess;
}
/**
 * Translate an integer into a string through a dictionary.
 *
 * @param value Integer to translate
 * @param str   Output; string of the first entry with that value (untouched
 *              when no entry matches)
 * @param dict  Dictionary, terminated by an entry with str == NULL
 * @return scclSuccess when an entry matches, scclInternalError (after a
 *         warning) otherwise
 */
scclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) {
    for(struct kvDict* entry = dict; entry->str != NULL; entry++) {
        if(entry->value != value)
            continue;
        *str = entry->str;
        return scclSuccess;
    }
    WARN("KV Convert to str : could not find value %d in dictionary", value);
    return scclInternalError;
}
namespace xml {
/*******************/
/* XML File Parser */
/*******************/
/**
 * Read exactly one character from the file.
 *
 * @param file Input stream
 * @param c    Output; the character read
 * @return scclSuccess, or scclInternalError (after a warning) when nothing
 *         could be read
 */
scclResult_t xmlGetChar(FILE* file, char* c) {
    const size_t got = fread(c, 1, 1, file);
    if(got != 0)
        return scclSuccess;
    WARN("XML Parse : Unexpected EOF");
    return scclInternalError;
}
/**
 * Read an attribute value from the file.
 *
 * The value must be enclosed in double or single quotes and ends at the
 * matching quote of the same kind. When INT_OK is enabled, an unquoted run of
 * digits is accepted as well.
 *
 * @param file  Input stream, positioned right after the '='
 * @param value Output buffer for the value. Assumed to hold at least
 *              MAX_STR_LEN + 1 bytes, like the attribute buffers written by
 *              the xmlSetAttr* functions - TODO confirm for all callers
 * @param last  Output; the character that follows the value
 * @return scclSuccess, or scclInternalError on unexpected EOF, a missing
 *         opening quote, or a value longer than MAX_STR_LEN characters
 */
scclResult_t xmlGetValue(FILE* file, char* value, char* last) {
    char c;
    SCCLCHECK(xmlGetChar(file, &c));
    if(c != '"' && c != '\'') {
#if INT_OK
        int o = 0;
        do {
            if(o == MAX_STR_LEN) {
                value[o] = '\0';
                WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN);
                return scclInternalError;
            }
            value[o++] = c;
            SCCLCHECK(xmlGetChar(file, &c));
        } while(c >= '0' && c <= '9');
        value[o] = '\0';
        *last    = c;
        return scclSuccess;
#else
        WARN("XML Parse : Expected (double) quote.");
        return scclInternalError;
#endif
    }
    // Remember which quote opened the value so that a single-quoted value is
    // closed by a single quote rather than by the next double quote.
    const char quote = c;
    int o            = 0;
    for(;;) {
        SCCLCHECK(xmlGetChar(file, &c));
        if(c == quote)
            break;
        // Refuse to write past the end of the value buffer.
        if(o == MAX_STR_LEN) {
            value[o] = '\0';
            WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN);
            return scclInternalError;
        }
        value[o++] = c;
    }
    value[o] = '\0';
    SCCLCHECK(xmlGetChar(file, last));
    return scclSuccess;
}
/**
 * Read one token: either a bare name, or a name=value pair.
 *
 * Characters are accumulated into `name` until a delimiter (' ', '>', '/',
 * '\n', '\r') or '=' is met. On '=', the value is read through xmlGetValue.
 *
 * @param file  Input stream.
 * @param name  Receives the NUL-terminated name (at most MAX_STR_LEN - 1 chars).
 * @param value Receives the value of a name=value pair; pass NULL when a
 *              value is not allowed at this position.
 * @param last  Receives the delimiter that ended the token.
 * @return scclSuccess, or scclInternalError on EOF, an over-long name, or an
 *         unexpected '='.
 */
scclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
    int len = 0;
    char c;
    for(;;) {
        SCCLCHECK(xmlGetChar(file, &c));
        if(c == '=') {
            name[len] = '\0';
            if(value == NULL) {
                WARN("XML Parse : Unexpected value with name %s", name);
                return scclInternalError;
            }
            return xmlGetValue(file, value, last);
        }
        name[len] = c;
        if(len == MAX_STR_LEN - 1) {
            name[len] = '\0';
            WARN("Error : name %s too long (max %d)", name, MAX_STR_LEN);
            return scclInternalError;
        }
        len++;
        const bool isDelimiter = (c == ' ' || c == '>' || c == '/' || c == '\n' || c == '\r');
        if(isDelimiter)
            break;
    }
    // The delimiter itself was stored as the last character: overwrite it.
    name[len - 1] = '\0';
    *last = c;
    return scclSuccess;
}
// Shift the 3-chars string by one char and append c at the end.
// Arguments are parenthesized so that expressions such as `buf + 1` can be
// passed safely. Note that `s` is still evaluated several times.
#define SHIFT_APPEND(s, c)   \
    do {                     \
        (s)[0] = (s)[1];     \
        (s)[1] = (s)[2];     \
        (s)[2] = (c);        \
    } while(0)
/**
 * Consume the input up to and including the "-->" that ends an XML comment.
 *
 * @param file  Input stream, positioned somewhere inside the comment.
 * @param start Characters of the comment that were already read as part of
 *              the element name (everything after the leading "!--").
 * @param next  The delimiter character that ended that name.
 * @return scclSuccess, or scclInternalError when EOF is hit first.
 */
scclResult_t xmlSkipComment(FILE* file, char* start, char next) {
    // Start from something neutral with \0 at the end.
    char end[4] = "...";
    // Inject all trailing chars from previous reads. We don't need
    // to check for --> here because there cannot be a > in the name.
    const size_t startLen = strlen(start);
    for(size_t i = 0; i < startLen; i++)
        SHIFT_APPEND(end, start[i]);
    SHIFT_APPEND(end, next);
    // Stop when we find "-->"
    while(strcmp(end, "-->") != 0) {
        // Read into a char: reading one byte into an int leaves the other
        // bytes uninitialized and only works by accident on little-endian.
        char c;
        if(fread(&c, 1, 1, file) != 1) {
            WARN("XML Parse error : unterminated comment");
            return scclInternalError;
        }
        SHIFT_APPEND(end, c);
    }
    return scclSuccess;
}
/**
 * Parse the next tag from the XML file into `node`.
 *
 * On return node->type tells what was found:
 *  - NODE_TYPE_NONE   : end of file reached before any tag,
 *  - NODE_TYPE_OPEN   : "<name attr=... >",
 *  - NODE_TYPE_CLOSE  : "</name>",
 *  - NODE_TYPE_SINGLE : "<name attr=... />".
 * Comments ("<!-- ... -->") are skipped transparently.
 *
 * @param file Input stream.
 * @param node Node to fill (name, attributes, type).
 * @return scclSuccess, or scclInternalError on malformed input.
 */
scclResult_t xmlGetNode(FILE* file, struct scclXmlNode* node) {
    node->type = NODE_TYPE_NONE;
    // Skip blanks between tags; running out of input here is not an error.
    char ch;
    do {
        if(fread(&ch, 1, 1, file) == 0)
            return scclSuccess;
    } while(ch == ' ' || ch == '\n' || ch == '\r');
    if(ch != '<') {
        WARN("XML Parse error : expecting '<', got '%c'", ch);
        return scclInternalError;
    }
    // Read XML element name
    SCCLCHECK(xmlGetToken(file, node->name, NULL, &ch));
    // Comments are dropped and parsing restarts on whatever follows them.
    if(strncmp(node->name, "!--", 3) == 0) {
        SCCLCHECK(xmlSkipComment(file, node->name + 3, ch));
        return xmlGetNode(file, node);
    }
    // A closing tag shows up as an empty name immediately followed by '/'.
    const bool isClosingTag = (node->name[0] == '\0' && ch == '/');
    if(isClosingTag) {
        node->type = NODE_TYPE_CLOSE;
        // Re-read the name, we got '/' in the first call
        SCCLCHECK(xmlGetToken(file, node->name, NULL, &ch));
        if(ch != '>') {
            WARN("XML Parse error : unexpected trailing %c in closing tag %s", ch, node->name);
            return scclInternalError;
        }
        return scclSuccess;
    }
    node->type = NODE_TYPE_OPEN;
    // Attributes: one token per separating space.
    int nAttrs = 0;
    while(ch == ' ') {
        SCCLCHECK(xmlGetToken(file, node->attrs[nAttrs].key, node->attrs[nAttrs].value, &ch));
        if(nAttrs < MAX_ATTR_COUNT) {
            nAttrs++;
        } else {
            // Surplus attributes are still consumed, into the spare slot
            // attrs[MAX_ATTR_COUNT], but are not counted.
            INFO(SCCL_LOG_TOPO, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
        }
    }
    node->nAttrs = nAttrs;
    if(ch == '/') {
        node->type = NODE_TYPE_SINGLE;
        char rest[MAX_STR_LEN];
        SCCLCHECK(xmlGetToken(file, rest, NULL, &ch));
    }
    if(ch == '>')
        return scclSuccess;
    WARN("XML Parse : expected >, got '%c'", ch);
    return scclInternalError;
}
// Callback invoked by xmlLoadSub for an element whose name matches a handler.
// Arguments: the input file, the XML document being filled, and the node that
// was just parsed for the matching element.
typedef scclResult_t (*xmlHandlerFunc_t)(FILE*, struct scclXml*, struct scclXmlNode*);
// Associates an element name with the callback that loads its content.
struct xmlHandler {
const char* name; // Element name to match (compared with strcmp in xmlLoadSub)
xmlHandlerFunc_t func; // Callback run for a matching element
};
/**
 * Load the children of `head` (or the top-level elements when head is NULL).
 *
 * Tags are read one by one. An element whose name matches a handler is
 * attached to `head`, kept in the node array, and its content is loaded by
 * the handler. Any other element is parsed and discarded, together with all
 * of its children.
 *
 * @param file      Input stream.
 * @param xml       Document being filled; xml->maxIndex is the next free slot.
 * @param head      Element whose content is being loaded, or NULL at top level.
 * @param handlers  Handlers for the child elements that must be kept.
 * @param nHandlers Number of entries in `handlers`.
 * @return scclSuccess at the closing tag of `head` (or at EOF for the top
 *         level); scclInternalError on malformed input or when a capacity
 *         limit (MAX_NODES, MAX_SUBS) is reached.
 */
scclResult_t xmlLoadSub(FILE* file, struct scclXml* xml, struct scclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
    // A self-closed element ("<x/>") has no content to load.
    if(head && head->type == NODE_TYPE_SINGLE)
        return scclSuccess;
    while(1) {
        if(xml->maxIndex == MAX_NODES) {
            WARN("Error : XML parser is limited to %d nodes", MAX_NODES);
            return scclInternalError;
        }
        // Parse into the next free slot; the slot is only claimed
        // (maxIndex++) when a handler accepts the element.
        struct scclXmlNode* node = xml->nodes + xml->maxIndex;
        memset(node, 0, sizeof(struct scclXmlNode));
        SCCLCHECK(xmlGetNode(file, node));
        if(node->type == NODE_TYPE_NONE) {
            if(head) {
                WARN("XML Parse : unterminated %s", head->name);
                return scclInternalError;
            } else {
                // All done
                return scclSuccess;
            }
        }
        if(head && node->type == NODE_TYPE_CLOSE) {
            if(strcmp(node->name, head->name) != 0) {
                WARN("XML Mismatch : %s / %s", head->name, node->name);
                return scclInternalError;
            }
            return scclSuccess;
        }
        int found = 0;
        for(int h = 0; h < nHandlers; h++) {
            if(strcmp(node->name, handlers[h].name) == 0) {
                if(head) {
                    // Do not write past the end of the parent's child array.
                    if(head->nSubs == MAX_SUBS) {
                        WARN("Error : too many XML sub-nodes under %s (max %d)", head->name, MAX_SUBS);
                        return scclInternalError;
                    }
                    head->subs[head->nSubs++] = node;
                }
                node->parent = head;
                node->nSubs = 0;
                xml->maxIndex++;
                SCCLCHECK(handlers[h].func(file, xml, node));
                found = 1;
                break;
            }
        }
        if(!found) {
            if(nHandlers)
                INFO(SCCL_LOG_TOPO, "Ignoring element %s", node->name);
            // NOTE(review): the ignored element still occupies the unclaimed
            // slot, so the recursive call reuses (and clears) that same slot
            // as scratch space; as a consequence the closing-tag name check
            // compares a node with itself while skipping. Confirm this is
            // intended.
            SCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0));
        }
    }
}
/**************/
/* XML Writer */
/**************/
/**
 * Write `node` and, recursively, all of its children as XML text.
 *
 * @param indent Number of indentation units written before the tag.
 * @param file   Output stream.
 * @param node   Node to dump.
 * @return scclSuccess, or the error of a recursive call.
 */
scclResult_t scclTopoDumpXmlRec(int indent, FILE* file, struct scclXmlNode* node) {
    int pad;
    for(pad = indent; pad > 0; pad--)
        fprintf(file, " ");
    fprintf(file, "<%s", node->name);
    int attr = 0;
    while(attr < node->nAttrs) {
        fprintf(file, " %s=\"%s\"", node->attrs[attr].key, node->attrs[attr].value);
        attr++;
    }
    // A leaf is written as a self-closed element.
    if(node->nSubs == 0) {
        fprintf(file, "/>\n");
        return scclSuccess;
    }
    fprintf(file, ">\n");
    for(int child = 0; child < node->nSubs; child++) {
        SCCLCHECK(scclTopoDumpXmlRec(indent + 2, file, node->subs[child]));
    }
    for(pad = indent; pad > 0; pad--)
        fprintf(file, " ");
    fprintf(file, "</%s>\n", node->name);
    return scclSuccess;
}
/****************************************/
/* Parser rules for our specific format */
/****************************************/
// Loader for the link element of a GPU (registered under the name "xgmi" by
// scclTopoXmlLoadGpu). No child handler is given, so any nested element is
// parsed and discarded by xmlLoadSub.
scclResult_t scclTopoXmlLoadNvlink(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
SCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
return scclSuccess;
}
// Loader for <gpu> elements: only <xgmi> children are kept, any other child
// element is discarded by xmlLoadSub.
scclResult_t scclTopoXmlLoadGpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    struct xmlHandler gpuChildren[] = {{"xgmi", scclTopoXmlLoadNvlink}};
    const int nGpuChildren = (int)(sizeof(gpuChildren) / sizeof(gpuChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xml, head, gpuChildren, nGpuChildren));
    return scclSuccess;
}
// Loader for <net> elements (registered by scclTopoXmlLoadNic). No child
// handler is given, so any nested element is parsed and discarded.
scclResult_t scclTopoXmlLoadNet(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
SCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
return scclSuccess;
}
// Loader for <nic> elements: only <net> children are kept.
scclResult_t scclTopoXmlLoadNic(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    struct xmlHandler nicChildren[] = {{"net", scclTopoXmlLoadNet}};
    const int nNicChildren = (int)(sizeof(nicChildren) / sizeof(nicChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xml, head, nicChildren, nNicChildren));
    return scclSuccess;
}
// Loader for <pci> elements: nested <pci>, <gpu> and <nic> children are kept
// (a <pci> element may therefore contain further <pci> levels).
scclResult_t scclTopoXmlLoadPci(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    struct xmlHandler pciChildren[] = {
        {"pci", scclTopoXmlLoadPci},
        {"gpu", scclTopoXmlLoadGpu},
        {"nic", scclTopoXmlLoadNic},
    };
    const int nPciChildren = (int)(sizeof(pciChildren) / sizeof(pciChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xml, head, pciChildren, nPciChildren));
    return scclSuccess;
}
// Loader for <cpu> elements: <pci> and <nic> children are kept.
scclResult_t scclTopoXmlLoadCpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    struct xmlHandler cpuChildren[] = {
        {"pci", scclTopoXmlLoadPci},
        {"nic", scclTopoXmlLoadNic},
    };
    const int nCpuChildren = (int)(sizeof(cpuChildren) / sizeof(cpuChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xml, head, cpuChildren, nCpuChildren));
    return scclSuccess;
}
/**
 * Loader for the top-level <system> element of a topology file.
 *
 * Checks the "version" attribute against SCCL_TOPO_XML_VERSION, logs the
 * optional "name" attribute, then loads the <cpu> children.
 *
 * @return scclSuccess, scclInvalidUsage on a version mismatch, or the error
 *         of an attribute lookup / child load.
 */
scclResult_t scclTopoXmlLoadSystem(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    int fileVersion;
    SCCLCHECK(xmlGetAttrInt(head, "version", &fileVersion));
    if(fileVersion != SCCL_TOPO_XML_VERSION) {
        WARN("XML Topology has wrong version %d, %d needed", fileVersion, SCCL_TOPO_XML_VERSION);
        return scclInvalidUsage;
    }
    const char* topoName;
    SCCLCHECK(xmlGetAttr(head, "name", &topoName));
    if(topoName == NULL) {
        INFO(SCCL_LOG_TOPO, "Loading unnamed topology");
    } else {
        INFO(SCCL_LOG_TOPO, "Loading topology %s", topoName);
    }
    struct xmlHandler systemChildren[] = {{"cpu", scclTopoXmlLoadCpu}};
    const int nSystemChildren = (int)(sizeof(systemChildren) / sizeof(systemChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xml, head, systemChildren, nSystemChildren));
    return scclSuccess;
}
/**********************/
/* XML creation */
/* from autodetection */
/**********************/
// Size in bytes, including the terminating NUL, of a full PCI bus id
// ("domain:bus:device.function").
#define BUSID_SIZE (sizeof("0000:00:00.0"))
// Size in bytes, including the terminating NUL, of the "domain:bus" prefix.
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
/**
 * Copy `size` bytes from src to dst, converting each one to lower case.
 * No terminating NUL is added; the regions must not overlap.
 */
static void memcpylower(char* dst, const char* src, const size_t size) {
    for(size_t i = 0; i < size; i++) {
        // tolower() requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behavior.
        dst[i] = (char)tolower((unsigned char)src[i]);
    }
}
/**
 * Resolve the canonical /sys path of a PCI device from its bus id.
 *
 * @param busId PCI bus id, at least "DDDD:BB:dd.f" long (case-insensitive).
 * @param path  Receives a malloc'ed path (from realpath) that the caller must
 *              free(), or NULL on failure.
 * @return scclSuccess, scclInvalidArgument for a missing or too short bus id,
 *         scclSystemError when the path cannot be resolved.
 */
static scclResult_t getPciPath(const char* busId, char** path) {
    *path = NULL;
    // Both copies below read a fixed number of bytes from busId: make sure
    // they are really there.
    if(busId == NULL || strlen(busId) < BUSID_SIZE - 1) {
        WARN("Invalid PCI bus id %s", busId ? busId : "(null)");
        return scclInvalidArgument;
    }
    char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
    // Patch the "domain:bus" and the full bus id into the template.
    memcpylower(busPath + sizeof("/sys/class/pci_bus/") - 1, busId, BUSID_REDUCED_SIZE - 1);
    memcpylower(busPath + sizeof("/sys/class/pci_bus/0000:00/../../") - 1, busId, BUSID_SIZE - 1);
    *path = realpath(busPath, NULL);
    if(*path == NULL) {
        WARN("Could not find real path of %s", busPath);
        return scclSystemError;
    }
    return scclSuccess;
}
/**
 * Read the file `path`/`fileName` and, when it is not empty, store its
 * content as attribute `attrName` of `pciNode`.
 *
 * @return scclSuccess, or the error of the read / attribute update.
 */
scclResult_t scclTopoSetAttrFromSys(struct scclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) {
    char sysValue[MAX_STR_LEN];
    SCCLCHECK(scclTopoGetStrFromSys(path, fileName, sysValue));
    const bool hasValue = (sysValue[0] != '\0');
    if(hasValue) {
        SCCLCHECK(xmlSetAttr(pciNode, attrName, sysValue));
    }
    INFO(SCCL_LOG_TOPO, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, sysValue);
    return scclSuccess;
}
/**
 * Fill the missing attributes of a <cpu> node by auto-detection.
 *
 * - "affinity": read from /sys/devices/system/node/node<numaid>/cpumap.
 * - "arch": derived from the compilation target.
 * - "vendor", "familyid", "modelid" (x86_64 only): read with CPUID.
 * Attributes that already exist are left untouched.
 *
 * @param cpuNode CPU node to complete; it must carry a "numaid" attribute
 *                when "affinity" is missing.
 * @param xml     Unused here.
 * @return scclSuccess, scclInternalError when "numaid" is missing, or the
 *         error of an attribute operation.
 */
scclResult_t scclTopoGetXmlFromCpu(struct scclXmlNode* cpuNode, struct scclXml* xml) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index));
    if(index == -1) {
        const char* numaId;
        SCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId));
        if(numaId == NULL) {
            WARN("GetXmlFromCpu : could not find CPU numa ID.");
            return scclInternalError;
        }
        // Set affinity. The buffer is sized for the longest possible
        // attribute value and snprintf bounds the write in any case.
        char cpumaskPath[sizeof("/sys/devices/system/node/node") + MAX_STR_LEN];
        snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId);
        SCCLCHECK(scclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
    }
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index));
    if(index == -1) {
        // Fill CPU type / vendor / model
#if defined(__PPC__)
        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64"));
#elif defined(__aarch64__)
        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64"));
#elif defined(__x86_64__)
        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64"));
#endif
    }
#if defined(__x86_64__)
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index));
    if(index == -1) {
        union {
            struct {
                // CPUID 0 String register order
                uint32_t ebx;
                uint32_t edx;
                uint32_t ecx;
            };
            char vendor[12];
        } cpuid0;
        // CPUID overwrites eax as well: declare it read-write so that the
        // compiler does not assume it still holds the leaf number.
        uint32_t leaf0 = 0;
        asm volatile("cpuid" : "+a"(leaf0), "=b"(cpuid0.ebx), "=c"(cpuid0.ecx), "=d"(cpuid0.edx) : : "memory");
        char vendor[13];
        memcpy(vendor, cpuid0.vendor, 12);
        vendor[12] = '\0';
        SCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor));
    }
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index));
    if(index == -1) {
        union {
            struct {
                unsigned steppingId : 4;
                unsigned modelId : 4;
                unsigned familyId : 4;
                unsigned processorType : 2;
                unsigned resv0 : 2;
                unsigned extModelId : 4;
                unsigned extFamilyId : 8;
                unsigned resv1 : 4;
            };
            uint32_t val;
        } cpuid1;
        // CPUID also writes ebx, ecx and edx: they must be listed as
        // clobbered even though their values are not used here.
        asm volatile("cpuid" : "=a"(cpuid1.val) : "a"(1) : "ebx", "ecx", "edx", "memory");
        int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4);
        int modelId = cpuid1.modelId + (cpuid1.extModelId << 4);
        SCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId));
        SCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId));
    }
#endif
    return scclSuccess;
}
/**
 * Return the <pci> node carrying the given bus id, creating it (without a
 * parent) when it does not exist yet.
 *
 * @param xml     Document to search / extend.
 * @param busId   Value of the "busid" attribute to look for.
 * @param pciNode Receives the existing or newly created node.
 */
scclResult_t scclTopoGetPciNode(struct scclXml* xml, const char* busId, struct scclXmlNode** pciNode) {
    SCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));
    if(*pciNode != NULL)
        return scclSuccess;
    // Not known yet: create a detached node and tag it with its bus id.
    SCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
    SCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
    return scclSuccess;
}
// Returns non-zero when c is an hexadecimal digit (0-9, a-f, A-F).
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }

// Check whether a string is in BDF format or not.
// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits.
// There can be trailing chars.
// Returns 1 when the string starts with a BDF, 0 otherwise.
int checkBDFFormat(char* bdf) {
    // 'h' stands for "any hex digit"; every other character must match as is.
    static const char pattern[] = "hhhh:hh:hh.h";
    // Characters are checked in order, so a string shorter than the pattern
    // fails on its terminating NUL before anything beyond it is read.
    for(int i = 0; pattern[i] != '\0'; i++) {
        if(pattern[i] == 'h') {
            if(isHex(bdf[i]) == 0)
                return 0;
        } else if(bdf[i] != pattern[i]) {
            return 0;
        }
    }
    return 1;
}
// Forward declaration: the helper below recurses through the public entry point.
scclResult_t scclTopoGetXmlFromSys(struct scclXmlNode* pciNode, struct scclXml* xml);

// Does the actual work of scclTopoGetXmlFromSys. `path` is the /sys path of
// the device (or NULL when unknown); it is modified in place while walking up
// the PCI tree but is owned, and freed, by the caller.
static scclResult_t scclTopoFillPciNodeFromPath(struct scclXmlNode* pciNode, struct scclXml* xml, char* path) {
    if(path) {
        SCCLCHECK(scclTopoSetAttrFromSys(pciNode, path, "class", "class"));
    }
    int index;
    // Identification attributes, read from the /sys file of the same name.
    // These reads are best-effort: a failure is deliberately ignored.
    static const char* const idAttrs[] = {"vendor", "device", "subsystem_vendor", "subsystem_device"};
    for(size_t a = 0; a < sizeof(idAttrs) / sizeof(idAttrs[0]); a++) {
        SCCLCHECK(xmlGetAttrIndex(pciNode, idAttrs[a], &index));
        if(index == -1 && path)
            (void)scclTopoSetAttrFromSys(pciNode, path, idAttrs[a], idAttrs[a]);
    }
    SCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
    if(index == -1) {
        if(path) {
            char deviceSpeedStr[MAX_STR_LEN];
            float deviceSpeed = 0.0f;
            SCCLCHECK(scclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr));
            const bool deviceOk = (sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed) == 1);
            char portSpeedStr[MAX_STR_LEN];
            float portSpeed = 0.0f;
            SCCLCHECK(scclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr));
            const bool portOk = (portSpeedStr[0] != '\0') && (sscanf(portSpeedStr, "%f GT/s", &portSpeed) == 1);
            // Keep the slower of the device and of the upstream port. Only
            // successfully parsed speeds take part in the comparison.
            const bool usePort = portOk && (!deviceOk || portSpeed < deviceSpeed);
            SCCLCHECK(xmlSetAttr(pciNode, "link_speed", usePort ? portSpeedStr : deviceSpeedStr));
        } else {
            SCCLCHECK(xmlSetAttr(pciNode, "link_speed", ""));
        }
    }
    SCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index));
    if(index == -1) {
        if(path) {
            char strValue[MAX_STR_LEN];
            SCCLCHECK(scclTopoGetStrFromSys(path, "max_link_width", strValue));
            int deviceWidth = strtol(strValue, NULL, 0);
            SCCLCHECK(scclTopoGetStrFromSys(path, "../max_link_width", strValue));
            int portWidth;
            if(strValue[0])
                portWidth = strtol(strValue, NULL, 0);
            else
                portWidth = deviceWidth;
            SCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth, portWidth)));
        } else {
            SCCLCHECK(xmlSetAttr(pciNode, "link_width", ""));
        }
    }
    struct scclXmlNode* parent = pciNode->parent;
    if(parent == NULL) {
        if(path) {
            // Save that for later in case next step is a CPU
            char numaIdStr[MAX_STR_LEN];
            SCCLCHECK(scclTopoGetStrFromSys(path, "numa_node", numaIdStr));
            // Workaround kernel bug for now
            if(strcmp(numaIdStr, "-1") == 0)
                strcpy(numaIdStr, "0");
            // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI
            // switch, or stop if we reach a CPU root complex.
            int slashCount = 0;
            int parentOffset;
            for(parentOffset = strlen(path) - 1; parentOffset > 0; parentOffset--) {
                if(path[parentOffset] == '/') {
                    slashCount++;
                    path[parentOffset] = '\0';
                    int start = parentOffset - 1;
                    while(start > 0 && path[start] != '/')
                        start--;
                    // Check whether the parent path looks like "BBBB:BB:DD.F" or not.
                    if(checkBDFFormat(path + start + 1) == 0) {
                        // This a CPU root complex. Create a CPU tag and stop there.
                        struct scclXmlNode* topNode;
                        SCCLCHECK(xmlFindTag(xml, "system", &topNode));
                        SCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
                        if(parent == NULL) {
                            SCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
                            SCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
                        }
                    } else if(slashCount == 2) {
                        // Continue on the upper PCI switch
                        for(int i = strlen(path) - 1; i > 0; i--) {
                            if(path[i] == '/') {
                                SCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path + i + 1));
                                if(parent == NULL) {
                                    SCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
                                    SCCLCHECK(xmlSetAttr(parent, "busid", path + i + 1));
                                }
                                break;
                            }
                        }
                    }
                }
                if(parent)
                    break;
            }
        } else {
            // No information on /sys, attach GPU to unknown CPU
            SCCLCHECK(xmlFindTagKv(xml, "cpu", &parent, "numaid", "-1"));
            if(parent == NULL) {
                struct scclXmlNode* topNode;
                SCCLCHECK(xmlFindTag(xml, "system", &topNode));
                SCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
                SCCLCHECK(xmlSetAttr(parent, "numaid", "-1"));
                SCCLCHECK(scclTopoGetXmlFromCpu(parent, xml));
            }
        }
        // The walk above can run out of path without finding anything.
        if(parent == NULL) {
            WARN("Topology detection : could not find a parent for PCI node %s", pciNode->name);
            return scclInternalError;
        }
        if(parent->nSubs == MAX_SUBS) {
            WARN("Error : too many XML sub-nodes under %s (max %d)", parent->name, MAX_SUBS);
            return scclInternalError;
        }
        pciNode->parent = parent;
        parent->subs[parent->nSubs++] = pciNode;
    }
    // Complete the parent as well, which walks the tree up to the CPU.
    if(strcmp(parent->name, "pci") == 0) {
        SCCLCHECK(scclTopoGetXmlFromSys(parent, xml));
    } else if(strcmp(parent->name, "cpu") == 0) {
        SCCLCHECK(scclTopoGetXmlFromCpu(parent, xml));
    }
    return scclSuccess;
}

/**
 * Fill a <pci> node from /sys and attach it to the topology tree.
 *
 * Missing attributes (class, vendor, device, subsystem ids, link speed and
 * width) are read from the device's /sys directory. When the node has no
 * parent yet, the parent PCI switch or the CPU (NUMA node) is looked up or
 * created, and then completed recursively.
 *
 * @param pciNode PCI node carrying a "busid" attribute.
 * @param xml     Document the node belongs to.
 * @return scclSuccess or the first error met.
 */
scclResult_t scclTopoGetXmlFromSys(struct scclXmlNode* pciNode, struct scclXml* xml) {
    // Fill info, then parent
    const char* busId;
    SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
    char* path = NULL;
    // Best effort: without a /sys path, default attributes are used.
    if(busId != NULL)
        (void)getPciPath(busId, &path);
    // Run the work in a helper so that `path` is released on every exit path.
    const scclResult_t fillStatus = scclTopoFillPciNodeFromPath(pciNode, xml, path);
    free(path);
    SCCLCHECK(fillStatus);
    return scclSuccess;
}
/**
 * Create or complete the <gpu> node located under a <pci> node.
 *
 * Fills the "dev", "sm", "gcn" and "arch" attributes when missing, adds one
 * <xgmi> child per peer device reported as XGMI-connected by rocm_smi, and
 * sets the "tclass" attribute of every <xgmi> child.
 *
 * @param pciNode    PCI node of the GPU (provides the "busid" attribute).
 * @param rocmDev    Device index, or (uint32_t)-1 to look it up by bus id.
 * @param xml        Document being filled.
 * @param gpuNodeRet Receives the GPU node, or NULL when the device index
 *                   could not be determined.
 * @return scclSuccess or the first error met.
 */
scclResult_t scclTopoGetXmlFromGpu(struct scclXmlNode* pciNode, uint32_t rocmDev, struct scclXml* xml, struct scclXmlNode** gpuNodeRet) {
    struct scclXmlNode* gpuNode = NULL;
    SCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode));
    if(gpuNode == NULL) {
        SCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode));
    }
    int index = -1;
    int dev = -1;
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
    if(index == -1) {
        if(rocmDev == (uint32_t)-1) {
            const char* busId;
            SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
            if(busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess)
                dev = -1;
        } else {
            dev = rocmDev;
        }
        SCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
    }
    SCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
    if(dev == -1) {
        *gpuNodeRet = NULL;
        return scclSuccess;
    }
    // NOTE(review): the three property queries below ask for device 0, not
    // `dev` — confirm this is intended on systems with different GPU models.
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
    if(index == -1) {
        int hipMajor, hipMinor;
        hipDeviceProp_t devProp;
        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
        hipMajor = devProp.major;
        hipMinor = devProp.minor;
        SCCLCHECK(xmlSetAttrInt(gpuNode, "sm", hipMajor * 10 + hipMinor));
    }
    // The value is not used further; the call checks that "sm" is readable.
    int sm;
    SCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm));
    (void)sm;
    const char* gcn;
    const char* gcnArchName;
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "gcn", &index));
    if(index == -1) {
        hipDeviceProp_t devProp;
        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
        // extract only the relevant info from the gcnArchName attribute
        // e.g.: convert "gfx908:sramecc+:xnack-" to "gfx908"
        // The buffer must hold the name plus its NUL: "gfx908" alone already
        // needs 7 bytes, so size it like an attribute value.
        char gcnArchNameSubstr[MAX_STR_LEN + 1];
        GcnArchNameFormat(devProp.gcnArchName, gcnArchNameSubstr);
        gcn = gcnArchNameSubstr;
        SCCLCHECK(xmlSetAttr(gpuNode, "gcn", gcn));
    }
    SCCLCHECK(xmlGetAttr(gpuNode, "gcn", &gcn));
    convertGcnArchToGcnArchName(gcn, &gcnArchName);
    SCCLCHECK(xmlSetAttr(gpuNode, "gcn", gcnArchName));
    scclHipDeviceArch_t arch;
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "arch", &index));
    if(index == -1) {
        hipDeviceProp_t devProp;
        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
        memcpy(&arch.arch, &devProp.arch, sizeof(hipDeviceArch_t));
        SCCLCHECK(xmlSetAttrInt(gpuNode, "arch", arch.value));
    }
    SCCLCHECK(xmlGetAttrInt(gpuNode, "arch", &arch.value));
    struct scclXmlNode* nvlNode = NULL;
    SCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode));
    if(nvlNode == NULL) {
        const char* busId;
        SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
        uint32_t deviceCnt;
        SCCLCHECK(rocm_smi_getNumDevice(&deviceCnt));
        for(int i = 0; i < (int)deviceCnt; i++) {
            if(i != dev) {
                RSMI_IO_LINK_TYPE rsmi_type;
                int hops, count;
                if(rocm_smi_getLinkInfo(dev, i, &rsmi_type, &hops, &count) == scclSuccess) {
                    if(rsmi_type >= RSMI_IOLINK_TYPE_XGMI && hops >= 1) {
                        char busIdStr[] = "00000000:00:00.0";
                        SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
                        char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
                        for(int c = 0; c < NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
                            lowerId[c] = tolower((unsigned char)busIdStr[c]);
                            if(busIdStr[c] == 0)
                                break;
                        }
                        // Guarantee termination even if the bus id fills the buffer.
                        lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE - 1] = '\0';
                        SCCLCHECK(xmlGetSubKv(gpuNode, "xgmi", &nvlNode, "target", lowerId));
                        if(nvlNode == NULL) {
                            SCCLCHECK(xmlAddNode(xml, gpuNode, "xgmi", &nvlNode));
                            SCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
                            SCCLCHECK(xmlSetAttrInt(nvlNode, "count", count));
                        }
                    }
                }
            }
        }
    }
    // Fill target classes
    for(int s = 0; s < gpuNode->nSubs; s++) {
        struct scclXmlNode* sub = gpuNode->subs[s];
        if(strcmp(sub->name, "xgmi") != 0)
            continue;
        SCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index));
        if(index == -1) {
            const char* busId;
            SCCLCHECK(xmlGetAttr(sub, "target", &busId));
            char* path = NULL;
            if(busId != NULL)
                (void)getPciPath(busId, &path);
            scclResult_t tclassStatus;
            if(path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) {
                // Remote NVLink device is not visible inside this VM. Assume NVSwitch.
                tclassStatus = xmlSetAttr(sub, "tclass", "0x068000");
            } else {
                tclassStatus = scclTopoSetAttrFromSys(sub, path, "class", "tclass");
            }
            // Release the path on every branch before propagating an error.
            free(path);
            SCCLCHECK(tclassStatus);
        }
    }
    *gpuNodeRet = gpuNode;
    return scclSuccess;
}
// Returns the subsystem name of a path, i.e. the end of the path
// where sysPath/subsystem points to.
// subSys is set to an empty string when the link cannot be resolved. It is
// assumed to be large enough for the last component of a resolved path
// (the caller in this file passes a PATH_MAX buffer).
scclResult_t scclTopoGetSubsystem(const char* sysPath, char* subSys) {
    subSys[0] = '\0';
    char subSysPath[PATH_MAX];
    const int len = snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath);
    // A truncated path would resolve to something unrelated: treat it like
    // a missing subsystem.
    if(len < 0 || (size_t)len >= sizeof(subSysPath))
        return scclSuccess;
    char* path = realpath(subSysPath, NULL);
    if(path == NULL)
        return scclSuccess;
    // Keep what follows the last '/'.
    const char* lastSlash = strrchr(path, '/');
    strcpy(subSys, lastSlash ? lastSlash + 1 : path);
    free(path);
    return scclSuccess;
}
/**
 * Recursively remove the branches of the tree that are not marked "keep".
 *
 * A node with keep="1" is preserved with its whole subtree; the marker
 * attribute is removed. Any other node is preserved only if at least one of
 * its children survives the recursive trimming.
 */
scclResult_t scclTopoTrimXmlRec(struct scclXmlNode* node) {
    const char* keep;
    SCCLCHECK(xmlGetAttr(node, "keep", &keep));
    if(keep != NULL && strcmp(keep, "1") == 0) {
        SCCLCHECK(xmlUnsetAttr(node, "keep"));
        return scclSuccess;
    }
    // Work on a snapshot of the children: trimming a child edits node->subs.
    struct scclXmlNode* children[MAX_SUBS];
    const int nChildren = node->nSubs;
    for(int i = 0; i < nChildren; i++)
        children[i] = node->subs[i];
    for(int i = 0; i < nChildren; i++) {
        SCCLCHECK(scclTopoTrimXmlRec(children[i]));
    }
    // Nothing left below this node: it can go as well.
    if(node->nSubs == 0) {
        SCCLCHECK(xmlRemoveNode(node));
    }
    return scclSuccess;
}
/**************************************************/
/* Parser rules for the user-defined graph search */
/**************************************************/
// Loader for <gpu> elements of a graph file (registered by
// scclTopoXmlGraphLoadChannel). No child handler is given, so any nested
// element is parsed and discarded.
scclResult_t scclTopoXmlGraphLoadGpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
SCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
return scclSuccess;
}
// Loader for <net> elements of a graph file (registered by
// scclTopoXmlGraphLoadChannel). No child handler is given, so any nested
// element is parsed and discarded.
scclResult_t scclTopoXmlGraphLoadNet(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
SCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
return scclSuccess;
}
// Loader for <channel> elements of a graph file: <net> and <gpu> children
// are kept.
scclResult_t scclTopoXmlGraphLoadChannel(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    struct xmlHandler channelChildren[] = {
        {"net", scclTopoXmlGraphLoadNet},
        {"gpu", scclTopoXmlGraphLoadGpu},
    };
    const int nChannelChildren = (int)(sizeof(channelChildren) / sizeof(channelChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xml, head, channelChildren, nChannelChildren));
    return scclSuccess;
}
// Loader for <graph> elements of a graph file: <channel> children are kept.
scclResult_t scclTopoXmlGraphLoadGraph(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    struct xmlHandler graphChildren[] = {{"channel", scclTopoXmlGraphLoadChannel}};
    const int nGraphChildren = (int)(sizeof(graphChildren) / sizeof(graphChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xml, head, graphChildren, nGraphChildren));
    return scclSuccess;
}
/**
 * Loader for the top-level <graphs> element of a graph file.
 *
 * Checks the "version" attribute against SCCL_GRAPH_XML_VERSION, logs the
 * optional "name" attribute, then loads the <graph> children.
 *
 * @return scclSuccess, scclInvalidUsage on a version mismatch, or the error
 *         of an attribute lookup / child load.
 */
scclResult_t scclTopoXmlGraphLoadGraphs(FILE* file, struct scclXml* xmlGraph, struct scclXmlNode* head) {
    int fileVersion;
    SCCLCHECK(xmlGetAttrInt(head, "version", &fileVersion));
    if(fileVersion != SCCL_GRAPH_XML_VERSION) {
        WARN("XML Graph has wrong version %d, %d needed", fileVersion, SCCL_GRAPH_XML_VERSION);
        return scclInvalidUsage;
    }
    const char* topoName;
    SCCLCHECK(xmlGetAttr(head, "name", &topoName));
    if(topoName == NULL) {
        INFO(SCCL_LOG_TOPO, "Loading graphs");
    } else {
        INFO(SCCL_LOG_TOPO, "Loading graphs for topology %s", topoName);
    }
    struct xmlHandler graphsChildren[] = {{"graph", scclTopoXmlGraphLoadGraph}};
    const int nGraphsChildren = (int)(sizeof(graphsChildren) / sizeof(graphsChildren[0]));
    SCCLCHECK(xmlLoadSub(file, xmlGraph, head, graphsChildren, nGraphsChildren));
    return scclSuccess;
}
} // namespace xml
/**
 * Load an XML topology file into `xml`.
 *
 * A file that cannot be opened is not an error: the function returns
 * scclSuccess and leaves `xml` untouched.
 *
 * @param xmlTopoFile Path of the file to read.
 * @param xml         Document to fill; its content is reset first.
 * @param warn        When non-zero, log a warning if the file cannot be opened.
 * @return scclSuccess, or the error reported by the parser.
 */
scclResult_t scclTopoGetXmlFromFile(const char* xmlTopoFile, struct scclXml* xml, int warn) {
    FILE* file = fopen(xmlTopoFile, "r");
    if(file == NULL) {
        if(warn) {
            WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
        }
        return scclSuccess;
    }
    INFO(SCCL_LOG_TOPO, "Loading topology file %s", xmlTopoFile);
    struct xml::xmlHandler handlers[] = {{"system", xml::scclTopoXmlLoadSystem}};
    xml->maxIndex = 0;
    // Close the file before propagating a parse error, so that the handle
    // is not leaked on the failure path.
    const scclResult_t loadStatus = xml::xmlLoadSub(file, xml, NULL, handlers, 1);
    fclose(file);
    SCCLCHECK(loadStatus);
    return scclSuccess;
}
/**
 * Write the XML tree rooted at the first node of `xml` to a file.
 *
 * Failing to open or to flush the file is only reported with a warning,
 * since dumping the topology is optional.
 *
 * @param xmlTopoFile Path of the file to (over)write.
 * @param xml         Document to dump.
 * @return scclSuccess, or the error reported while dumping.
 */
scclResult_t scclTopoDumpXmlToFile(const char* xmlTopoFile, struct scclXml* xml) {
    FILE* file = fopen(xmlTopoFile, "w");
    if(file == NULL) {
        WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
        return scclSuccess;
    }
    // Close the file before propagating a dump error, so that the handle
    // is not leaked on the failure path.
    const scclResult_t dumpStatus = xml::scclTopoDumpXmlRec(0, file, xml->nodes);
    // fclose() flushes buffered data: a failure means the dump is incomplete.
    if(fclose(file) != 0) {
        WARN("Error while closing %s, the topology dump may be incomplete.", xmlTopoFile);
    }
    SCCLCHECK(dumpStatus);
    return scclSuccess;
}
/**
 * Make sure the GPU with the given PCI bus id is described in `xml`.
 *
 * Creates or completes its <pci> node (class defaults to "0x03"), attaches
 * it to the PCI/CPU hierarchy, then creates or completes the <gpu> node.
 *
 * @param xml     Document to fill.
 * @param busId   PCI bus id of the GPU.
 * @param gpuNode Receives the GPU node (may be NULL, see scclTopoGetXmlFromGpu).
 * @return scclSuccess or the first error met.
 */
scclResult_t scclTopoFillGpu(struct scclXml* xml, const char* busId, struct scclXmlNode** gpuNode) {
    struct scclXmlNode* node;
    SCCLCHECK(xml::scclTopoGetPciNode(xml, busId, &node));
    SCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03"));
    SCCLCHECK(xml::scclTopoGetXmlFromSys(node, xml));
    // (uint32_t)-1 means "unknown device index". It must be the initial
    // value: nothing else assigns devIndex when rocm_smi initialization failed.
    uint32_t devIndex = (uint32_t)-1;
    // 0: not tried yet, 1: rocm_smi usable, 2: initialization failed.
    static int rocmsmiInit = 0;
    if(rocmsmiInit == 0) {
        rocmsmiInit = (rocm_smi_init() != scclSuccess) ? 2 : 1;
    }
    if(rocmsmiInit == 1) {
        if(rocm_smi_getDeviceIndexByPciBusId(busId, &devIndex) != scclSuccess)
            devIndex = (uint32_t)-1;
    }
    SCCLCHECK(xml::scclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode));
    return scclSuccess;
}
/**
 * Make sure the network device `netName` is described in `xml`.
 *
 * When the device is backed by a PCI device, it is attached under that
 * device's <pci> node (class defaults to "0x02"); otherwise it is attached
 * under the first <cpu> node. A <nic> node is created under the parent when
 * needed, and the <net> node is added below it.
 *
 * @param xml     Document to fill.
 * @param pciPath /sys path of the device, or NULL for a virtual device.
 * @param netName Name of the network device.
 * @param netNode Receives the existing or newly created <net> node.
 * @return scclSuccess, scclInternalError when no parent can be found or the
 *         PCI device name is too long, or the first error met.
 */
scclResult_t scclTopoFillNet(struct scclXml* xml, const char* pciPath, const char* netName, struct scclXmlNode** netNode) {
    SCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
    if(*netNode != NULL)
        return scclSuccess;
    const char* pciSysPath = pciPath;
    if(pciSysPath) {
        char subSystem[PATH_MAX];
        SCCLCHECK(xml::scclTopoGetSubsystem(pciSysPath, subSystem));
        // This is not a PCI device (virtual, usb, ...).
        if(strcmp(subSystem, "pci") != 0) {
            INFO(SCCL_LOG_TOPO, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
            pciSysPath = NULL;
        }
    }
    struct scclXmlNode* parent = NULL;
    if(pciSysPath) {
        // The bus id is the last component of the /sys path.
        const char* lastSlash = strrchr(pciSysPath, '/');
        const char* devName = lastSlash ? lastSlash + 1 : pciSysPath;
        char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
        if(strlen(devName) >= sizeof(busId)) {
            WARN("Topology detection: PCI device name %s is too long to be a bus id", devName);
            return scclInternalError;
        }
        strcpy(busId, devName);
        SCCLCHECK(xml::scclTopoGetPciNode(xml, busId, &parent));
        SCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02"));
        SCCLCHECK(xml::scclTopoGetXmlFromSys(parent, xml));
    } else {
        // Virtual NIC, no PCI device, attach to first CPU
        SCCLCHECK(xmlFindTag(xml, "cpu", &parent));
    }
    // There may be no CPU node to fall back on.
    if(parent == NULL) {
        WARN("Topology detection: could not find a node to attach network %s to", netName);
        return scclInternalError;
    }
    struct scclXmlNode* nicNode = NULL;
    SCCLCHECK(xmlGetSub(parent, "nic", &nicNode));
    if(nicNode == NULL) {
        SCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode));
    }
    // We know that this net does not exist yet (we searched for it at the
    // beginning of this function), so we can add it.
    SCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode));
    SCCLCHECK(xmlSetAttr(*netNode, "name", netName));
    return scclSuccess;
}
/**
 * Load an XML graph file into `xml`.
 *
 * @param xmlGraphFile Path of the file to read.
 * @param xml          Document to fill; its content is reset first.
 * @return scclSuccess, scclSystemError when the file cannot be opened, or
 *         the error reported by the parser.
 */
scclResult_t scclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct scclXml* xml) {
    FILE* file = fopen(xmlGraphFile, "r");
    if(file == NULL) {
        WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno));
        return scclSystemError;
    }
    struct xml::xmlHandler handlers[] = {{"graphs", xml::scclTopoXmlGraphLoadGraphs}};
    xml->maxIndex = 0;
    // Close the file before propagating a parse error, so that the handle
    // is not leaked on the failure path.
    const scclResult_t loadStatus = xml::xmlLoadSub(file, xml, NULL, handlers, 1);
    fclose(file);
    SCCLCHECK(loadStatus);
    return scclSuccess;
}
// Trim the whole document, starting from its first node: branches that do
// not lead to a node marked keep="1" are removed (see scclTopoTrimXmlRec).
scclResult_t scclTopoTrimXml(struct scclXml* xml) {
SCCLCHECK(xml::scclTopoTrimXmlRec(xml->nodes));
return scclSuccess;
}
/**
 * Read the content of the file `path`/`fileName` as a string.
 *
 * At most MAX_STR_LEN - 1 characters are returned and a trailing newline is
 * removed. A missing, unreadable or empty file yields an empty string.
 *
 * @param path     Directory of the file.
 * @param fileName File name, relative to `path`.
 * @param strValue Destination buffer of at least MAX_STR_LEN bytes.
 * @return scclSuccess always.
 */
scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
    char filePath[PATH_MAX];
    const int pathLen = snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName);
    int offset = 0;
    FILE* file = NULL;
    // A truncated path would designate another file: do not even open it.
    if(pathLen >= 0 && (size_t)pathLen < sizeof(filePath))
        file = fopen(filePath, "r");
    if(file != NULL) {
        while(feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
            int len = fread(strValue + offset, 1, MAX_STR_LEN - offset, file);
            offset += len;
        }
        fclose(file);
    }
    if(offset == 0) {
        strValue[0] = '\0';
        INFO(SCCL_LOG_TOPO, "Topology detection : could not read %s, ignoring", filePath);
    } else if(offset == MAX_STR_LEN || strValue[offset - 1] == '\n') {
        // Buffer full, or content ending with a newline: the terminator
        // replaces the last character read.
        strValue[offset - 1] = '\0';
    } else {
        // No trailing newline: keep every character.
        strValue[offset] = '\0';
    }
    return scclSuccess;
}
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef XML_H_
#define XML_H_
#include <stdlib.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
///////////////////////////////////////// Basic structs /////////////////////////////////////////
// A few constraints to make the implementation easy
// Maximum length of a node name, attribute key or attribute value (the
// arrays holding them have one more byte for the terminating NUL).
#define MAX_STR_LEN 255
// Maximum number of attributes stored per node.
#define MAX_ATTR_COUNT 16
// Maximum number of children per node.
#define MAX_SUBS 32
// Maximum number of nodes in a scclXml document.
#define MAX_NODES 1024
// Kind of tag a scclXmlNode was parsed from.
typedef enum node_type {
NODE_TYPE_NONE = 0, // No tag (unused node)
NODE_TYPE_OPEN = 1, // Opening tag
NODE_TYPE_CLOSE = 2, // Closing tag
NODE_TYPE_SINGLE = 3 // Self-closed tag
} node_type_t;
// scclXmlNode represents one XML node (element).
struct scclXmlNode {
char name[MAX_STR_LEN + 1]; // Node name
struct {
char key[MAX_STR_LEN + 1]; // Attribute key
char value[MAX_STR_LEN + 1]; // Attribute value
} attrs[MAX_ATTR_COUNT + 1]; // One extra slot is needed to consume surplus attributes
int nAttrs; // Number of attributes
int type; // Node type
struct scclXmlNode* parent; // Pointer to the parent node
struct scclXmlNode* subs[MAX_SUBS]; // Pointers to the child nodes
int nSubs; // Number of child nodes
};
// scclXml represents a whole XML document.
struct scclXml {
struct scclXmlNode nodes[MAX_NODES]; // Node array, each node is one element of the XML document
int maxIndex; // Number of nodes in use, i.e. index of the next free slot in nodes[]
};
// One entry of a string <-> integer conversion dictionary. A dictionary is an
// array of entries terminated by an entry whose str is NULL.
struct kvDict {
const char* str; // String form, NULL in the terminating entry
int value; // Integer form
};
// Gives access to a hipDeviceArch_t as a plain int, so that it can be stored
// in and read back from an integer XML attribute.
typedef union {
hipDeviceArch_t arch;
int value;
static_assert(sizeof(hipDeviceArch_t) == sizeof(int), "value must be the same size of hipDeviceArch_t.");
} scclHipDeviceArch_t;
///////////////////////////////////////// File functions /////////////////////////////////////////
// Versions of the XML formats that the loaders accept.
#define SCCL_TOPO_XML_VERSION 2
#define SCCL_GRAPH_XML_VERSION 1
// Load an XML topology from a file
scclResult_t scclTopoGetXmlFromFile(const char* xmlTopoFile, struct scclXml* xml, int warn);
// Save an XML topology to a file
scclResult_t scclTopoDumpXmlToFile(const char* xmlTopoFile, struct scclXml* xml);
// Load an XML graph description from a file
scclResult_t scclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct scclXml* xml);
/* Auto-detection functions */
// Fill the XML structure with the GPU identified by its bus ID
scclResult_t scclTopoFillGpu(struct scclXml* xml, const char* busId, struct scclXmlNode** gpuNode);
// Fill the XML structure with a network device, from its PCI path and its name
scclResult_t scclTopoFillNet(struct scclXml* xml, const char* pciPath, const char* netName, struct scclXmlNode** netNode);
/* Remove unneeded parts */
// Trim the XML structure, removing the parts that are not needed
scclResult_t scclTopoTrimXml(struct scclXml* xml);
// Read a string value from a file under a system path
scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue);
/**************/
/* XML Struct */
/* Functions */
/**************/
// Get the index of an attribute of an XML node
scclResult_t xmlGetAttrIndex(struct scclXmlNode* node, const char* attrName, int* index);
// Get the value of an attribute of an XML node, as a string
scclResult_t xmlGetAttr(struct scclXmlNode* node, const char* attrName, const char** value);
// Get the value of an attribute of an XML node, as a string (similar to xmlGetAttr)
scclResult_t xmlGetAttrStr(struct scclXmlNode* node, const char* attrName, const char** value);
// Get the value of an attribute of an XML node, as an integer
scclResult_t xmlGetAttrInt(struct scclXmlNode* node, const char* attrName, int* value);
// Get the value of an attribute of an XML node, as an integer; return the default value if the attribute does not exist
scclResult_t xmlGetAttrIntDefault(struct scclXmlNode* node, const char* attrName, int* value, int defaultValue);
// Initialize an integer attribute of an XML node
scclResult_t xmlInitAttrInt(struct scclXmlNode* node, const char* attrName, const int value);
// Initialize an unsigned 64-bit integer attribute of an XML node
scclResult_t xmlInitAttrUint64(struct scclXmlNode* node, const char* attrName, const uint64_t value);
// Get the value of an attribute of an XML node, as a float
scclResult_t xmlGetAttrFloat(struct scclXmlNode* node, const char* attrName, float* value);
// Initialize a float attribute of an XML node
scclResult_t xmlInitAttrFloat(struct scclXmlNode* node, const char* attrName, const float value);
// Find the node with the given tag name in the XML document
scclResult_t xmlFindTag(struct scclXml* xml, const char* tagName, struct scclXmlNode** node);
// Find the node with the given tag name and attribute value in the XML document
scclResult_t xmlFindTagKv(struct scclXml* xml, const char* tagName, struct scclXmlNode** node, const char* attrName, const char* attrValue);
// Set the value of an attribute of an XML node
scclResult_t xmlSetAttr(struct scclXmlNode* node, const char* attrName, const char* value);
// Set the value of an attribute of an XML node, only if it is not set yet
scclResult_t xmlSetAttrIfUnset(struct scclXmlNode* node, const char* attrName, const char* value);
// Set the value of an attribute of an XML node to an integer
scclResult_t xmlSetAttrInt(struct scclXmlNode* node, const char* attrName, const int value);
// Set the value of an attribute of an XML node to a float
scclResult_t xmlSetAttrFloat(struct scclXmlNode* node, const char* attrName, const float value);
// Remove an attribute from an XML node
scclResult_t xmlUnsetAttr(struct scclXmlNode* node, const char* attrName);
// Get a child of an XML node
scclResult_t xmlGetSub(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub);
// Get a child of an XML node; the child must match the given attribute value
scclResult_t xmlGetSubKv(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const char* attrValue);
// Get a child of an XML node; the child must match the given integer attribute value
scclResult_t xmlGetSubKvInt(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const int attrValue);
// Add a new node to the XML document
scclResult_t xmlAddNode(struct scclXml* xml, struct scclXmlNode* parent, const char* subName, struct scclXmlNode** sub);
// Remove a node from the XML document
scclResult_t xmlRemoveNode(struct scclXmlNode* node);
// String <-> integer conversion dictionaries; the str of the last element must be NULL
// Convert a string to an integer
scclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict);
// Convert an integer to a string
scclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict);
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#pragma once
#include <string.h>
#include "base.h"
#include "archinfo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
///////////
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <unistd.h>
#include <sys/types.h>
#include <string.h>
#include "bootstrap_net.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
namespace bootstrap_net {
/* Init functions */
// Name of the network interface selected by bootstrapNetInit.
static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1];
// Address of the network interface selected by bootstrapNetInit.
static scclSocketAddress_t bootstrapNetIfAddr;
// Set to 1 once bootstrapNetInit has completed successfully.
static int bootstrapNetInitDone = 0;
// Taken by bootstrapNetInit while it performs the initialization.
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
/**
 * @brief Initialize the bootstrap network (performed once; later calls are no-ops).
 *
 * When the environment variable "SCCL_COMM_ID" is set, it is parsed as the remote
 * address and the local interface is selected with scclFindInterfaceMatchSubnet.
 * Otherwise the interface is selected with scclFindSocketInterfaces. The selected
 * name/address are stored in bootstrapNetIfName / bootstrapNetIfAddr.
 *
 * The initialization sequence runs under bootstrapNetLock. The lock is released on
 * every path, including the error paths, so a failed attempt does not block
 * subsequent callers; since bootstrapNetInitDone stays 0 after a failure, the next
 * call retries the initialization.
 *
 * @return scclResult_t
 *   - scclSuccess:         initialization succeeded (or was already done)
 *   - scclInvalidArgument: SCCL_COMM_ID could not be parsed
 *   - scclSystemError:     no interface matched the SCCL_COMM_ID address
 *   - scclInternalError:   no socket interface was found
 */
scclResult_t bootstrapNetInit() {
    // Fast path: a previous call already completed the initialization.
    if(bootstrapNetInitDone != 0) {
        return scclSuccess;
    }

    scclResult_t ret = scclSuccess;
    pthread_mutex_lock(&bootstrapNetLock);
    // Re-check under the lock: another thread may have finished in the meantime.
    if(bootstrapNetInitDone == 0) {
        char* env = getenv("SCCL_COMM_ID");
        if(env) {
            scclSocketAddress_t remoteAddr;
            if(net::host::scclSocketGetAddrFromString(&remoteAddr, env) != scclSuccess) {
                WARN("Invalid SCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
                ret = scclInvalidArgument;
            } else if(net::host::scclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
                WARN("NET/Socket : No usable listening interface found");
                ret = scclSystemError;
            }
        } else {
            int nIfs = net::host::scclFindSocketInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
            if(nIfs <= 0) {
                WARN("Bootstrap : no socket interface found");
                ret = scclInternalError;
            }
        }

        if(ret == scclSuccess) {
            char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
            // Bounded formatting: never write past the end of `line`.
            snprintf(line, sizeof(line), " %s:", bootstrapNetIfName);
            net::host::scclSocketToString(&bootstrapNetIfAddr, line + strlen(line));
            INFO(SCCL_LOG_BOOTSTRAP, "Bootstrap : Using%s", line);
            bootstrapNetInitDone = 1;
            // Diagnostic output kept from the original implementation.
            printf("line=%s\n", line);
        }
    }
    // Single unlock point shared by the success and failure paths.
    pthread_mutex_unlock(&bootstrapNetLock);
    return ret;
}
// Additional sync functions
/**
 * Send one length-prefixed message over a socket.
 *
 * Wire format: the value of `size` as a raw int, followed by `size` bytes of payload.
 *
 * @param sock socket to send on
 * @param data pointer to the payload
 * @param size payload length in bytes
 * @return scclSuccess when both sends succeed; otherwise SCCLCHECK propagates the
 *         failing call's result
 */
scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size) {
    // Header first: tells the receiver how many payload bytes follow.
    SCCLCHECK(net::host::scclSocketSend(sock, &size, sizeof(size)));
    // Then the payload itself.
    SCCLCHECK(net::host::scclSocketSend(sock, data, size));
    return scclSuccess;
}
/**
 * Receive one length-prefixed message from a socket.
 *
 * Reads the int length prefix first, validates it against the caller's buffer,
 * and only then reads the payload.
 *
 * @param sock socket to receive from
 * @param data destination buffer
 * @param size capacity of the destination buffer in bytes
 * @return scclSuccess on success. scclInternalError when the announced length is
 *         negative or larger than `size`; in that case the payload is NOT read
 *         from the socket. Other failures are propagated by SCCLCHECK.
 */
scclResult_t bootstrapNetRecv(scclSocket_t* sock, void* data, int size) {
    int recvSize;
    SCCLCHECK(net::host::scclSocketRecv(sock, &recvSize, sizeof(int)));
    // The length prefix comes from the peer: never forward a negative byte count
    // to the socket layer.
    if(recvSize < 0) {
        WARN("Invalid message size : received negative length %d", recvSize);
        return scclInternalError;
    }
    if(recvSize > size) {
        WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
        return scclInternalError;
    }
    // Here 0 <= recvSize <= size, so recvSize is already the number of bytes to read.
    SCCLCHECK(net::host::scclSocketRecv(sock, data, recvSize));
    return scclSuccess;
}
} // namespace bootstrap_net
/**
 * Append a connection to the tail of the unexpected-connection list.
 *
 * A new list entry is allocated with scclCalloc and receives a byte-wise copy of
 * `*sock` together with the peer and tag it belongs to.
 *
 * @param state bootstrap state owning the list
 * @param peer  rank of the peer that opened the connection
 * @param tag   tag announced on the connection
 * @param sock  socket to copy into the new entry
 * @return scclSuccess, or the error propagated from scclCalloc by SCCLCHECK
 */
scclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock) {
    // Build the new entry; its `next` field is left as scclCalloc produced it
    // (presumably zeroed).
    struct unexConn* entry;
    SCCLCHECK(scclCalloc(&entry, 1));
    memcpy(&entry->sock, sock, sizeof(scclSocket_t));
    entry->tag  = tag;
    entry->peer = peer;

    // Walk the chain of `next` links until the empty slot at the tail is reached.
    struct unexConn** link = &state->unexpectedConnections;
    while(*link != NULL) {
        link = &(*link)->next;
    }
    *link = entry;
    return scclSuccess;
}
/**
 * Look up and remove the first queued connection matching (peer, tag).
 *
 * @param state bootstrap state owning the unexpected-connection list
 * @param peer  peer rank to match
 * @param tag   tag to match
 * @param sock  output: receives a byte-wise copy of the stored socket when a match is found
 * @param found output: set to 1 when a match was found and removed, 0 otherwise
 * @return always scclSuccess
 *
 * @note The matching entry is unlinked from the list and freed; `*sock` is left
 *       untouched when nothing matches.
 */
scclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock, int* found) {
    *found = 0;
    // `link` always points at the pointer that references the current entry, so
    // unlinking is the same operation for the head and for interior entries.
    for(struct unexConn** link = &state->unexpectedConnections; *link != NULL; link = &(*link)->next) {
        struct unexConn* cur = *link;
        if(cur->peer != peer || cur->tag != tag) {
            continue;
        }
        *link = cur->next;
        memcpy(sock, &cur->sock, sizeof(scclSocket_t));
        free(cur);
        *found = 1;
        return scclSuccess;
    }
    return scclSuccess;
}
/**
 * Free every entry of the unexpected-connection list.
 *
 * After the call the list head in `state` is reset to NULL, so the state no longer
 * references freed memory and the function can safely be called again.
 *
 * @param state bootstrap state owning the list
 *
 * @note Only the list entries are freed; the sockets stored in them are not closed here.
 */
static void unexpectedFree(struct bootstrapState* state) {
    struct unexConn* elem = state->unexpectedConnections;
    while(elem) {
        struct unexConn* next = elem->next; // read the link before the entry is released
        free(elem);
        elem = next;
    }
    // Do not leave a dangling head pointer behind.
    state->unexpectedConnections = NULL;
}
/**
 * Ring-based AllGather over the bootstrap sockets.
 *
 * `allData` is treated as nranks consecutive slices of `size` bytes. The loop runs
 * nranks-1 steps; in step i the slice with index (rank - i) mod nranks is sent on
 * ringSendSocket and the slice with index (rank - i - 1) mod nranks is received
 * from ringRecvSocket.
 *
 * @param commState bootstrap state (struct bootstrapState*)
 * @param allData   buffer holding all slices
 * @param size      size of one slice in bytes
 * @return scclSuccess, or the first error propagated by SCCLCHECK
 */
scclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
    struct bootstrapState* state = (struct bootstrapState*)commState;
    char* const base             = (char*)allData;
    const int rank               = state->rank;
    const int nranks             = state->nranks;

    INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d size %d", rank, nranks, size);

    int step = 0;
    while(step < nranks - 1) {
        // Adding nranks before the modulo keeps the slice index non-negative.
        const size_t sendIdx = (rank - step + nranks) % nranks;
        const size_t recvIdx = (rank - step - 1 + nranks) % nranks;
        char* sendPtr        = base + sendIdx * size;
        char* recvPtr        = base + recvIdx * size;

        // Forward the slice obtained in the previous step (our own slice in step 0)...
        SCCLCHECK(bootstrap_net::bootstrapNetSend(&state->ringSendSocket, sendPtr, size));
        // ...and receive the next slice from the other side of the ring.
        SCCLCHECK(bootstrap_net::bootstrapNetRecv(&state->ringRecvSocket, recvPtr, size));
        ++step;
    }

    INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d size %d - DONE", rank, nranks, size);
    return scclSuccess;
}
/**
 * Send a tagged message to a peer over a freshly opened socket.
 *
 * A socket is initialized towards state->peerCommAddresses[peer] and connected;
 * three length-prefixed messages are then sent in order: the local rank, the tag,
 * and the payload. The socket is closed before returning, on success and on failure.
 *
 * @param commState bootstrap state (struct bootstrapState*)
 * @param peer      index of the destination peer
 * @param tag       message tag
 * @param data      payload pointer
 * @param size      payload size in bytes
 * @return result of the first failing step, scclSuccess when all steps succeed;
 *         a failure of the final close is propagated by SCCLCHECK
 */
scclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
    struct bootstrapState* state = (struct bootstrapState*)commState;
    scclResult_t ret             = scclSuccess;
    scclSocket_t sock;

    SCCLCHECKGOTO(net::host::scclSocketInit(&sock, state->peerCommAddresses + peer, state->magic, net::host::scclSocketTypeBootstrap), ret, done);
    SCCLCHECKGOTO(net::host::scclSocketConnect(&sock), ret, done);
    // Identify ourselves and the tag so the receiver can match this connection.
    SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, done);
    SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, &tag, sizeof(int)), ret, done);
    SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, data, size), ret, done);

done:
    // Shared epilogue for the success and failure paths.
    SCCLCHECK(net::host::scclSocketClose(&sock));
    return ret;
}
/**
 * @brief Receive a tagged message from a given peer.
 *
 * The unexpected-connection list is searched first; when an entry for (peer, tag)
 * exists, the payload is read from its socket. Otherwise connections are accepted
 * on state->listenSock one at a time: each announces a peer and a tag, and the
 * payload is read when both match. A non-matching connection is stored in the
 * unexpected-connection list and the loop accepts the next one.
 *
 * @param commState bootstrap state (struct bootstrapState*)
 * @param peer      peer rank to receive from
 * @param tag       expected message tag
 * @param data      destination buffer
 * @param size      capacity of the destination buffer in bytes
 * @return result of the first failing step, scclSuccess when the message was
 *         received; a failure of the final close is propagated by SCCLCHECK
 */
scclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
    struct bootstrapState* state = (struct bootstrapState*)commState;
    scclResult_t ret             = scclSuccess;
    scclSocket_t sock;

    // 1) A matching connection may already have been queued by an earlier call.
    int found;
    SCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found));
    if(found) {
        SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, ((char*)data), size), ret, done);
        goto done;
    }

    // 2) Otherwise accept connections until the expected one shows up.
    for(;;) {
        int newPeer, newTag;
        SCCLCHECKGOTO(net::host::scclSocketInit(&sock), ret, done);
        SCCLCHECKGOTO(net::host::scclSocketAccept(&sock, &state->listenSock), ret, done);
        SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, done);
        SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, done);
        if(newPeer == peer && newTag == tag) {
            SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, ((char*)data), size), ret, done);
            goto done;
        }
        // Not the connection we are waiting for: keep it for a later receive.
        SCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, done);
    }

done:
    // Shared epilogue for the success and failure paths.
    SCCLCHECK(net::host::scclSocketClose(&sock));
    return ret;
}
/**
 * Placeholder for bootstrap initialization: performs no work.
 *
 * Returns scclSuccess explicitly; flowing off the end of a function with a
 * non-void return type is undefined behaviour in C++.
 *
 * NOTE(review): this zero-argument function does not match a two-argument
 * bootstrapInit(unique_id, comm) signature; confirm which one callers are meant to use.
 */
scclResult_t bootstrapInit() {
    // TODO: implement the real initialization.
    return scclSuccess;
}
// /**
// * @brief 初始化bootstrap网络通信
// *
// * 该函数负责初始化bootstrap网络通信环境,包括:
// * 1. 创建监听socket供其他rank连接
// * 2. 与root节点交换连接信息
// * 3. 建立环形通信拓扑
// * 4. 收集所有peer的通信地址
// * 5. 创建并收集代理服务地址
// *
// * @param handle bootstrap句柄
// * @param comm bootstrap通信上下文
// * @return scclResult_t 返回操作结果,scclSuccess表示成功
// */
// scclResult_t bootstrapInit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm) {
// int rank = comm->rank; // 当前进程的排名
// int nranks = comm->nRanks; // 进程的总数
// struct bootstrapState* state; // 引导状态结构体
// scclSocket_t* proxySocket; // 代理套接字
// scclSocketAddress_t nextAddr; // 下一个地址
// scclSocket_t sock, listenSockRoot; // 套接字和根监听套接字
// struct extInfo info = {0}; // 扩展信息结构体
// SCCLCHECK(scclCalloc(&state, 1)); // 分配引导状态结构体
// state->rank = rank; // 设置当前进程的排名
// state->nranks = nranks; // 设置进程的总数
// state->abortFlag = comm->abortFlag; // 设置中止标志
// comm->bootstrap = state; // 将引导状态结构体赋值给通信结构体
// comm->magic = state->magic = handle->magic; // 设置魔术值
// INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d", rank, nranks); // 打印日志信息
// info.rank = rank; // 设置扩展信息结构体中的排名
// info.nranks = nranks; // 设置扩展信息结构体中的进程总数
// // 创建套接字供其他进程联系
// SCCLCHECK(
// net::host::scclSocketInit(&state->listenSock, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketListen(&state->listenSock)); // 监听套接字
// SCCLCHECK(net::host::scclSocketGetAddr(&state->listenSock, &info.extAddressListen)); // 获取监听套接字地址
// // 创建套接字供根进程联系
// SCCLCHECK(net::host::scclSocketInit(&listenSockRoot, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap,
// comm->abortFlag)); SCCLCHECK(net::host::scclSocketListen(&listenSockRoot)); // 监听根进程套接字
// SCCLCHECK(net::host::scclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); // 获取根进程监听套接字地址
// // // 分散连接时间以避免根进程过载
// // if(nranks > 128) {
// // long msec = rank;
// // struct timespec tv;
// // tv.tv_sec = msec / 1000;
// // tv.tv_nsec = 1000000 * (msec % 1000);
// // TRACE(SCCL_LOG_BOOTSTRAP, "rank %d delaying connection to root by %ld msec", rank, msec);
// // (void)nanosleep(&tv, NULL);
// // }
// // 向根进程发送我的监听套接字信息
// SCCLCHECK(net::host::scclSocketInit(&sock, &handle->addr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketConnect(&sock)); // 连接套接字
// SCCLCHECK(bootstrap_net::bootstrapNetSend(&sock, &info, sizeof(info))); // 发送扩展信息
// SCCLCHECK(net::host::scclSocketClose(&sock)); // 关闭套接字
// // 从根进程获取我在引导环中的“下一个”进程的信息
// SCCLCHECK(net::host::scclSocketInit(&sock)); // 初始化套接字
// SCCLCHECK(net::host::scclSocketAccept(&sock, &listenSockRoot)); // 接受根进程的连接
// SCCLCHECK(bootstrap_net::bootstrapNetRecv(&sock, &nextAddr, sizeof(scclSocketAddress_t))); // 接收下一个地址
// SCCLCHECK(net::host::scclSocketClose(&sock)); // 关闭套接字
// SCCLCHECK(net::host::scclSocketClose(&listenSockRoot)); // 关闭根监听套接字
// SCCLCHECK(net::host::scclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketConnect(&state->ringSendSocket)); // 连接环发送套接字
// // 接受引导环中前一个进程的连接请求
// SCCLCHECK(net::host::scclSocketInit(&state->ringRecvSocket)); // 初始化环接收套接字
// SCCLCHECK(net::host::scclSocketAccept(&state->ringRecvSocket, &state->listenSock)); // 接受连接
// // 全部收集所有监听处理器
// SCCLCHECK(scclCalloc(&state->peerCommAddresses, nranks)); // 分配对等通信地址
// SCCLCHECK(net::host::scclSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank)); // 获取监听套接字地址
// SCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(scclSocketAddress_t))); // 全部收集地址
// // 创建服务代理
// SCCLCHECK(scclCalloc(&state->peerProxyAddresses, nranks)); // 分配对等代理地址
// // 代理通过消息中止;不要设置中止标志
// SCCLCHECK(scclCalloc(&proxySocket, 1)); // 分配代理套接字
// SCCLCHECK(net::host::scclSocketInit(proxySocket, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeProxy, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketListen(proxySocket)); // 监听代理套接字
// SCCLCHECK(net::host::scclSocketGetAddr(proxySocket, state->peerProxyAddresses + rank)); // 获取代理套接字地址
// SCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(scclSocketAddress_t))); // 全部收集代理地址
// // SCCLCHECK(scclProxyInit(comm, proxySocket, state->peerProxyAddresses));
// INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks); // 打印完成日志信息
// return scclSuccess; // 返回成功
// }
// /**
// * @brief 在bootstrap通信中创建新的子通信域
// *
// * 该函数用于将当前通信域按照指定颜色和键值拆分为子通信域,并建立相应的环状通信拓扑。
// *
// * @param handle bootstrap句柄
// * @param comm 新创建的子通信域
// * @param parent 父通信域
// * @param color 用于划分通信域的颜色值
// * @param key 用于确定新通信域中进程排名的键值
// * @param parentRanks 父通信域中的进程排名映射
// *
// * @return scclResult_t 返回操作结果,成功返回scclSuccess
// *
// * @note 函数会建立环状通信拓扑,包括:
// * 1. 初始化监听socket和环形接收socket
// * 2. 与前后节点交换地址信息
// * 3. 执行AllGather收集所有节点的通信地址
// * 4. 根据配置决定是否共享代理状态或创建新的代理服务
// */
// scclResult_t
// bootstrapSplit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm, struct scclBootstrapComm* parent, int color, int key, int* parentRanks) {
// scclResult_t ret = scclSuccess;
// int rank = comm->rank;
// int nranks = comm->nRanks;
// int prev, next;
// scclSocketAddress_t listenAddr, tmpAddr;
// scclSocket_t* proxySocket;
// struct bootstrapState* state;
// // SCCLCHECKGOTO(scclCalloc(&state, 1), ret, fail);
// // state->rank = rank;
// // state->nranks = nranks;
// // state->abortFlag = comm->abortFlag;
// // comm->bootstrap = state;
// // comm->magic = state->magic = handle->magic;
// // prev = parentRanks[(rank - 1 + nranks) % nranks];
// // next = parentRanks[(rank + 1) % nranks];
// // // Setup my sockets for the allgather ring and other p2p connections
// // SCCLCHECKGOTO(
// // net::host::scclSocketInit(&state->listenSock, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap,
// comm->abortFlag,
// // 0), ret, fail);
// // SCCLCHECKGOTO(net::host::scclSocketInit(&state->ringRecvSocket, NULL, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag, 0), ret,
// fail);
// // // Create socket for other ranks to contact me
// // SCCLCHECKGOTO(net::host::scclSocketListen(&state->listenSock), ret, fail);
// // // Get addr from next rank
// // SCCLCHECKGOTO(net::host::scclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail);
// // SCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(scclSocketAddress_t)), ret, fail);
// // SCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(scclSocketAddress_t)), ret, fail);
// // SCCLCHECKGOTO(net::host::scclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag, 0), ret,
// // fail); SCCLCHECKGOTO(net::host::scclSocketConnect(&state->ringSendSocket), ret, fail);
// // // Accept the connect request from the previous rank in the AllGather ring
// // SCCLCHECKGOTO(net::host::scclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail);
// // // AllGather all listen handlers
// // SCCLCHECKGOTO(scclCalloc(&state->peerCommAddresses, nranks), ret, fail);
// // memcpy(state->peerCommAddresses + rank, &listenAddr, sizeof(scclSocketAddress_t));
// // SCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(scclSocketAddress_t)), ret, fail);
// // if(parent->splitShare) {
// // /* map local rank to top parent local rank. */
// // for(int i = 0; i < nranks; ++i) {
// // comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
// // }
// // comm->proxyState = parent->sharedRes->proxyState;
// // scclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
// // } else {
// // // Create the service proxy
// // SCCLCHECKGOTO(scclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
// // SCCLCHECKGOTO(scclCalloc(&proxySocket, 1), ret, fail);
// // SCCLCHECKGOTO(
// // net::host::scclSocketInit(proxySocket, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeProxy, comm->abortFlag, 0),
// // ret,
// // fail);
// // SCCLCHECKGOTO(net::host::scclSocketListen(proxySocket), ret, fail);
// // SCCLCHECKGOTO(net::host::scclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
// // memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(scclSocketAddress_t));
// // SCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(scclSocketAddress_t)), ret, fail);
// // // SCCLCHECKGOTO(scclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
// // }
// // INFO(sccl_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
// exit:
// return ret;
// fail:
// goto exit;
// }
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <string.h>
#include "base.h"
#include "socket.h"
#include "bootstrap_utils.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Atomically add 1 to the reference counter at `refs`, using relaxed memory ordering.
// The updated value is not returned to the caller.
template <typename Int>
inline void scclAtomicRefCountIncrement(Int* refs) {
    (void)__atomic_add_fetch(refs, 1, __ATOMIC_RELAXED);
}
////////////////////////////////////////////////////////////////////////////////////////////////
namespace bootstrap_net {
// Send data over a socket
scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size);
// Receive data from a socket
scclResult_t bootstrapNetRecv(scclSocket_t* sock, void* data, int size);
// Initialize the bootstrap network
scclResult_t bootstrapNetInit();
} // namespace bootstrap_net
// Append an entry (peer, tag, socket) to the unexpected queue
scclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock);
// Remove the entry matching (peer, tag) from the unexpected queue; *found reports whether one existed
scclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock, int* found);
// Release the resources held by the unexpected queue
// NOTE(review): declared `static` in a header, so every including translation unit gets its own
// internal-linkage declaration — confirm this is intended.
static void unexpectedFree(struct bootstrapState* state);
// AllGather across all ranks
scclResult_t bootstrapAllGather(void* commState, void* allData, int size);
// Send data to the given peer
scclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
// Receive data from the given peer
scclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
////////////////////
// NOTE(review): the bootstrapInit definition in the implementation file takes no parameters;
// confirm which signature is intended.
scclResult_t bootstrapInit(struct scclUniqueId* unique_id, struct scclBootstrapComm* comm);
// scclResult_t bootstrapInit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm);
// scclResult_t
// bootstrapSplit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm, struct scclBootstrapComm* parent, int color, int key, int* parentRanks);
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <string.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
//
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment