// Commit a4ac3320 authored by lishen
// Implement ipcsocket via a thread pool to support intra-node communication
// parent d9d23f34
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_NVMLWRAP_H_
#define SCCL_NVMLWRAP_H_
#include "check.h"
namespace sccl {
namespace hardware {
namespace topology {
// #define SCCL_NVML_DIRECT 1
#ifndef SCCL_NVML_DIRECT
#define SCCL_NVML_DIRECT 0
#endif
#if SCCL_NVML_DIRECT
#include "nvml.h"
#else
// Dynamically handle dependencies on NVML
/* Extracted from nvml.h */
typedef struct nvmlDevice_st* nvmlDevice_t;
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
typedef enum nvmlEnableState_enum {
NVML_FEATURE_DISABLED = 0, //!< Feature disabled
NVML_FEATURE_ENABLED = 1 //!< Feature enabled
} nvmlEnableState_t;
typedef enum nvmlNvLinkCapability_enum {
NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
NVML_NVLINK_CAP_SYSMEM_ATOMICS = 3, // System memory atomics are supported
NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
// should be last
NVML_NVLINK_CAP_COUNT
} nvmlNvLinkCapability_t;
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0, //!< The operation was successful
NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
} nvmlReturn_t;
/* PCI location/identity of a GPU device (mirrors nvmlPciInfo_t from nvml.h). */
typedef struct nvmlPciInfo_st {
    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
    unsigned int domain;                            //!< The PCI domain on which the device's bus resides, 0 to 0xffff
    unsigned int bus;                               //!< The bus on which the device resides, 0 to 0xff
    unsigned int device;                            //!< The device's id on the bus, 0 to 31
    unsigned int pciDeviceId;                       //!< The combined 16-bit device id and 16-bit vendor id
    // Added in NVML 2.285 API
    unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
    // NVIDIA reserved for internal use only
    unsigned int reserved0;
    unsigned int reserved1;
    unsigned int reserved2;
    unsigned int reserved3;
} nvmlPciInfo_t;
/* P2P Capability Index Status*/
typedef enum nvmlGpuP2PStatus_enum {
NVML_P2P_STATUS_OK = 0,
NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
NVML_P2P_STATUS_DISABLED_BY_REGKEY,
NVML_P2P_STATUS_NOT_SUPPORTED,
NVML_P2P_STATUS_UNKNOWN
} nvmlGpuP2PStatus_t;
/* P2P Capability Index*/
typedef enum nvmlGpuP2PCapsIndex_enum {
NVML_P2P_CAPS_INDEX_READ = 0,
NVML_P2P_CAPS_INDEX_WRITE,
NVML_P2P_CAPS_INDEX_NVLINK,
NVML_P2P_CAPS_INDEX_ATOMICS,
NVML_P2P_CAPS_INDEX_PROP,
NVML_P2P_CAPS_INDEX_UNKNOWN
} nvmlGpuP2PCapsIndex_t;
/**
* Represents the type for sample value returned
*/
typedef enum nvmlValueType_enum {
NVML_VALUE_TYPE_DOUBLE = 0,
NVML_VALUE_TYPE_UNSIGNED_INT = 1,
NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
// Keep this last
NVML_VALUE_TYPE_COUNT
} nvmlValueType_t;
/**
* Union to represent different types of Value
*/
typedef union nvmlValue_st {
double dVal; //!< If the value is double
unsigned int uiVal; //!< If the value is unsigned int
unsigned long ulVal; //!< If the value is unsigned long
unsigned long long ullVal; //!< If the value is unsigned long long
signed long long sllVal; //!< If the value is signed long long
} nvmlValue_t;
/**
* Field Identifiers.
*
* All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
*/
/* NVLink Speed */
#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links
#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device
/**
* Remote device NVLink ID
*
* Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
*/
#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID
/**
* NVSwitch: connected NVLink count
*/
#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch
#define NVML_FI_DEV_NVLINK_GET_SPEED 164
#define NVML_FI_DEV_NVLINK_GET_STATE 165
#define NVML_FI_DEV_NVLINK_GET_VERSION 166
#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
/**
* Information for a Field Value Sample
*/
typedef struct nvmlFieldValue_st {
unsigned int
fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId
//!< can represent linkId.
long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970
long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by
//!< the same driver call.
nvmlValueType_t valueType; //!< Type of the value stored in value
nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn !=
//!< NVML_SUCCESS
nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
} nvmlFieldValue_t;
/* End of nvml.h */
#endif // SCCL_NVML_DIRECT
// Maximum number of GPU devices tracked by the cached NVML tables below.
constexpr int scclNvmlMaxDevices = 32;
// Per-device data cached from NVML (filled by scclNvmlEnsureInitialized()).
struct scclNvmlDeviceInfo {
    nvmlDevice_t handle;
    int computeCapabilityMajor, computeCapabilityMinor;
};
// Cached pairwise P2P status between two devices.
struct scclNvmlDevicePairInfo {
    nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite;
};
extern int scclNvmlDeviceCount;
extern scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
extern scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];
// All scclNvmlFoo() functions call scclNvmlEnsureInitialized() implicitly.
// Outsiders need only call it if they want to inspect the scclNvml global
// tables above.
scclResult_t scclNvmlEnsureInitialized();
// Thin wrappers over the corresponding nvmlDeviceGet* NVML entry points.
scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device);
scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive);
scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci);
scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult);
scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values);
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif // End include guard
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <string.h>
#include <unistd.h>
#include <algorithm>
#include "topo.h"
#include "utils.h"
#include "cpuset.h"
#include "nvmlwrap.h"
// #include "net.h"
// #include "graph.h"
// #include "comm.h"
// #include "net.h"
// #include "coll_net.h"
// #include "cpuset.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
// Human-readable names for topology node/link/path types; indices match the
// corresponding enums (empty strings mark unused enum slots).
const char* topoNodeTypeStr[] = {"GPU", "PCI", "NVS", "CPU", "NIC", "NET"};
const char* topoLinkTypeStr[] = {"LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET"};
const char* topoPathTypeStr[] = {"LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS"};
namespace topo_basic {
// Map sysfs PCI class codes to topology node types.
struct kvDict kvDictPciClass[] = {{"0x060400", PCI},
                                  {"0x068000", NVS},
                                  {"0x068001", CPU},
                                  {"0x03", GPU},
                                  {"0x02", NIC},
                                  {"0x120000", GPU},
                                  {"0x0b4000", GPU},
                                  {NULL, PCI /* Default fallback value */}};
// Map sysfs PCIe link-speed strings to per-lane bandwidth in units of 100 Mbps.
struct kvDict kvDictPciGen[] = {{"2.5 GT/s", 15},
                                {"5 GT/s", 30},
                                {"8 GT/s", 60},
                                {"16 GT/s", 120},
                                {"32 GT/s", 240}, /* Kernel 5.6 and earlier */
                                {"2.5 GT/s PCIe", 15},
                                {"5.0 GT/s PCIe", 30},
                                {"8.0 GT/s PCIe", 60},
                                {"16.0 GT/s PCIe", 120},
                                {"32.0 GT/s PCIe", 240},
                                {"64.0 GT/s PCIe", 480},
                                {NULL, 60 /* Default fallback */}}; // x100 Mbps per lane
// Parameter TopoDumpFileRank: rank that dumps the topology file (default 0).
SCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
// Parameter IgnoreCpuAffinity: when non-zero, ignore the process CPU affinity
// mask and use the GPU-local one instead (default 0 = honor affinity).
SCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
//////////////////////////////////////////////////////////////////////////////////////////////
// Create a NET node from a <net> XML element and attach it to 'nic' with a
// bidirectional NET link. 'busId' is the bus id of the owning NIC.
scclResult_t scclTopoAddNet(struct scclXmlNode* xmlNet, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
    int dev;
    SCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
    struct scclTopoNode* net;
    SCCLCHECK(scclTopoCreateNode(system, &net, NET, dev));
    const char* str;
    SCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
    if(str)
        sscanf(str, "0x%lx", &net->net.asic);
    else
        net->net.asic = dev; // No GUID attribute: fall back to the device index.
    int mbps;
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
    if(mbps <= 0)
        mbps = 10000; // Some NICs define speed = -1
    net->net.bw = mbps / 8000.0; // Mbps -> GB/s
    if(xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != scclSuccess)
        net->net.latency = 0; // Latency attribute is optional.
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
    // SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
    SCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0));
    net->net.busId = busId;
    // Connect NIC <-> NET in both directions at the NET bandwidth.
    SCCLCHECK(scclTopoConnectNodes(nic, net, LINK_NET, net->net.bw));
    SCCLCHECK(scclTopoConnectNodes(net, nic, LINK_NET, net->net.bw));
    return scclSuccess;
}
// Register every <net> child of this NIC's XML node as a NET topology node
// attached to 'nic'. Entries without a "dev" attribute are skipped.
scclResult_t scclTopoAddNic(struct scclXmlNode* xmlNic, struct scclTopoSystem* system, struct scclTopoNode* nic, int64_t busId) {
    for(int child = 0; child < xmlNic->nSubs; child++) {
        struct scclXmlNode* sub = xmlNic->subs[child];
        if(strcmp(sub->name, "net") != 0)
            continue;
        // A <net> element with no "dev" attribute cannot be registered.
        int devAttr;
        SCCLCHECK(xmlGetAttrIndex(sub, "dev", &devAttr));
        if(devAttr == -1)
            continue;
        SCCLCHECK(scclTopoAddNet(sub, system, nic, busId));
    }
    return scclSuccess;
}
/**
 * @brief Add GPU attributes to a topology node.
 *
 * Parses GPU properties from the XML node and fills the topology node:
 * - compute capability (sm)
 * - GCN architecture name (gcn)
 * - HIP device architecture (arch)
 * - rank, device index (dev) and GDR support flag (gdr)
 *
 * @param xmlGpu XML node holding the GPU configuration
 * @param system Target topology system
 * @param gpu GPU topology node to fill
 * @return scclResult_t scclSuccess on success
 *
 * @note Only base GPU attributes are handled here; NVLink/XGMI connections
 *       are added in a later pass.
 */
scclResult_t scclTopoAddGpu(struct scclXmlNode* xmlGpu, struct scclTopoSystem* system, struct scclTopoNode* gpu) {
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
    const char* gcnArch;
    const char* gcnArchName;
    SCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gcnArch));
    convertGcnArchToGcnArchName(gcnArch, &gcnArchName);
    // NOTE(review): strdup allocates; presumably freed when the node is destroyed — confirm.
    gpu->gpu.gcn = strdup(gcnArchName);
    scclHipDeviceArch_t arch;
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));
    memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t));
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
    SCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
    // Do not go any further, nvlinks will be added in a second pass
    return scclSuccess;
}
/**
 * @brief Add a PCI device to the topology system.
 *
 * Parses the PCI device information from the XML node and, depending on its
 * type (GPU / NIC / plain PCI), creates the corresponding topology node.
 * GPU devices additionally carry a rank; multi-port NIC functions are merged
 * into a single PCI device; plain PCI bridges record vendor/device ids and
 * are processed recursively.
 *
 * @param xmlPci XML node holding the PCI device information
 * @param system Target topology system
 * @param parent Parent topology node
 * @return scclResult_t scclSuccess on success
 */
scclResult_t scclTopoAddPci(struct scclXmlNode* xmlPci, struct scclTopoSystem* system, struct scclTopoNode* parent) {
    const char* str;
    int type;
    // Classify the device from its PCI class code.
    SCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
    SCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));
    int64_t busId;
    SCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
    SCCLCHECK(busIdToInt64(str, &busId));
    struct scclTopoNode* node = NULL;
    struct scclXmlNode* xmlGpu = NULL;
    SCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
    if(xmlGpu != NULL) {
        type = GPU;
        int index;
        SCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
        if(index == -1)
            return scclSuccess; // GPU without a rank attribute: skip it.
        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
        SCCLCHECK(scclTopoAddGpu(xmlGpu, system, node));
    }
    struct scclXmlNode* xmlNic = NULL;
    SCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
    if(xmlNic != NULL) {
        type = NIC;
        // Ignore sub device ID and merge multi-port NICs into one PCI device.
        busId &= 0xfffffffffffffff0;
        struct scclTopoNode* nicNode = NULL;
        SCCLCHECK(scclTopoGetNode(system, &nicNode, type, busId));
        if(nicNode == NULL) {
            SCCLCHECK(scclTopoCreateNode(system, &nicNode, type, busId));
            node = nicNode; // Connect it to parent later on
        }
        SCCLCHECK(scclTopoAddNic(xmlNic, system, nicNode, busId));
    } else if(type == PCI) {
        // Plain PCI bridge/switch: pack vendor/device ids into pci.device
        // and recurse into children.
        SCCLCHECK(scclTopoCreateNode(system, &node, type, busId));
        SCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0) << 48; // vendor id -> bits 63..48
        SCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0) << 32; // device id -> bits 47..32
        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0) << 16; // subsystem vendor -> bits 31..16
        SCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
        if(str)
            node->pci.device += strtol(str, NULL, 0); // subsystem device -> bits 15..0
        for(int s = 0; s < xmlPci->nSubs; s++) {
            struct scclXmlNode* xmlSubPci = xmlPci->subs[s];
            SCCLCHECK(scclTopoAddPci(xmlSubPci, system, node));
        }
    }
    // A node was created above: wire it to its parent with a PCI link.
    if(node) {
        int width, speed;
        SCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
        SCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
        // Manage cases where speed was not indicated in /sys
        if(width == 0)
            width = 16;
        SCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
        // width lanes * (speed x 100 Mbps per lane) / 80 -> GB/s, both directions.
        SCCLCHECK(scclTopoConnectNodes(node, parent, LINK_PCI, width * speed / 80.0));
        SCCLCHECK(scclTopoConnectNodes(parent, node, LINK_PCI, width * speed / 80.0));
    }
    return scclSuccess;
}
// Map cpuinfo-style architecture and vendor-id strings to SCCL CPU constants.
struct kvDict kvDictCpuArch[] = {{"x86_64", SCCL_TOPO_CPU_ARCH_X86}, {"arm64", SCCL_TOPO_CPU_ARCH_ARM}, {"ppc64", SCCL_TOPO_CPU_ARCH_POWER}, {NULL, 0}};
struct kvDict kvDictCpuVendor[] = {{"GenuineIntel", SCCL_TOPO_CPU_VENDOR_INTEL},
                                   {"AuthenticAMD", SCCL_TOPO_CPU_VENDOR_AMD},
                                   {"CentaurHauls", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
                                   {" Shanghai ", SCCL_TOPO_CPU_VENDOR_ZHAOXIN},
                                   {NULL, 0}};
/**
 * @brief Add a CPU (NUMA node) to the system topology.
 *
 * Parses the CPU information from the XML node — NUMA id, architecture,
 * vendor, model — creates the corresponding topology node, then adds the
 * PCI and NIC devices attached to this CPU.
 *
 * @param xmlCpu XML node holding the CPU configuration
 * @param system Target topology system
 * @return scclResult_t scclSuccess on success
 */
scclResult_t scclTopoAddCpu(struct scclXmlNode* xmlCpu, struct scclTopoSystem* system) {
    int numaId;
    // Read the NUMA id from the XML node.
    SCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
    struct scclTopoNode* cpu;
    // Create a new CPU node keyed by NUMA id.
    SCCLCHECK(scclTopoCreateNode(system, &cpu, CPU, numaId));
    const char* str;
    // Optional CPU affinity attribute.
    SCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
    if(str != NULL) {
        SCCLCHECK(scclStrToCpuset(str, &cpu->cpu.affinity));
    }
    // CPU architecture.
    SCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
    SCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
    if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86) {
        // CPU vendor string.
        SCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
        SCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
            int familyId, modelId;
            // Family/model ids for Intel CPUs.
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
            // Family 6 with model >= 0x55 is classified SKL, otherwise BDW.
            cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? SCCL_TOPO_CPU_TYPE_SKL : SCCL_TOPO_CPU_INTEL_BDW;
        } else if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
            int familyId, modelId;
            // Family/model ids for Zhaoxin CPUs.
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
            if(familyId == 7 && modelId == 0x5B)
                cpu->cpu.model = SCCL_TOPO_CPU_TYPE_YONGFENG;
        }
        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_AMD) {
            int familyId, modelId;
            // Family/model ids for AMD CPUs.
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
            SCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
            // Treat "Milan" the same as "Rome".
            cpu->cpu.model = ((familyId == 143 && modelId >= 49) || familyId == 175) ? SCCL_TOPO_CPU_TYPE_ROME : SCCL_TOPO_CPU_TYPE_ZEN;
        }
    }
    // Walk the CPU's children.
    for(int s = 0; s < xmlCpu->nSubs; s++) {
        struct scclXmlNode* node = xmlCpu->subs[s];
        // PCI child: add the PCI subtree under this CPU.
        if(strcmp(node->name, "pci") == 0)
            SCCLCHECK(scclTopoAddPci(node, system, cpu));
        // NIC child directly under the CPU (no PCI parent in between).
        if(strcmp(node->name, "nic") == 0) {
            struct scclTopoNode* nic = NULL;
            SCCLCHECK(scclTopoGetNode(system, &nic, NIC, 0));
            if(nic == NULL) {
                SCCLCHECK(scclTopoCreateNode(system, &nic, NIC, 0));
                SCCLCHECK(scclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW));
                SCCLCHECK(scclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW));
            }
            SCCLCHECK(scclTopoAddNic(node, system, nic, 0));
        }
    }
    return scclSuccess;
}
// scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
// char* str = path + offset;
// // Remove trailing "/"
// if(*str == '/')
// str--;
// // Find next /
// while(*str != '/')
// str--;
// str++;
// int64_t numid;
// SCCLCHECK(busIdToInt64(str, &numid));
// // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
// numid -= numid & 0xf;
// *id = numid;
// return scclSuccess;
// }
// Depth-first search along PCI links for the CPU (NUMA) node reachable
// from 'node'. On return *cpu is that node, or NULL if none was found.
static scclResult_t findLocalCpu(struct scclTopoNode* node, struct scclTopoNode** cpu) {
    *cpu = NULL;
    // Base case: we landed on a CPU node.
    if(node->type == CPU) {
        *cpu = node;
        return scclSuccess;
    }
    // Otherwise follow each PCI link until a CPU turns up.
    for(int i = 0; i < node->nlinks; i++) {
        if(node->links[i].type == LINK_PCI)
            SCCLCHECK(findLocalCpu(node->links[i].remNode, cpu));
        if(*cpu != NULL)
            return scclSuccess;
    }
    return scclSuccess;
}
// Estimate the CPU-to-CPU (inter-socket) bandwidth for 'cpu' based on its
// architecture, vendor and model. Falls back to LOC_BW.
static scclResult_t scclTopoGetInterCpuBw(struct scclTopoNode* cpu, float* bw) {
    *bw = LOC_BW;
    // POWER and ARM have fixed per-architecture values.
    if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_POWER) {
        *bw = P9_BW;
        return scclSuccess;
    }
    if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_ARM) {
        *bw = ARM_BW;
        return scclSuccess;
    }
    // x86: pick the interconnect speed by vendor and model.
    if(cpu->cpu.arch == SCCL_TOPO_CPU_ARCH_X86) {
        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL)
            *bw = (cpu->cpu.model == SCCL_TOPO_CPU_TYPE_SKL) ? SKL_QPI_BW : QPI_BW;
        if(cpu->cpu.vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN)
            *bw = (cpu->cpu.model == SCCL_TOPO_CPU_TYPE_YONGFENG) ? YONGFENG_ZPI_BW : ZPI_BW;
    }
    return scclSuccess;
}
// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch as this extra level can break the search and make
// SCCL take wrong topology decisions.
scclResult_t scclTopoFlattenBcmSwitches(struct scclTopoSystem* system) {
    for(int s = 0; s < system->nodes[PCI].count; s++) {
        struct scclTopoNode* pciSwitch = system->nodes[PCI].nodes + s;
        uint64_t device = pciSwitch->pci.device;
        // Only flatten PEX Gen 4 switches in base mode
        if((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
            // Find sub switches with the same device ID.
            int64_t* subSwIds;
            SCCLCHECK(scclCalloc(&subSwIds, pciSwitch->nlinks));
            int subs = 0;
            for(int l = 0; l < pciSwitch->nlinks; l++) {
                struct scclTopoNode* sub = pciSwitch->links[l].remNode;
                // Only fuse sub switches with the same device ID.
                if(sub->type != PCI || sub->pci.device != device)
                    continue;
                // Save sub switch for later
                subSwIds[subs++] = sub->id;
                // Remove link to that sub switch
                memmove(pciSwitch->links + l, pciSwitch->links + l + 1, (pciSwitch->nlinks - l - 1) * (sizeof(struct scclTopoLink)));
                pciSwitch->nlinks--;
                // Don't increase l for the next iteration as we just shifted all links by one.
                l--;
            }
            for(int s = 0; s < subs; s++) {
                // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
                int index;
                SCCLCHECK(scclTopoIdToIndex(system, PCI, subSwIds[s], &index));
                struct scclTopoNode* sub = system->nodes[PCI].nodes + index;
                // Connect all sub PCI devices to the parent switch
                for(int l = 0; l < sub->nlinks; l++) {
                    struct scclTopoNode* remNode = sub->links[l].remNode;
                    if(remNode == pciSwitch)
                        continue;
                    // Add link from parent PCI switch -> PCI device
                    // NOTE(review): assumes pciSwitch->links has spare capacity
                    // for the adopted links — TODO confirm the array bound.
                    memcpy(pciSwitch->links + pciSwitch->nlinks, sub->links + l, sizeof(struct scclTopoLink));
                    pciSwitch->nlinks++;
                    // Update link from PCI device -> parent PCI switch
                    for(int rl = 0; rl < remNode->nlinks; rl++) {
                        if(remNode->links[rl].remNode == sub) {
                            remNode->links[rl].remNode = pciSwitch;
                            break;
                        }
                    }
                }
                SCCLCHECK(scclTopoRemoveNode(system, PCI, index));
            }
            // Set subdevice to 0x0000 to make sure we don't merge this switch again.
            pciSwitch->pci.device = 0x1000c01010000000;
            free(subSwIds);
            // Restart, as system->nodes[PCI].nodes has changed.
            // NOTE(review): with the loop's s++ the rescan resumes at index 1,
            // so index 0 is not revisited — confirm this is intentional.
            s = 0;
        }
    }
    return scclSuccess;
}
// Build a full mesh of SYS links between every pair of distinct CPU nodes,
// using the per-socket inter-CPU bandwidth estimate.
scclResult_t scclTopoConnectCpus(struct scclTopoSystem* system) {
    const int ncpus = system->nodes[CPU].count;
    for(int src = 0; src < ncpus; src++) {
        for(int dst = 0; dst < ncpus; dst++) {
            if(src == dst)
                continue; // No self-link.
            float bw;
            SCCLCHECK(scclTopoGetInterCpuBw(system->nodes[CPU].nodes + src, &bw));
            SCCLCHECK(scclTopoConnectNodes(system->nodes[CPU].nodes + src, system->nodes[CPU].nodes + dst, LINK_SYS, bw));
        }
    }
    return scclSuccess;
}
// Reorder a node's link list so that the link pointing back to 'upNode'
// (the PCI parent) comes last, then recurse down the PCI tree. Combined
// with the existing ordering this yields NVLink / PCI-down / PCI-up / SYS.
static scclResult_t scclTopoSort(struct scclTopoNode* node, struct scclTopoNode* upNode) {
    // If there is a parent, move the link to it to the end of the link list.
    if(upNode) {
        int l = 0;
        // Find the link pointing back to upNode.
        while(node->links[l].remNode != upNode)
            l++;
        struct scclTopoLink upLink;
        // Save that link...
        memcpy(&upLink, node->links + l, sizeof(struct scclTopoLink));
        // ...shift every following link one slot to the left...
        while(node->links[l + 1].remNode) {
            memcpy(node->links + l, node->links + l + 1, sizeof(struct scclTopoLink));
            l++;
        }
        // ...and put the saved up-link at the end of the list.
        memcpy(node->links + l, &upLink, sizeof(struct scclTopoLink));
    }
    // Recursively sort the PCI subtree below this node.
    for(int l = 0; l < node->nlinks; l++) {
        struct scclTopoLink* link = node->links + l;
        // Only descend PCI links, and never back up toward the parent.
        if(link->type == LINK_PCI && link->remNode != upNode)
            SCCLCHECK(scclTopoSort(link->remNode, node));
    }
    return scclSuccess;
}
// We want the graph to be organized to ease/accelerate traversal :
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
// 4. SYS (already the case)
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system) {
    // Each CPU node roots a PCI tree; sort every one of them.
    const int ncpus = system->nodes[CPU].count;
    for(int c = 0; c < ncpus; c++) {
        SCCLCHECK(scclTopoSort(&system->nodes[CPU].nodes[c], NULL));
    }
    return scclSuccess;
}
// Per-link XGMI bandwidth for the given GCN architecture string.
float scclTopoXGMISpeed(const char* gcn) {
    if(IsArchMatch(gcn, "gfx90a"))
        return MI200_XGMI_WIDTH; // gfx90a class
    if(IsArchMatch(gcn, "gfx94"))
        return GFX94X_XGMI_WIDTH; // gfx94x class
    return VEGA_XGMI_WIDTH; // Default: Vega-class links
}
/**
 * @brief Add XGMI (NVL) topology connections.
 *
 * Processes "xgmi" entries in the XML tree and creates NVL links between a
 * GPU and its peer (another GPU, the local CPU, or an NVS switch node).
 *
 * @param node XML node holding the XGMI connection description
 * @param system Topology system storing and managing topology nodes
 * @param parentBusId PCIe bus id string of the parent device
 *
 * @return scclResult_t
 *  - scclSuccess: success
 *  - scclInternalError: the GPU referenced by parentBusId cannot be found
 *
 * @note 1. Supports GPU-GPU, GPU-CPU and GPU-NVS connections.
 *       2. Link bandwidth is the per-link XGMI speed for the GPU's GCN
 *          architecture times the link count.
 *       3. When recursing into children, the closest "busid" seen is
 *          propagated as the parent bus id.
 */
scclResult_t scclTopoAddXGMI(struct scclXmlNode* node, struct scclTopoSystem* system, const char* parentBusId) {
    if(strcmp(node->name, "xgmi") == 0) {
        struct scclTopoNode* gpu = NULL;
        int64_t pBusId;
        SCCLCHECK(busIdToInt64(parentBusId, &pBusId));
        SCCLCHECK(scclTopoGetNode(system, &gpu, GPU, pBusId));
        if(gpu == NULL) {
            WARN("Add XGMI error : could not find GPU %lx\n", pBusId);
            return scclInternalError;
        }
        int count;
        SCCLCHECK(xmlGetAttrInt(node, "count", &count));
        const char* targetClass;
        SCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
        int targetType;
        SCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
        struct scclTopoNode* remote = NULL;
        if(targetType == GPU) {
            // NVL P2P connection to another GPU
            const char* target;
            SCCLCHECK(xmlGetAttrStr(node, "target", &target));
            int64_t busId;
            SCCLCHECK(busIdToInt64(target, &busId));
            SCCLCHECK(scclTopoGetNode(system, &remote, GPU, busId));
        } else if(targetType == CPU) {
            // NVL connection to the local CPU
            SCCLCHECK(findLocalCpu(gpu, &remote));
        } else {
            // Anything else connects through the single NVS switch node,
            // created on first use.
            if(system->nodes[NVS].count == 0) {
                SCCLCHECK(scclTopoCreateNode(system, &remote, NVS, 0));
            } else {
                remote = system->nodes[NVS].nodes;
            }
        }
        if(remote) {
            float nvlSpeed = scclTopoXGMISpeed(gpu->gpu.gcn);
            SCCLCHECK(scclTopoConnectNodes(gpu, remote, LINK_NVL, count * nvlSpeed));
            // Back link added only for non-GPU peers — presumably the reverse
            // GPU->GPU link comes from the peer GPU's own xgmi entry; confirm.
            if(remote->type != GPU) {
                SCCLCHECK(scclTopoConnectNodes(remote, gpu, LINK_NVL, count * nvlSpeed));
            }
        }
    } else {
        // Not an xgmi element: recurse, propagating the closest busid seen.
        const char* busId;
        SCCLCHECK(xmlGetAttr(node, "busid", &busId));
        for(int s = 0; s < node->nSubs; s++) {
            SCCLCHECK(scclTopoAddXGMI(node->subs[s], system, busId ? busId : parentBusId));
        }
    }
    return scclSuccess;
}
/**
 * @brief Compute the local-NET bit mask for one GPU.
 *
 * Scans all NET nodes and keeps those whose path to GPU index 'g' has the
 * highest bandwidth and, among ties, the best (lowest) path type. The ids of
 * the selected NETs are returned as a 64-bit mask.
 *
 * @param system Topology system
 * @param g GPU index
 * @param localNetMask [out] Bit mask of local NET ids (bit i = NET id i)
 * @param type [out] Optional; receives the best path type found
 * @return scclResult_t scclSuccess on success; scclInternalError if a
 *         selected NET id is >= 64 and cannot be represented in the mask
 */
static scclResult_t getLocalNetMask(struct scclTopoSystem* system, int g, uint64_t* localNetMask, int* type) {
    int minType = PATH_DIS;
    float maxBw = 0;
    int count = 0;
    int* nets;
    SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
    // Pass 1: track the best (bw, path-type) pair and collect all tied NETs.
    for(int n = 0; n < system->nodes[NET].count; n++) {
        struct scclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU] + g;
        if(path->bw > maxBw || (path->bw == maxBw && path->type < minType)) {
            // Strictly better: restart the tie list.
            maxBw = path->bw;
            minType = path->type;
            if(type)
                *type = minType;
            count = 0;
        }
        if(path->bw == maxBw && path->type == minType)
            nets[count++] = system->nodes[NET].nodes[n].id;
    }
    // Pass 2: fold the selected NET ids into the bit mask.
    *localNetMask = 0ULL;
    for(int n = 0; n < count; n++) {
        if(nets[n] >= 64) {
            // BUGFIX: the original leaked 'nets' on this error path.
            free(nets);
            return scclInternalError;
        }
        *localNetMask |= 1ULL << nets[n];
    }
    free(nets);
    return scclSuccess;
}
// Recursively print the topology tree under 'node' as indented INFO lines.
// 'line' is a scratch buffer written from column 'offset'; 'prevNode' is the
// node we arrived from, whose PCI back-link is not printed again.
static scclResult_t scclTopoPrintRec(struct scclTopoNode* node, struct scclTopoNode* prevNode, char* line, int offset) {
    // Print this node's header, with extra detail per node type.
    if(node->type == GPU) {
        sprintf(line + offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
    } else if(node->type == CPU) {
        sprintf(line + offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
    } else if(node->type == PCI) {
        sprintf(line + offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
    } else {
        sprintf(line + offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
    }
    INFO(SCCL_LOG_TOPO, "%s", line);
    // Blank out the prefix so children print aligned under this node.
    for(int i = 0; i < offset; i++)
        line[i] = ' ';
    for(int l = 0; l < node->nlinks; l++) {
        struct scclTopoLink* link = node->links + l;
        if(link->type == LINK_LOC)
            continue; // Skip local (self) links.
        // Don't re-print the PCI link back to the node we came from.
        if(link->type != LINK_PCI || link->remNode != prevNode) {
            sprintf(line + offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
            int nextOffset = strlen(line);
            if(link->type == LINK_PCI) {
                // Descend into the PCI subtree.
                SCCLCHECK(scclTopoPrintRec(link->remNode, node, line, nextOffset));
            } else {
                if(link->remNode->type == NET) {
                    sprintf(line + nextOffset,
                            "%s/%lX (%lx/%d/%f)",
                            topoNodeTypeStr[link->remNode->type],
                            link->remNode->id,
                            link->remNode->net.asic,
                            link->remNode->net.port,
                            link->remNode->net.bw);
                } else {
                    sprintf(line + nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
                }
                INFO(SCCL_LOG_TOPO, "%s", line);
            }
        }
    }
    return scclSuccess;
}
} // namespace topo_basic
////////////////////////////////////////////////////////////////////////////////////////////////
// Scan /sys/bus/pci/drivers for any driver whose name starts with "hsw".
// Returns false when the directory cannot be opened.
bool isHswDriverExist() {
    const ::std::string basePath = "/sys/bus/pci/drivers";
    DIR* dir = opendir(basePath.c_str());
    if(dir == nullptr)
        return false;
    bool found = false;
    for(struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const ::std::string name = entry->d_name;
        if(name == "." || name == "..")
            continue;
        if(name.compare(0, 3, "hsw") == 0) {
            found = true;
            break;
        }
    }
    closedir(dir);
    return found;
}
// Count InfiniBand devices named "mlx5*" under /sys/class/infiniband.
// Returns 0 when the directory cannot be opened.
int getIBNum() {
    const ::std::string basePath = "/sys/class/infiniband";
    DIR* dir = opendir(basePath.c_str());
    if(dir == nullptr)
        return 0;
    int count = 0;
    for(struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* name = entry->d_name;
        if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
            continue;
        if(strncmp(name, "mlx5", 4) == 0)
            count++;
    }
    closedir(dir);
    return count;
}
// Select the NET device to use for 'rank' on channel 'channelId' and return
// its id in *id. GPUs that share the same local-NET mask round-robin over
// the NETs of that mask, first across those GPUs and then across channels.
scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id) {
    uint64_t* localNetMasks;
    int ngpus = system->nodes[GPU].count;
    SCCLCHECK(scclCalloc(&localNetMasks, ngpus));
    // Fill localNetMasks for all GPUs.
    for(int g = 0; g < ngpus; g++) {
        SCCLCHECK(topo_basic::getLocalNetMask(system, g, localNetMasks + g, NULL));
    }
    // Find GPUs which have the same mask as rank, i.e. share the same local Nets.
    int gpu;
    SCCLCHECK(scclTopoRankToIndex(system, rank, &gpu));
    int netLocalGpus = 0, netLocalGpu = 0;
    for(int g = 0; g < ngpus; g++) {
        if(localNetMasks[g] == localNetMasks[gpu]) {
            if(g == gpu)
                netLocalGpu = netLocalGpus; // Our position inside the sharing group.
            netLocalGpus++;
        }
    }
    uint64_t localNetMask = localNetMasks[gpu];
    free(localNetMasks);
    if(localNetMask == 0)
        return scclInternalError; // No local NET at all.
    // Round robin on GPUs and channels
    int gIndex = 0, cId = 0, n = 0;
    while(1) {
        // Walk set bits of the mask; each hit advances the GPU slot, and each
        // full GPU cycle advances the channel slot, until we match ours.
        if(1ULL << n & localNetMask) {
            if(gIndex == netLocalGpu && cId == channelId) {
                *id = n;
                return scclSuccess;
            }
            gIndex++;
            if(gIndex == netLocalGpus) {
                gIndex = 0;
                cId++;
            }
        }
        n = (n + 1) % 64; // NET ids fit in [0, 64) by construction of the mask.
    }
}
// Inverse of scclTopoGetLocalNet: find the GPU index whose round-robin slot
// corresponds to NET id 'net'. Sets *gpuIndex to -1 when no GPU has 'net'
// in its local-NET mask.
scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex) {
    int ngpus = system->nodes[GPU].count;
    int* gpus;
    SCCLCHECK(scclCalloc(&gpus, ngpus));
    // Find localNetMask which includes net with the most local GPUs.
    int netLocalGpus = 0, minType = PATH_DIS;
    uint64_t localNetMask = 0ULL;
    for(int g = 0; g < ngpus; g++) {
        int type = PATH_DIS;
        uint64_t mask;
        SCCLCHECK(topo_basic::getLocalNetMask(system, g, &mask, &type));
        if((1ULL << net) & mask) {
            if(type < minType) {
                // Strictly better path type: restart the candidate list.
                localNetMask = mask;
                netLocalGpus = 0;
                minType = type;
            }
            if(type == minType) {
                // All GPUs tied at minType must share the same mask.
                if(localNetMask && mask != localNetMask) {
                    WARN("Gpus %d and %d both have a type of %d with net %d yet have different netMasks of %lx and %lx\n",
                         g,
                         gpus[netLocalGpus - 1],
                         minType,
                         net,
                         mask,
                         localNetMask);
                    free(gpus);
                    return scclInternalError;
                }
                gpus[netLocalGpus] = g;
                netLocalGpus++;
            }
        }
    }
    if(localNetMask == 0ULL) {
        *gpuIndex = -1; // No GPU is local to this NET.
        free(gpus);
        return scclSuccess;
    }
    // Round robin on GPUs and channels
    int gIndex = 0, cId = 0, n = 0;
    while(1) {
        // Walk set bits of the mask until we land on bit 'net'; gIndex then
        // names the GPU holding that slot.
        // NOTE(review): cId is incremented but never compared here — looks
        // vestigial from scclTopoGetLocalNet; confirm.
        if(1ULL << n & localNetMask) {
            if(n == net) {
                *gpuIndex = gpus[gIndex];
                free(gpus);
                return scclSuccess;
            }
            gIndex++;
            if(gIndex == netLocalGpus) {
                gIndex = 0;
                cId++;
            }
        }
        n = (n + 1) % 64;
    }
}
// Report architecture/vendor/model of the first CPU (NUMA) node; the values
// come from the same node for all three outputs.
scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model) {
    const struct scclTopoNode* cpu0 = &system->nodes[CPU].nodes[0];
    *arch = cpu0->cpu.arch;
    *vendor = cpu0->cpu.vendor;
    *model = cpu0->cpu.model;
    return scclSuccess;
}
// Compute the CPU affinity mask to use for 'rank': the affinity of the CPU
// (NUMA node) closest to the rank's GPU, intersected with the process's
// current affinity unless IGNORE_CPU_AFFINITY is set.
scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity) {
    struct scclTopoNode *cpu = NULL, *gpu = NULL;
    // Locate the GPU node for this rank and its closest CPU (fewest hops).
    for(int g = 0; g < system->nodes[GPU].count; g++) {
        if(system->nodes[GPU].nodes[g].gpu.rank == rank) {
            gpu = system->nodes[GPU].nodes + g;
            // Find closer CPU
            int cpuIndex = -1, minHops = 0;
            for(int c = 0; c < system->nodes[CPU].count; c++) {
                int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
                if(cpuIndex == -1 || nHops < minHops) {
                    cpuIndex = c;
                    minHops = nHops;
                }
            }
            cpu = system->nodes[CPU].nodes + cpuIndex;
        }
    }
    if(cpu == NULL) {
        WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
        return scclInternalError;
    }
    // Query the CPU affinity set we were provided
    cpu_set_t mask;
    SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
    // Get the affinity of the CPU close to our GPU.
    cpu_set_t cpuMask = cpu->cpu.affinity;
    cpu_set_t finalMask;
    if(topo_basic::scclParamIgnoreCpuAffinity())
        // Ignore the CPU affinity set and use the GPU one instead
        finalMask = cpuMask;
    else
        // Use a subset of the GPU affinity set
        CPU_AND(&finalMask, &mask, &cpuMask);
    memcpy(affinity, &finalMask, sizeof(cpu_set_t));
    // If there is a non empty set, use it to set affinity
    if(CPU_COUNT(&finalMask)) {
        // NOTE(review): buffer assumes 2 chars per cpu_set_t byte; confirm
        // scclCpusetToStr NUL-terminates within this size.
        char affinityStr[sizeof(cpu_set_t) * 2];
        SCCLCHECK(scclCpusetToStr(&finalMask, affinityStr));
        INFO(SCCL_LOG_TOPO, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
    }
    return scclSuccess;
}
/** Return the number of GPU nodes in the topology. */
scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count) {
    const int gpuCount = system->nodes[GPU].count;
    *count = gpuCount;
    return scclSuccess;
}
/** Return the number of NET nodes in the topology. */
scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count) {
    const int netCount = system->nodes[NET].count;
    *count = netCount;
    return scclSuccess;
}
/** Return the number of NVS nodes in the topology. */
scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count) {
    const int nvsCount = system->nodes[NVS].count;
    *count = nvsCount;
    return scclSuccess;
}
/** Map a global rank to its GPU index on this node; error if absent. */
scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank) {
    const int ngpus = system->nodes[GPU].count;
    for(int index = 0; index < ngpus; index++) {
        if(system->nodes[GPU].nodes[index].gpu.rank != rank)
            continue;
        *localRank = index;
        return scclSuccess;
    }
    WARN("Could not find local GPU with rank %d", rank);
    return scclInternalError;
}
/** Log the topology tree, recursing from each CPU (NUMA) root node. */
scclResult_t scclTopoPrint(struct scclTopoSystem* s) {
    INFO(SCCL_LOG_TOPO, "=== System : maxBw %2.1f totalBw %2.1f ===", s->maxBw, s->totalBw);
    char line[1024];
    const int ncpus = s->nodes[CPU].count;
    for(int c = 0; c < ncpus; c++) {
        SCCLCHECK(topo_basic::scclTopoPrintRec(s->nodes[CPU].nodes + c, NULL, line, 0));
    }
    INFO(SCCL_LOG_TOPO, "==========================================");
    return scclSuccess;
}
/**
 * Look up a node of the given type by id.
 *
 * On success *node points at the matching node. If no node matches, *node is
 * set to NULL (previously it was left untouched, so callers could read an
 * uninitialized pointer — this mirrors xmlFindTag's NULL-on-miss convention).
 *
 * @param system Topology to search.
 * @param node   Output: matching node, or NULL when not found.
 * @param type   Node type (topoNodeType_t).
 * @param id     Node id to match.
 * @return scclSuccess always.
 */
scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
    *node = NULL; // let callers detect "not found" reliably
    for(int i = 0; i < system->nodes[type].count; i++) {
        if(system->nodes[type].nodes[i].id == id) {
            *node = system->nodes[type].nodes + i;
            return scclSuccess;
        }
    }
    return scclSuccess;
}
/**
 * Append a new node of the given type/id to the topology and return it.
 * Fails when the per-type node array is full. GPU nodes get a loopback link
 * to themselves; GPU/CPU/NET type-specific fields are reset to defaults.
 */
scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id) {
    struct scclTopoNodeSet* set = system->nodes + type;
    if(set->count == SCCL_TOPO_MAX_NODES) {
        WARN("Error : tried to create too many nodes of type %d", type);
        return scclInternalError;
    }
    struct scclTopoNode* n = set->nodes + set->count;
    set->count++;
    n->type = type;
    n->id = id;
    switch(type) {
        case GPU:
            // Create link to itself (used in some corner cases)
            n->nlinks = 1;
            n->links[0].type = LINK_LOC;
            n->links[0].remNode = n;
            n->links[0].bw = LOC_BW;
            n->gpu.dev = SCCL_TOPO_UNDEF;
            n->gpu.rank = SCCL_TOPO_UNDEF;
            n->gpu.cudaCompCap = SCCL_TOPO_UNDEF;
            break;
        case CPU:
            n->cpu.arch = SCCL_TOPO_UNDEF;
            n->cpu.vendor = SCCL_TOPO_UNDEF;
            n->cpu.model = SCCL_TOPO_UNDEF;
            break;
        case NET:
            n->net.asic = 0ULL;
            n->net.port = SCCL_TOPO_UNDEF;
            n->net.bw = 0.0;
            n->net.latency = 0.0;
            break;
        default:
            break;
    }
    *node = n;
    return scclSuccess;
}
/**
 * Remove a node of the given type from the topology.
 *
 * @param system Topology to modify.
 * @param type   Type of the node to remove.
 * @param index  Index of the node to remove within its type array.
 * @return scclResult_t (scclSuccess on success).
 *
 * The function:
 * 1. Frees all precomputed path arrays of the removed node.
 * 2. Deletes every link from other nodes that pointed at the removed node.
 * 3. Re-targets links to later same-type nodes, which shift down by one slot.
 * 4. Compacts the node array and decrements the type's node count.
 */
scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int index) {
    struct scclTopoNode* delNode = system->nodes[type].nodes + index;
    for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
        free(delNode->paths[t]);
        for(int n = 0; n < system->nodes[t].count; n++) {
            struct scclTopoNode* node = system->nodes[t].nodes + n;
            if(node == delNode)
                continue;
            for(int l = 0; l < node->nlinks; l++) {
                // Drop every link that pointed at the removed node. The inner
                // while re-checks slot l because the memmove pulls a new link
                // into it, which may itself point at delNode.
                while(l < node->nlinks && node->links[l].remNode == delNode) {
                    memmove(node->links + l, node->links + l + 1, (node->nlinks - l - 1) * sizeof(struct scclTopoLink));
                    node->nlinks--;
                }
                // Same-type nodes stored after delNode shift down one struct
                // slot below, so links to them must be rebased by -1 node.
                if(l < node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
                    node->links[l].remNode--;
                }
            }
        }
    }
    memmove(delNode, delNode + 1, (system->nodes[type].count - index - 1) * sizeof(struct scclTopoNode));
    system->nodes[type].count--;
    return scclSuccess;
}
/**
 * Add (or widen) a link of `type` from `node` to `remNode` with bandwidth `bw`.
 *
 * If a link of the same type to the same remote node already exists its
 * bandwidth is accumulated (NVLink lanes aggregate into one wider link);
 * otherwise a new link is appended. The link array is then kept sorted by
 * descending bandwidth via a one-element insertion pass.
 *
 * NOTE(review): the sentinel scan (`link->remNode != NULL`) assumes unused
 * slots are zeroed and that nlinks never reaches SCCL_TOPO_MAX_LINKS —
 * neither is checked here; confirm callers guarantee both.
 */
scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw) {
    // Aggregate links into higher bw for NVLink
    struct scclTopoLink* link;
    for(link = node->links; link->remNode; link++) {
        if(link->remNode == remNode && link->type == type)
            break;
    }
    // Scan stopped on an empty slot: this is a brand-new link.
    if(link->remNode == NULL)
        node->nlinks++;
    link->type = type;
    link->remNode = remNode;
    link->bw += bw;
    // Sort links in BW descending order: bubble the (possibly widened) link
    // toward the front until its predecessor has equal or higher bandwidth.
    struct scclTopoLink linkSave;
    memcpy(&linkSave, link, sizeof(struct scclTopoLink));
    while(link != node->links) {
        if((link - 1)->bw >= linkSave.bw)
            break;
        memcpy(link, link - 1, sizeof(struct scclTopoLink));
        link--;
    }
    memcpy(link, &linkSave, sizeof(struct scclTopoLink));
    return scclSuccess;
}
/**
 * Build a scclTopoSystem from a parsed topology XML tree.
 *
 * Adds every <cpu> child of the <system> tag, then XGMI links, flattens BCM
 * switches, connects CPUs and sorts the system.
 *
 * @param xml        Parsed topology XML.
 * @param topoSystem Output: newly allocated system (owned by the caller).
 * @return scclSuccess, or scclInternalError when no <system> tag exists.
 */
scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem) {
    SCCLCHECK(scclCalloc(topoSystem, 1));
    struct scclXmlNode* topNode;
    SCCLCHECK(xmlFindTag(xml, "system", &topNode));
    // xmlFindTag reports "not found" via NULL; dereferencing it would crash.
    if(topNode == NULL) {
        WARN("Could not find <system> tag in topology XML");
        return scclInternalError;
    }
    // Was a stray debug printf to stdout; route through the logging macro.
    INFO(SCCL_LOG_TOPO, "topNode->nSubs=%d", topNode->nSubs);
    for(int s = 0; s < topNode->nSubs; s++) {
        struct scclXmlNode* node = topNode->subs[s];
        if(strcmp(node->name, "cpu") == 0)
            SCCLCHECK(topo_basic::scclTopoAddCpu(node, *topoSystem));
    }
    SCCLCHECK(topo_basic::scclTopoAddXGMI(topNode, *topoSystem, NULL));
    SCCLCHECK(topo_basic::scclTopoFlattenBcmSwitches(*topoSystem));
    SCCLCHECK(topo_basic::scclTopoConnectCpus(*topoSystem));
    SCCLCHECK(topo_basic::scclTopoSortSystem(*topoSystem));
    return scclSuccess;
}
/**
 * Compute the minimum and maximum compute capability across all GPU nodes.
 *
 * @param system Topology to query.
 * @param ccMin  Output (optional, may be NULL): smallest cudaCompCap.
 * @param ccMax  Output (optional, may be NULL): largest cudaCompCap.
 * @return scclSuccess, or scclInternalError when there are no GPU nodes.
 */
scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax) {
    const int ngpus = system->nodes[GPU].count;
    if(ngpus == 0)
        return scclInternalError;
    int lo = system->nodes[GPU].nodes[0].gpu.cudaCompCap;
    int hi = lo;
    for(int g = 1; g < ngpus; g++) {
        const int cc = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
        if(cc < lo)
            lo = cc;
        if(cc > hi)
            hi = cc;
    }
    if(ccMin)
        *ccMin = lo;
    if(ccMax)
        *ccMax = hi;
    return scclSuccess;
}
/** Map a node id of the given type to its array index; *index = -1 on miss. */
scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index) {
    *index = -1;
    const int count = system->nodes[type].count;
    for(int i = 0; i < count; i++) {
        if(system->nodes[type].nodes[i].id != id)
            continue;
        *index = i;
        return scclSuccess;
    }
    return scclInternalError;
}
/** Map a rank to its GPU node index; *index = -1 on miss. */
scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index) {
    *index = -1;
    const int ngpus = system->nodes[GPU].count;
    for(int i = 0; i < ngpus; i++) {
        if(system->nodes[GPU].nodes[i].gpu.rank != rank)
            continue;
        *index = i;
        return scclSuccess;
    }
    return scclInternalError;
}
/** Map a device number to the rank of the GPU owning it; *rank = -1 on miss. */
scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank) {
    *rank = -1;
    const int ngpus = system->nodes[GPU].count;
    for(int i = 0; i < ngpus; i++) {
        const struct scclTopoNode* gpuNode = system->nodes[GPU].nodes + i;
        if(gpuNode->gpu.dev != dev)
            continue;
        *rank = gpuNode->gpu.rank;
        return scclSuccess;
    }
    return scclInternalError;
}
/**
* @brief 获取系统拓扑结构
*
* 该函数用于获取系统的拓扑结构信息,包括GPU和NIC设备。
* 首先尝试从环境变量SCCL_TOPO_FILE指定的XML文件加载拓扑,
* 若未指定则尝试加载默认拓扑文件(根据IB设备数量选择不同文件)。
* 自动检测本地GPU和NIC设备信息并填充到拓扑结构中。
*
* @param comm 通信上下文指针
* @param system 输出参数,返回创建的拓扑系统指针
* @return scclResult_t 返回操作结果,scclSuccess表示成功
*/
// scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system) {
// struct scclXml* xml;
// SCCLCHECK(scclCalloc(&xml, 1));
// char* xmlTopoFile = getenv("SCCL_TOPO_FILE");
// if(xmlTopoFile) {
// INFO(SCCL_LOG_TOPO, "SCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
// SCCLCHECK(scclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
// } else {
// bool useDefaultTopo = true;
// bool HswExist = topo_basic::isHswDriverExist();
// if(HswExist == true) {
// char* rocmPath = getenv("ROCM_PATH");
// if(rocmPath != NULL) {
// ::std::string xmlPath;
// int IBNum = topo_basic::getIBNum();
// if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
// xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-BW-topo-input.xml";
// if(access(xmlPath.c_str(), F_OK) == 0) {
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// useDefaultTopo = false;
// }
// } else if(IBNum == 4 || IBNum == 5 || IBNum == 6) {
// xmlPath = ::std::string(rocmPath) + "/rccl/lib/built-in-508-topo-input.xml";
// if(access(xmlPath.c_str(), F_OK) == 0) {
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// useDefaultTopo = false;
// }
// }
// }
// }
// if(useDefaultTopo) {
// INFO(SCCL_LOG_TOPO, "No default topo for now, please provide your own topo xml file");
// }
// }
// if(xml->maxIndex == 0) {
// // Create top tag
// struct scclXmlNode* top;
// SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
// SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
// }
// // Auto-detect GPUs if needed
// for(int r = 0; r < comm->nRanks; r++) {
// if(comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
// char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
// SCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
// struct scclXmlNode* node;
// SCCLCHECK(scclTopoFillGpu(xml, busId, &node));
// if(node == NULL)
// continue;
// SCCLCHECK(xmlSetAttrInt(node, "keep", 1));
// SCCLCHECK(xmlSetAttrInt(node, "rank", r));
// SCCLCHECK(topo_basic::xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
// }
// }
// // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// // so we start with collnet so that it has precedence.
// int netDevCount = 0;
// if(netDevCount == 0) {
// SCCLCHECK(comm->scclNet->devices(&netDevCount));
// }
// for(int n = 0; n < netDevCount; n++) {
// sccl::hardware::net::scclNetProperties_t props;
// SCCLCHECK(comm->scclNet->getProperties(n, &props));
// struct scclXmlNode* netNode;
// SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
// SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
// SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "speed", props.speed));
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "port", props.port));
// SCCLCHECK(topo_basic::xmlInitAttrFloat(netNode, "latency", props.latency));
// SCCLCHECK(topo_basic::xmlInitAttrUint64(netNode, "guid", props.guid));
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "maxconn", props.maxComms));
// bool gdrSupport =
// (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF));
// INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
// SCCLCHECK(topo_basic::xmlInitAttrInt(netNode, "gdr", gdrSupport));
// }
// // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
// SCCLCHECK(scclTopoTrimXml(xml));
// xmlTopoFile = getenv("SCCL_TOPO_DUMP_FILE");
// if(xmlTopoFile && comm->rank == topo_basic::scclParamTopoDumpFileRank()) {
// INFO(SCCL_LOG_TOPO, "SCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
// SCCLCHECK(scclTopoDumpXmlToFile(xmlTopoFile, xml));
// }
// SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
// free(xml);
// return scclSuccess;
// }
/**
 * Build the topology system for this process (comm-less entry point).
 *
 * Tries to load a built-in topology XML when the Hsw driver is present and
 * the IB device count matches a known layout; otherwise starts from an empty
 * <system> tag. NICs are then auto-detected through the IB net plugin and
 * merged into the XML before the final system is built from it.
 *
 * @param system Output: newly allocated topology (owned by the caller).
 * @return scclSuccess on success.
 */
scclResult_t scclTopoGetSystem(struct scclTopoSystem** system) {
    using namespace sccl;
    struct scclXml* xml;
    SCCLCHECK(scclCalloc(&xml, 1));
    // NOTE(review): SCCLCHECK early-returns below leak `xml`; this matches the
    // pre-existing pattern in this file but would benefit from goto-cleanup.
    if(isHswDriverExist()) {
        int IBNum = getIBNum();
        if(IBNum == 8 || IBNum == 9 || IBNum == 10) {
            // Honor ROCM_PATH when set; fall back to the default install prefix.
            const char* rocmPath = getenv("ROCM_PATH");
            ::std::string xmlPath = ::std::string(rocmPath ? rocmPath : "/opt/dtk") + "/rccl/lib/built-in-BW-topo-input.xml";
            // Only load the file if it exists; otherwise fall through to
            // auto-detection instead of failing hard on a missing file.
            if(access(xmlPath.c_str(), F_OK) == 0) {
                SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
            } else {
                INFO(SCCL_LOG_TOPO, "Built-in topo file %s not found, auto-detecting", xmlPath.c_str());
            }
        }
    }
    if(xml->maxIndex == 0) {
        // No file loaded: create the top-level <system> tag.
        struct scclXmlNode* top;
        SCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
        SCCLCHECK(xmlSetAttrInt(top, "version", SCCL_TOPO_XML_VERSION));
    }
    // GPU auto-detection (previously driven by comm->peerInfo in the retired
    // comm-based variant above) is not wired up in this entry point yet.
    // Auto-detect NICs. net/collnet share the same xml/graph nodes.
    int netDevCount = 0;
    auto scclNet = sccl::hardware::net::initNet(sccl::hardware::net::NET_IB);
    SCCLCHECK(scclNet->devices(&netDevCount)); // count starts at 0; the old guard was dead code
    for(int n = 0; n < netDevCount; n++) {
        sccl::hardware::net::scclNetProperties_t props;
        SCCLCHECK(scclNet->getProperties(n, &props));
        struct scclXmlNode* netNode;
        SCCLCHECK(scclTopoFillNet(xml, props.pciPath, props.name, &netNode));
        SCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
        SCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
        SCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
        SCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
        SCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
        SCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
        SCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
        // NOTE(review): the retired comm-based variant treated CUDA and DMABUF
        // support as alternatives (||); here both are required — confirm intended.
        bool gdrSupport = (props.ptrSupport & sccl::hardware::net::SCCL_PTR_CUDA) && (props.ptrSupport & sccl::hardware::net::SCCL_PTR_DMABUF);
        INFO(SCCL_LOG_TOPO, "NET/%s : GPU Direct RDMA %s for HCA %d '%s'", scclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
        SCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
    }
    // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
    SCCLCHECK(scclTopoTrimXml(xml));
    SCCLCHECK(scclTopoGetSystemFromXml(xml, system));
    free(xml);
    return scclSuccess;
}
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef SCCL_TOPO_H_
#define SCCL_TOPO_H_
#include <string.h>
#include "base.h"
#include "archinfo.h"
#include "xml.h"
#include "net.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
#define SCCL_TOPO_NODE_TYPES 6
static constexpr int SCCL_TOPO_MAX_NODES = 256;
#define SCCL_TOPO_MAX_LINKS 32
#define SCCL_TOPO_MAX_HOPS (SCCL_TOPO_MAX_NODES * SCCL_TOPO_NODE_TYPES)
// Hardware topology node types.
typedef enum topoNodeType {
    GPU = 0, // Graphics processing unit
    PCI = 1, // PCI bridge / switch
    NVS = 2, // NVSwitch (original comment said "non-volatile memory"; NCCL convention is NVSwitch — confirm)
    CPU = 3, // CPU — actually a NUMA domain
    NIC = 4, // Network interface controller
    NET = 5  // Network endpoint
} topoNodeType_t;
extern const char* topoNodeTypeStr[];
// Link and path type enums are kept numerically aligned where possible.
typedef enum topoLinkType {
    LINK_LOC = 0, // Local (loopback) link
    LINK_NVL = 1, // NVLink link
    // value 2 reserved to mirror path type PATH_NVB; no link equivalent
    LINK_PCI = 3, // PCI link
    // values 4-6 reserved to mirror PATH_PXB / PATH_PXN / PATH_PHB
    LINK_SYS = 7, // Inter-socket (SMP) link
    LINK_NET = 8  // Network link
} topoLinkType_t;
extern const char* topoLinkTypeStr[];
// Path types describing how two topology nodes reach each other.
enum topoPathType {
    PATH_LOC = 0, // Same node
    PATH_NVL = 1, // Connected through NVLink
    PATH_NVB = 2, // Connected through NVLink via an intermediate GPU
    PATH_PIX = 3, // Connected through at most one PCIe bridge
    PATH_PXB = 4, // Connected through multiple PCIe bridges (without the PCIe host bridge)
    PATH_PXN = 5, // GPU to NIC through an intermediate GPU
    PATH_PHB = 6, // Connected through PCIe and the PCIe host bridge
    PATH_SYS = 7, // Connected through PCIe and the SMP interconnect between NUMA nodes
    PATH_NET = 8, // Connected through the network
    PATH_DIS = 9  // Disconnected
};
////////////////////////////////////////////////////////////////////////////////////////////////
struct scclTopoNode;
// A single directed link from one topology node to another.
struct scclTopoLink {
    int type;                   // topoLinkType_t
    float bw;                   // Aggregated bandwidth of this link
    struct scclTopoNode* remNode; // Remote endpoint
};
// An ordered sequence of links forming a path between two nodes.
struct scclTopoLinkList {
    int type;  // topoPathType_t of the whole path
    float bw;  // Bandwidth of the path (bottleneck link)
    int count; // Number of hops in `list`
    struct scclTopoLink* list[SCCL_TOPO_MAX_HOPS];
};
struct scclTopoNode {
    int type;   // Node type (topoNodeType_t)
    int64_t id; // Node id
    // Type-specific data
    union {
        struct {
            int dev;              // NVML device number
            int rank;             // Communicator rank
            int cudaCompCap;      // CUDA compute capability
            int gdrSupport;       // GPUDirect RDMA support
            const char* gcn;      // GCN architecture name
            hipDeviceArch_t arch; // HIP device architecture
        } gpu;                    // GPU node
        struct {
            uint64_t asic;  // ASIC identifier
            int port;       // Port number
            float bw;       // Bandwidth
            float latency;  // Latency
            int gdrSupport; // GPUDirect RDMA support
            int collSupport; // Collective-operation offload support
            int maxChannels; // Maximum number of channels
            int64_t busId;   // PCI bus id
        } net;               // Network node
        struct {
            int arch;           // CPU architecture (topoCpuArch)
            int vendor;         // CPU vendor (topoCpuVendor)
            int model;          // CPU model (topoCpuType)
            cpu_set_t affinity; // CPU affinity mask of this NUMA domain
        } cpu;                  // CPU node
        struct {
            uint64_t device; // PCI device identifier
        } pci;               // PCI node
    };
    int nlinks;                                    // Number of valid entries in `links`
    struct scclTopoLink links[SCCL_TOPO_MAX_LINKS]; // Outgoing links
    // Precomputed paths to every node of each type
    struct scclTopoLinkList* paths[SCCL_TOPO_NODE_TYPES];
    // Scratch marker used during path search
    uint64_t used;
};
// Fixed-capacity array of nodes of one type.
struct scclTopoNodeSet {
    int count;                                      // Number of valid nodes
    struct scclTopoNode nodes[SCCL_TOPO_MAX_NODES]; // Node storage (capacity SCCL_TOPO_MAX_NODES)
};
// The whole machine topology: one node set per node type plus global attributes.
struct scclTopoSystem {
    struct scclTopoNodeSet nodes[SCCL_TOPO_NODE_TYPES]; // Node sets indexed by topoNodeType_t
    float maxBw;            // Maximum bandwidth in the system
    float baseBw;           // Base bandwidth
    float totalBw;          // Total system bandwidth
    int type;               // System type (topoSysType flags)
    int nRanks;             // Number of ranks in the system
    int netGdrLevel;        // Network GDR level
    int tuning;             // Tuning parameter
    int pivotA2ANumBiRings; // Number of bidirectional rings in pivot A2A mode
    bool pivotA2AEnabled;   // Whether pivot A2A communication mode is enabled
    bool treeDefined;       // Whether a tree structure has been defined
    bool ll128Enabled;      // Whether LL128 mode is enabled
    bool mscclEnabled;      // Whether MSCCL mode is enabled
};
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.0
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
#define ARM_BW 6.0
#define NET_BW 12.0 // 100Gbit
#define VEGA_XGMI_WIDTH 24.0
#define MI200_XGMI_WIDTH 36.0
#define GFX94X_XGMI_WIDTH 48.0
// Intel CPUs convert GPU P2P traffic into 64-byte PCI TLPs, so GPU-to-GPU
// traffic consumes more PCI bandwidth.
#define INTEL_P2P_OVERHEAD(bw) (bw * 6 / 5)
enum topoCpuArch {
    SCCL_TOPO_CPU_ARCH_X86 = 1,
    SCCL_TOPO_CPU_ARCH_POWER = 2,
    SCCL_TOPO_CPU_ARCH_ARM = 3
};
enum topoCpuVendor {
    SCCL_TOPO_CPU_VENDOR_INTEL = 1,
    SCCL_TOPO_CPU_VENDOR_AMD = 2,
    SCCL_TOPO_CPU_VENDOR_ZHAOXIN = 3
};
enum topoCpuType {
    SCCL_TOPO_CPU_TYPE_BDW = 1,
    SCCL_TOPO_CPU_TYPE_SKL = 2,
    SCCL_TOPO_CPU_TYPE_ZEN = 3,
    SCCL_TOPO_CPU_TYPE_ROME = 4,
    SCCL_TOPO_CPU_TYPE_YONGFENG = 5
};
enum topoCpuPattern {
    SCCL_TOPO_PATTERN_BALANCED_TREE = 1,
    SCCL_TOPO_PATTERN_SPLIT_TREE = 2,
    SCCL_TOPO_PATTERN_TREE = 3,
    SCCL_TOPO_PATTERN_RING = 4,
    SCCL_TOPO_PATTERN_NVLS = 5
};
// Removed: `#define SCCL_TOPO_MAX_NODES 256`. It macro-redefined the identifier
// already declared as `static constexpr int SCCL_TOPO_MAX_NODES` above, so any
// later declaration or qualified use of that token would be textually mangled.
extern const char* topoPathTypeStr[];
#define SCCL_TOPO_CPU_INTEL_BDW 1
#define SCCL_TOPO_CPU_INTEL_SKL 2
enum topoSysType {
    SCCL_TOPO_UNDEF = -1,
    SCCL_TOPO_CR8G = 1,
    SCCL_TOPO_4P2H_ROME = 2,
    SCCL_TOPO_GDR_ALL = 4,
    SCCL_TOPO_16P1H = 8,
    SCCL_TOPO_FORCE_INTRA = 16,
    SCCL_TOPO_XGMI_ALL = 32
};
// struct scclTopoComm {
// int type;
// int id;
// int rank;
// int nRanks;
// int node;
// int nNodes;
// int localRank;
// int localRanks;
// bool dmaBufSupport;
// struct scclPeerInfo* peerInfo;
// sccl::hardware::net::scclNet_t* scclNet;
// };
////////////////////////////////////////////////////////////////////////////////////////////////
// Check whether the Hsw driver is present
bool isHswDriverExist();
// Get the number of InfiniBand (IB) devices
int getIBNum();
// Look up a topology node by type and id
scclResult_t scclTopoGetNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id);
// Create a topology node
scclResult_t scclTopoCreateNode(struct scclTopoSystem* system, struct scclTopoNode** node, int type, uint64_t id);
// Remove a topology node
scclResult_t scclTopoRemoveNode(struct scclTopoSystem* system, int type, int id);
// Connect two topology nodes with a link of the given type/bandwidth
scclResult_t scclTopoConnectNodes(struct scclTopoNode* node, struct scclTopoNode* remNode, int type, float bw);
// Build the system topology from XML
scclResult_t scclTopoGetSystemFromXml(struct scclXml* xml, struct scclTopoSystem** topoSystem);
// Print the system topology
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Get the compute-capability range
scclResult_t scclTopoGetCompCap(struct scclTopoSystem* system, int* ccMin, int* ccMax);
// Convert a node id to its array index
scclResult_t scclTopoIdToIndex(struct scclTopoSystem* system, int type, int64_t id, int* index);
// Convert a rank to its GPU array index
scclResult_t scclTopoRankToIndex(struct scclTopoSystem* system, int rank, int* index);
// Convert a device id to a rank
scclResult_t scclTopoDevToRank(struct scclTopoSystem* system, int dev, int* rank);
// Get the XGMI speed for a GCN architecture
float scclTopoXGMISpeed(const char* gcn);
// Get the local NET device for a rank/channel
scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// Get the local GPU for a NET device
scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// Get the CPU type information
scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// Find the CPU affinity for a rank's GPU
scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// Get the number of GPUs
scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// Get the number of network interfaces
scclResult_t scclTopoGetNetCount(struct scclTopoSystem* system, int* count);
// Get the number of NVS nodes
scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// Get the local rank (GPU index) for a global rank
scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // Build the system topology (retired comm-based entry point)
// scclResult_t scclTopoGetSystem(struct scclTopoComm* comm, struct scclTopoSystem** system);
scclResult_t scclTopoGetSystem(struct scclTopoSystem** system);
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#pragma once
#include <string.h>
#include "base.h"
#include "archinfo.h"
#include "xml.h"
// #include "net.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
#define SCCL_TOPO_NODE_TYPES 6
static constexpr int SCCL_TOPO_MAX_NODES = 256;
#define SCCL_TOPO_MAX_LINKS 32
#define SCCL_TOPO_MAX_HOPS (SCCL_TOPO_MAX_NODES * SCCL_TOPO_NODE_TYPES)
// Hardware topology node types.
typedef enum topoNodeType {
    GPU = 0, // Graphics processing unit
    PCI = 1, // PCI bridge / switch
    NVS = 2, // NVSwitch (original comment said "non-volatile memory"; NCCL convention is NVSwitch — confirm)
    CPU = 3, // CPU — actually a NUMA domain
    NIC = 4, // Network interface controller
    NET = 5  // Network endpoint
} topoNodeType_t;
extern const char* topoNodeTypeStr[];
// Link and path type enums are kept numerically aligned where possible.
typedef enum topoLinkType {
    LINK_LOC = 0, // Local (loopback) link
    LINK_NVL = 1, // NVLink link
    // value 2 reserved to mirror path type PATH_NVB; no link equivalent
    LINK_PCI = 3, // PCI link
    // values 4-6 reserved to mirror PATH_PXB / PATH_PXN / PATH_PHB
    LINK_SYS = 7, // Inter-socket (SMP) link
    LINK_NET = 8  // Network link
} topoLinkType_t;
extern const char* topoLinkTypeStr[];
// Path types describing how two topology nodes reach each other.
enum topoPathType {
    PATH_LOC = 0, // Same node
    PATH_NVL = 1, // Connected through NVLink
    PATH_NVB = 2, // Connected through NVLink via an intermediate GPU
    PATH_PIX = 3, // Connected through at most one PCIe bridge
    PATH_PXB = 4, // Connected through multiple PCIe bridges (without the PCIe host bridge)
    PATH_PXN = 5, // GPU to NIC through an intermediate GPU
    PATH_PHB = 6, // Connected through PCIe and the PCIe host bridge
    PATH_SYS = 7, // Connected through PCIe and the SMP interconnect between NUMA nodes
    PATH_NET = 8, // Connected through the network
    PATH_DIS = 9  // Disconnected
};
////////////////////////////////////////////////////////////////////////////////////////////////
struct scclTopoNode;
// A single directed link from one topology node to another.
struct scclTopoLink {
    int type;                   // topoLinkType_t
    float bw;                   // Aggregated bandwidth of this link
    struct scclTopoNode* remNode; // Remote endpoint
};
// An ordered sequence of links forming a path between two nodes.
struct scclTopoLinkList {
    int type;  // topoPathType_t of the whole path
    float bw;  // Bandwidth of the path (bottleneck link)
    int count; // Number of hops in `list`
    struct scclTopoLink* list[SCCL_TOPO_MAX_HOPS];
};
struct scclTopoNode {
    int type; // Node type (topoNodeType_t)
    // Widened from `int` to `int64_t`: the other copy of this header and all
    // id-based APIs (scclTopoCreateNode takes uint64_t, scclTopoIdToIndex takes
    // int64_t, net.busId is int64_t) use 64-bit ids; `int` truncated them.
    int64_t id; // Node id
    // Type-specific data
    union {
        struct {
            int dev;              // NVML device number
            int rank;             // Communicator rank
            int cudaCompCap;      // CUDA compute capability
            int gdrSupport;       // GPUDirect RDMA support
            const char* gcn;      // GCN architecture name
            hipDeviceArch_t arch; // HIP device architecture
        } gpu;                    // GPU node
        struct {
            uint64_t asic;  // ASIC identifier
            int port;       // Port number
            float bw;       // Bandwidth
            float latency;  // Latency
            int gdrSupport; // GPUDirect RDMA support
            int collSupport; // Collective-operation offload support
            int maxChannels; // Maximum number of channels
            int64_t busId;   // PCI bus id
        } net;               // Network node
        struct {
            int arch;           // CPU architecture (topoCpuArch)
            int vendor;         // CPU vendor (topoCpuVendor)
            int model;          // CPU model (topoCpuType)
            cpu_set_t affinity; // CPU affinity mask of this NUMA domain
        } cpu;                  // CPU node
        struct {
            uint64_t device; // PCI device identifier
        } pci;               // PCI node
    };
    int nlinks;                                    // Number of valid entries in `links`
    struct scclTopoLink links[SCCL_TOPO_MAX_LINKS]; // Outgoing links
    // Precomputed paths to every node of each type
    struct scclTopoLinkList* paths[SCCL_TOPO_NODE_TYPES];
    // Scratch marker used during path search
    uint64_t used;
};
// Fixed-capacity array of nodes of one type.
struct scclTopoNodeSet {
    int count;                                      // Number of valid nodes
    struct scclTopoNode nodes[SCCL_TOPO_MAX_NODES]; // Node storage (capacity SCCL_TOPO_MAX_NODES)
};
// The whole machine topology: one node set per node type plus global attributes.
struct scclTopoSystem {
    struct scclTopoNodeSet nodes[SCCL_TOPO_NODE_TYPES]; // Node sets indexed by topoNodeType_t
    float maxBw;            // Maximum bandwidth in the system
    float baseBw;           // Base bandwidth
    float totalBw;          // Total system bandwidth
    int type;               // System type (topoSysType flags)
    int nRanks;             // Number of ranks in the system
    int netGdrLevel;        // Network GDR level
    int tuning;             // Tuning parameter
    int pivotA2ANumBiRings; // Number of bidirectional rings in pivot A2A mode
    bool pivotA2AEnabled;   // Whether pivot A2A communication mode is enabled
    bool treeDefined;       // Whether a tree structure has been defined
    bool ll128Enabled;      // Whether LL128 mode is enabled
    bool mscclEnabled;      // Whether MSCCL mode is enabled
};
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.0
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
#define ARM_BW 6.0
#define NET_BW 12.0 // 100Gbit
#define VEGA_XGMI_WIDTH 24.0
#define MI200_XGMI_WIDTH 36.0
#define GFX94X_XGMI_WIDTH 48.0
// Intel CPUs convert GPU P2P traffic into 64-byte PCI TLPs, so GPU-to-GPU
// traffic consumes more PCI bandwidth.
#define INTEL_P2P_OVERHEAD(bw) (bw * 6 / 5)
enum topoCpuArch {
    SCCL_TOPO_CPU_ARCH_X86 = 1,
    SCCL_TOPO_CPU_ARCH_POWER = 2,
    SCCL_TOPO_CPU_ARCH_ARM = 3
};
enum topoCpuVendor {
    SCCL_TOPO_CPU_VENDOR_INTEL = 1,
    SCCL_TOPO_CPU_VENDOR_AMD = 2,
    SCCL_TOPO_CPU_VENDOR_ZHAOXIN = 3
};
enum topoCpuType {
    SCCL_TOPO_CPU_TYPE_BDW = 1,
    SCCL_TOPO_CPU_TYPE_SKL = 2,
    SCCL_TOPO_CPU_TYPE_ZEN = 3,
    SCCL_TOPO_CPU_TYPE_ROME = 4,
    SCCL_TOPO_CPU_TYPE_YONGFENG = 5
};
enum topoCpuPattern {
    SCCL_TOPO_PATTERN_BALANCED_TREE = 1,
    SCCL_TOPO_PATTERN_SPLIT_TREE = 2,
    SCCL_TOPO_PATTERN_TREE = 3,
    SCCL_TOPO_PATTERN_RING = 4,
    SCCL_TOPO_PATTERN_NVLS = 5
};
// Removed: `#define SCCL_TOPO_MAX_NODES 256`. It macro-redefined the identifier
// already declared as `static constexpr int SCCL_TOPO_MAX_NODES` above, so any
// later declaration or qualified use of that token would be textually mangled.
extern const char* topoPathTypeStr[];
#define SCCL_TOPO_CPU_INTEL_BDW 1
#define SCCL_TOPO_CPU_INTEL_SKL 2
enum topoSysType {
    SCCL_TOPO_UNDEF = -1,
    SCCL_TOPO_CR8G = 1,
    SCCL_TOPO_4P2H_ROME = 2,
    SCCL_TOPO_GDR_ALL = 4,
    SCCL_TOPO_16P1H = 8,
    SCCL_TOPO_FORCE_INTRA = 16,
    SCCL_TOPO_XGMI_ALL = 32
};
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <ctype.h>
#include "check.h"
#include "nvmlwrap.h"
#include "xml.h"
#include "rocm_smi_wrap.h"
#include "archinfo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
/**************/
/* XML Struct */
/* Functions */
/**************/
/** Locate attribute `attrName` on `node`; *index = its slot, or -1 if absent. */
scclResult_t xmlGetAttrIndex(struct scclXmlNode* node, const char* attrName, int* index) {
    *index = -1;
    for(int a = 0; a < node->nAttrs; a++) {
        if(strncmp(node->attrs[a].key, attrName, MAX_STR_LEN) != 0)
            continue;
        *index = a;
        break;
    }
    return scclSuccess;
}
/** Fetch attribute value; *value = NULL when the attribute is absent. */
scclResult_t xmlGetAttr(struct scclXmlNode* node, const char* attrName, const char** value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot == -1)
        *value = NULL;
    else
        *value = node->attrs[slot].value;
    return scclSuccess;
}
/** Like xmlGetAttr, but a missing attribute is an error (with a warning). */
scclResult_t xmlGetAttrStr(struct scclXmlNode* node, const char* attrName, const char** value) {
    SCCLCHECK(xmlGetAttr(node, attrName, value));
    if(*value != NULL)
        return scclSuccess;
    WARN("Attribute %s of node %s not found", attrName, node->name);
    return scclInternalError;
}
/**
 * Read attribute `attrName` of `node` as an integer.
 *
 * @param node     XML node to query.
 * @param attrName Attribute name.
 * @param value    Output: parsed integer (base auto-detected: 0x/octal/decimal).
 * @return scclSuccess, or scclInternalError when the attribute is missing.
 */
scclResult_t xmlGetAttrInt(struct scclXmlNode* node, const char* attrName, int* value) {
    const char* raw;
    SCCLCHECK(xmlGetAttrStr(node, attrName, &raw));
    *value = strtol(raw, NULL, 0);
    return scclSuccess;
}
/**
 * Read attribute `attrName` of `node` as an integer, or `defaultValue` when
 * the attribute does not exist.
 *
 * @param node         XML node to query.
 * @param attrName     Attribute name.
 * @param value        Output: parsed integer or defaultValue.
 * @param defaultValue Fallback when the attribute is absent.
 * @return scclSuccess.
 */
scclResult_t xmlGetAttrIntDefault(struct scclXmlNode* node, const char* attrName, int* value, int defaultValue) {
    const char* raw;
    SCCLCHECK(xmlGetAttr(node, attrName, &raw));
    if(raw == NULL)
        *value = defaultValue;
    else
        *value = strtol(raw, NULL, 0);
    return scclSuccess;
}
// Only set values if not already set
/**
 * Initialize an integer attribute: create it with `value` if absent, leave it
 * untouched if it already exists.
 *
 * @param node     XML node to update.
 * @param attrName Attribute name.
 * @param value    Integer rendered as decimal into the new attribute.
 * @return scclSuccess.
 */
scclResult_t xmlInitAttrInt(struct scclXmlNode* node, const char* attrName, const int value) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
    if(index == -1) {
        index = node->nAttrs++;
        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
        // strncpy does not guarantee NUL termination; terminate explicitly,
        // matching xmlSetAttr / xmlSetAttrInt.
        node->attrs[index].key[MAX_STR_LEN] = '\0';
        snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
    }
    return scclSuccess;
}
/**
 * Initialize a uint64 attribute: create it with `value` (formatted "0x%lx")
 * if absent, leave it untouched if it already exists.
 *
 * @param node     XML node to update.
 * @param attrName Attribute name.
 * @param value    Value stored in hexadecimal form.
 * @return scclSuccess.
 */
scclResult_t xmlInitAttrUint64(struct scclXmlNode* node, const char* attrName, const uint64_t value) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
    if(index == -1) {
        index = node->nAttrs++;
        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
        // strncpy does not guarantee NUL termination; terminate explicitly,
        // matching xmlSetAttr / xmlSetAttrInt.
        node->attrs[index].key[MAX_STR_LEN] = '\0';
        snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value);
    }
    return scclSuccess;
}
/** Read attribute `attrName` of `node` as a float; missing attribute is an error. */
scclResult_t xmlGetAttrFloat(struct scclXmlNode* node, const char* attrName, float* value) {
    const char* raw;
    SCCLCHECK(xmlGetAttrStr(node, attrName, &raw));
    *value = strtof(raw, NULL);
    return scclSuccess;
}
/**
 * Initialize a float attribute: create it with `value` (formatted "%f") if
 * absent, leave it untouched if it already exists.
 */
scclResult_t xmlInitAttrFloat(struct scclXmlNode* node, const char* attrName, const float value) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
    if(index == -1) {
        index = node->nAttrs++;
        strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
        // strncpy does not guarantee NUL termination; terminate explicitly,
        // matching xmlSetAttr / xmlSetAttrFloat.
        node->attrs[index].key[MAX_STR_LEN] = '\0';
        snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value);
    }
    return scclSuccess;
}
/** Find the first node named `tagName`; *node = NULL when no tag matches. */
scclResult_t xmlFindTag(struct scclXml* xml, const char* tagName, struct scclXmlNode** node) {
    *node = NULL;
    struct scclXmlNode* cur = xml->nodes;
    for(int i = 0; i < xml->maxIndex; i++, cur++) {
        if(strcmp(cur->name, tagName) != 0)
            continue;
        *node = cur;
        break;
    }
    return scclSuccess;
}
/** Find the first `tagName` node whose attribute `attrName` equals `attrValue`; NULL on miss. */
scclResult_t xmlFindTagKv(struct scclXml* xml, const char* tagName, struct scclXmlNode** node, const char* attrName, const char* attrValue) {
    *node = NULL;
    for(int i = 0; i < xml->maxIndex; i++) {
        struct scclXmlNode* cand = xml->nodes + i;
        if(strcmp(cand->name, tagName) != 0)
            continue;
        const char* v;
        SCCLCHECK(xmlGetAttr(cand, attrName, &v));
        if(v != NULL && strcmp(v, attrValue) == 0) {
            *node = cand;
            break;
        }
    }
    return scclSuccess;
}
/** Set attribute `attrName` to `value`, creating the attribute if needed. */
scclResult_t xmlSetAttr(struct scclXmlNode* node, const char* attrName, const char* value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot < 0) {
        slot = node->nAttrs;
        node->nAttrs = slot + 1;
        strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
        node->attrs[slot].key[MAX_STR_LEN] = '\0';
    }
    strncpy(node->attrs[slot].value, value, MAX_STR_LEN);
    node->attrs[slot].value[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
/** Set attribute `attrName` to `value` only when it does not already exist. */
scclResult_t xmlSetAttrIfUnset(struct scclXmlNode* node, const char* attrName, const char* value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot >= 0)
        return scclSuccess; // already present: keep the existing value
    slot = node->nAttrs;
    node->nAttrs = slot + 1;
    strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
    node->attrs[slot].key[MAX_STR_LEN] = '\0';
    strncpy(node->attrs[slot].value, value, MAX_STR_LEN);
    node->attrs[slot].value[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
/** Set attribute `attrName` to the decimal rendering of `value`, creating it if needed. */
scclResult_t xmlSetAttrInt(struct scclXmlNode* node, const char* attrName, const int value) {
    int slot;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &slot));
    if(slot < 0) {
        slot = node->nAttrs;
        node->nAttrs = slot + 1;
        strncpy(node->attrs[slot].key, attrName, MAX_STR_LEN);
        node->attrs[slot].key[MAX_STR_LEN] = '\0';
    }
    snprintf(node->attrs[slot].value, MAX_STR_LEN, "%d", value);
    node->attrs[slot].value[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
// Store a float attribute in "%g" form, creating the attribute slot when it
// does not exist yet.
scclResult_t xmlSetAttrFloat(struct scclXmlNode* node, const char* attrName, const float value) {
    int idx;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &idx));
    if(idx < 0) {
        idx = node->nAttrs++;
        strncpy(node->attrs[idx].key, attrName, MAX_STR_LEN);
        node->attrs[idx].key[MAX_STR_LEN] = '\0';
    }
    snprintf(node->attrs[idx].value, MAX_STR_LEN, "%g", value);
    node->attrs[idx].value[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
// Remove attrName from the node by shifting every later attribute one slot
// down. A missing attribute is silently ignored.
scclResult_t xmlUnsetAttr(struct scclXmlNode* node, const char* attrName) {
    int idx;
    SCCLCHECK(xmlGetAttrIndex(node, attrName, &idx));
    if(idx == -1)
        return scclSuccess;
    for(int dst = idx; dst + 1 < node->nAttrs; dst++) {
        strcpy(node->attrs[dst].key, node->attrs[dst + 1].key);
        strcpy(node->attrs[dst].value, node->attrs[dst + 1].value);
    }
    node->nAttrs--;
    return scclSuccess;
}
// Return the first direct child of 'node' named subName, or NULL when there
// is none; absence is not an error.
scclResult_t xmlGetSub(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub) {
    *sub = NULL;
    for(int s = 0; s < node->nSubs; s++) {
        struct scclXmlNode* child = node->subs[s];
        if(strcmp(child->name, subName) != 0)
            continue;
        *sub = child;
        break;
    }
    return scclSuccess;
}
// Return the first direct child named subName whose attribute attrName
// equals attrValue; *sub stays NULL when no child matches.
scclResult_t xmlGetSubKv(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const char* attrValue) {
    *sub = NULL;
    for(int s = 0; s < node->nSubs; s++) {
        struct scclXmlNode* child = node->subs[s];
        if(strcmp(child->name, subName) != 0)
            continue;
        const char* attr;
        SCCLCHECK(xmlGetAttr(child, attrName, &attr));
        if(attr != NULL && strcmp(attr, attrValue) == 0) {
            *sub = child;
            break;
        }
    }
    return scclSuccess;
}
// Integer convenience wrapper around xmlGetSubKv: formats attrValue in
// decimal and searches for a matching child.
scclResult_t xmlGetSubKvInt(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const int attrValue) {
    // 12 bytes fit any 32-bit int including "-2147483648" plus the NUL;
    // the previous 10-byte buffer silently truncated large negative values,
    // making the lookup compare against a mangled string.
    char strValue[12];
    snprintf(strValue, sizeof(strValue), "%d", attrValue);
    SCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue));
    return scclSuccess;
}
// Allocate a fresh node from the XML pool, name it subName and attach it
// under parent (or leave it detached when parent is NULL).
scclResult_t xmlAddNode(struct scclXml* xml, struct scclXmlNode* parent, const char* subName, struct scclXmlNode** sub) {
    if(xml->maxIndex == MAX_NODES) {
        WARN("Error : too many XML nodes (max %d)", MAX_NODES);
        return scclInternalError;
    }
    // Also guard the parent's fixed-size child array; overflowing subs[]
    // would corrupt the adjacent fields of the parent node.
    if(parent && parent->nSubs == MAX_SUBS) {
        WARN("Error : too many XML subnodes (max %d)", MAX_SUBS);
        return scclInternalError;
    }
    struct scclXmlNode* s = xml->nodes + xml->maxIndex++;
    s->nSubs = 0;
    s->nAttrs = 0;
    *sub = s;
    s->parent = parent;
    if(parent)
        parent->subs[parent->nSubs++] = s;
    strncpy(s->name, subName, MAX_STR_LEN);
    s->name[MAX_STR_LEN] = '\0';
    return scclSuccess;
}
// Detach 'node' from its parent's child list. The node's storage stays in
// the pool; it is only marked unused via NODE_TYPE_NONE.
scclResult_t xmlRemoveNode(struct scclXmlNode* node) {
    node->type = NODE_TYPE_NONE;
    struct scclXmlNode* parent = node->parent;
    if(parent == NULL)
        return scclSuccess;
    // Compact the child array over the removed entry.
    int shift = 0;
    for(int s = 0; s < parent->nSubs; s++) {
        if(parent->subs[s] == node)
            shift = 1;
        else if(shift)
            parent->subs[s - 1] = parent->subs[s];
    }
    // Only shrink the child count when the node was actually found;
    // decrementing unconditionally would drop an unrelated child if the
    // parent link were stale.
    if(shift)
        parent->nSubs--;
    return scclSuccess;
}
// Map a string (prefix match) to its integer value using 'dict'. The table
// ends with a {NULL, fallback} sentinel whose value is returned when no
// entry matches.
scclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) {
    struct kvDict* entry = dict;
    while(entry->str != NULL) {
        if(strncmp(str, entry->str, strlen(entry->str)) == 0) {
            *value = entry->value;
            return scclSuccess;
        }
        entry++;
    }
    // entry now points at the sentinel; its value field is the fallback.
    INFO(SCCL_LOG_GRAPH, "KV Convert to int : could not find value of '%s' in dictionary, falling back to %d", str, entry->value);
    *value = entry->value;
    return scclSuccess;
}
// Reverse lookup: map an integer back to its dictionary string. Unknown
// values are an internal error.
scclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) {
    for(struct kvDict* entry = dict; entry->str != NULL; entry++) {
        if(entry->value == value) {
            *str = entry->str;
            return scclSuccess;
        }
    }
    WARN("KV Convert to str : could not find value %d in dictionary", value);
    return scclInternalError;
}
namespace xml {
/*******************/
/* XML File Parser */
/*******************/
// Read exactly one character from the stream; hitting EOF mid-document is a
// parse error.
scclResult_t xmlGetChar(FILE* file, char* c) {
    size_t got = fread(c, 1, 1, file);
    if(got != 1) {
        WARN("XML Parse : Unexpected EOF");
        return scclInternalError;
    }
    return scclSuccess;
}
// Parse an attribute value. Normally the value is quoted (single or double
// quotes); with INT_OK a bare integer value is also accepted. The character
// following the value is returned in *last.
scclResult_t xmlGetValue(FILE* file, char* value, char* last) {
    char c;
    SCCLCHECK(xmlGetChar(file, &c));
    if(c != '"' && c != '\'') {
#if INT_OK
        int o = 0;
        do {
            value[o++] = c;
            SCCLCHECK(xmlGetChar(file, &c));
        } while(c >= '0' && c <= '9');
        value[o] = '\0';
        *last = c;
        return scclSuccess;
#else
        WARN("XML Parse : Expected (double) quote.");
        return scclInternalError;
#endif
    }
    // Remember which quote character opened the value so a value opened with
    // '\'' is also closed by '\''. Previously only '"' terminated the loop,
    // so single-quoted values ran past their closing quote.
    const char quote = c;
    int o = 0;
    do {
        SCCLCHECK(xmlGetChar(file, &c));
        value[o++] = c;
    } while(c != quote);
    // Overwrite the stored closing quote with the terminator.
    value[o - 1] = '\0';
    SCCLCHECK(xmlGetChar(file, last));
    return scclSuccess;
}
// Read one token from 'file' into 'name'. A token either ends at a separator
// (space, '>', '/', CR or LF), which is reported via '*last', or at '=' in
// which case the attribute value is parsed into 'value' by xmlGetValue().
scclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
    char c;
    char* ptr = name;
    int o = 0;
    do {
        SCCLCHECK(xmlGetChar(file, &c));
        if(c == '=') {
            ptr[o] = '\0';
            // '=' is only legal after an attribute key; element names (which
            // pass value == NULL) cannot take a value.
            if(value == NULL) {
                WARN("XML Parse : Unexpected value with name %s", ptr);
                return scclInternalError;
            }
            return xmlGetValue(file, value, last);
        }
        ptr[o] = c;
        if(o == MAX_STR_LEN - 1) {
            ptr[o] = '\0';
            WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN);
            return scclInternalError;
        }
        o++;
    } while(c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r');
    // The terminating separator was also stored in the buffer; cut it off
    // here and report it through *last instead.
    ptr[o - 1] = '\0';
    *last = c;
    return scclSuccess;
}
// Shift the 3-chars string by one char and append c at the end
#define SHIFT_APPEND(s, c) \
    do { \
        s[0] = s[1]; \
        s[1] = s[2]; \
        s[2] = c; \
    } while(0)
// Consume an XML comment ("<!-- ... -->"). 'start' holds the characters of
// the token name already read past "!--", and 'next' the character read
// after the name; both are replayed into the 3-char window first.
scclResult_t xmlSkipComment(FILE* file, char* start, char next) {
    // Start from something neutral with \0 at the end.
    char end[4] = "...";
    // Inject all trailing chars from previous reads. We don't need
    // to check for --> here because there cannot be a > in the name.
    // (size_t index also avoids the signed/unsigned strlen comparison.)
    for(size_t i = 0; i < strlen(start); i++)
        SHIFT_APPEND(end, start[i]);
    SHIFT_APPEND(end, next);
    // Stop when we find "-->"
    while(strcmp(end, "-->") != 0) {
        // Read the byte into a char: the previous 'int c' only had its first
        // byte written by fread, leaving the rest indeterminate, and the
        // int-to-char truncation picked the wrong byte on big-endian hosts.
        char c;
        if(fread(&c, 1, 1, file) != 1) {
            WARN("XML Parse error : unterminated comment");
            return scclInternalError;
        }
        SHIFT_APPEND(end, c);
    }
    return scclSuccess;
}
// Parse the next XML element from 'file' into 'node', filling in its name,
// type (open/close/self-closing) and attributes. Comments are skipped
// transparently. On plain EOF the type stays NODE_TYPE_NONE and the call
// still returns scclSuccess.
scclResult_t xmlGetNode(FILE* file, struct scclXmlNode* node) {
    node->type = NODE_TYPE_NONE;
    char c = ' ';
    // Skip leading whitespace; EOF here simply means "no more nodes".
    while(c == ' ' || c == '\n' || c == '\r') {
        if(fread(&c, 1, 1, file) == 0)
            return scclSuccess;
    }
    if(c != '<') {
        WARN("XML Parse error : expecting '<', got '%c'", c);
        return scclInternalError;
    }
    // Read XML element name
    SCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
    // Check for comments
    if(strncmp(node->name, "!--", 3) == 0) {
        SCCLCHECK(xmlSkipComment(file, node->name + 3, c));
        // Retry: return the first real element after the comment.
        return xmlGetNode(file, node);
    }
    // Check for closing tag
    if(node->name[0] == '\0' && c == '/') {
        node->type = NODE_TYPE_CLOSE;
        // Re-read the name, we got '/' in the first call
        SCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
        if(c != '>') {
            WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
            return scclInternalError;
        }
        return scclSuccess;
    }
    node->type = NODE_TYPE_OPEN;
    // Get Attributes
    int a = 0;
    while(c == ' ') {
        // attrs[] has one extra slot precisely so overflow attributes can be
        // parsed (consumed) without being kept.
        SCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
        if(a == MAX_ATTR_COUNT) {
            INFO(SCCL_LOG_TOPO, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
            // Actually we need to still consume the extra attributes so we have an extra one.
        } else
            a++;
    }
    node->nAttrs = a;
    if(c == '/') {
        // Self-closing element: consume the rest of the tag up to '>'.
        node->type = NODE_TYPE_SINGLE;
        char str[MAX_STR_LEN];
        SCCLCHECK(xmlGetToken(file, str, NULL, &c));
    }
    if(c != '>') {
        WARN("XML Parse : expected >, got '%c'", c);
        return scclInternalError;
    }
    return scclSuccess;
}
// Callback invoked when the parser meets a recognized element; it receives
// the input stream, the document being built, and the freshly parsed node.
typedef scclResult_t (*xmlHandlerFunc_t)(FILE*, struct scclXml*, struct scclXmlNode*);
// Associates an element name with the handler that parses its subtree.
struct xmlHandler {
    const char* name;
    xmlHandlerFunc_t func;
};
// Recursively parse the children of 'head' (NULL for the document top level).
// Elements matching one of 'handlers' are kept in the tree and parsed by
// their handler; unknown elements are consumed and discarded. Returns when
// head's closing tag (or, at top level, EOF) is reached.
scclResult_t xmlLoadSub(FILE* file, struct scclXml* xml, struct scclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
    // A self-closing element has no children to parse.
    if(head && head->type == NODE_TYPE_SINGLE)
        return scclSuccess;
    while(1) {
        if(xml->maxIndex == MAX_NODES) {
            WARN("Error : XML parser is limited to 1024 nodes");
            return scclInternalError;
        }
        // Parse into the next free pool slot; maxIndex is only advanced below
        // when the element is recognized and kept, so unrecognized elements
        // reuse the same slot.
        struct scclXmlNode* node = xml->nodes + xml->maxIndex;
        memset(node, 0, sizeof(struct scclXmlNode));
        SCCLCHECK(xmlGetNode(file, node));
        if(node->type == NODE_TYPE_NONE) {
            // EOF: only valid at the top level; inside an element it means
            // the closing tag is missing.
            if(head) {
                WARN("XML Parse : unterminated %s", head->name);
                return scclInternalError;
            } else {
                // All done
                return scclSuccess;
            }
        }
        if(head && node->type == NODE_TYPE_CLOSE) {
            // The closing tag must match the element we are inside of.
            if(strcmp(node->name, head->name) != 0) {
                WARN("XML Mismatch : %s / %s", head->name, node->name);
                return scclInternalError;
            }
            return scclSuccess;
        }
        int found = 0;
        for(int h = 0; h < nHandlers; h++) {
            if(strcmp(node->name, handlers[h].name) == 0) {
                // Keep the node: link it under head and commit the pool slot.
                if(head)
                    head->subs[head->nSubs++] = node;
                node->parent = head;
                node->nSubs = 0;
                xml->maxIndex++;
                SCCLCHECK(handlers[h].func(file, xml, node));
                found = 1;
                break;
            }
        }
        if(!found) {
            if(nHandlers)
                INFO(SCCL_LOG_TOPO, "Ignoring element %s", node->name);
            // Consume the unknown element's whole subtree without keeping it.
            SCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0));
        }
    }
}
/**************/
/* XML Writer */
/**************/
// Recursively pretty-print 'node' and its subtree to 'file', indenting each
// nesting level by two spaces.
scclResult_t scclTopoDumpXmlRec(int indent, FILE* file, struct scclXmlNode* node) {
    // "%*s" with an empty string emits exactly 'indent' spaces.
    fprintf(file, "%*s<%s", indent, "", node->name);
    for(int a = 0; a < node->nAttrs; a++) {
        fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value);
    }
    if(node->nSubs > 0) {
        fprintf(file, ">\n");
        for(int s = 0; s < node->nSubs; s++) {
            SCCLCHECK(scclTopoDumpXmlRec(indent + 2, file, node->subs[s]));
        }
        fprintf(file, "%*s</%s>\n", indent, "", node->name);
    } else {
        // Childless nodes are emitted as self-closing elements.
        fprintf(file, "/>\n");
    }
    return scclSuccess;
}
/****************************************/
/* Parser rules for our specific format */
/****************************************/
scclResult_t scclTopoXmlLoadNvlink(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // An <xgmi> element carries only attributes; consume its subtree without
    // any child handlers.
    SCCLCHECK(xmlLoadSub(file, xml, head, /*handlers=*/NULL, /*nHandlers=*/0));
    return scclSuccess;
}
scclResult_t scclTopoXmlLoadGpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // A <gpu> element may contain <xgmi> link descriptions.
    struct xmlHandler gpuHandlers[] = {{"xgmi", scclTopoXmlLoadNvlink}};
    SCCLCHECK(xmlLoadSub(file, xml, head, gpuHandlers, 1));
    return scclSuccess;
}
scclResult_t scclTopoXmlLoadNet(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // <net> elements are leaves; consume without child handlers.
    SCCLCHECK(xmlLoadSub(file, xml, head, /*handlers=*/NULL, /*nHandlers=*/0));
    return scclSuccess;
}
scclResult_t scclTopoXmlLoadNic(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // A <nic> element contains one or more <net> children.
    struct xmlHandler nicHandlers[] = {{"net", scclTopoXmlLoadNet}};
    SCCLCHECK(xmlLoadSub(file, xml, head, nicHandlers, 1));
    return scclSuccess;
}
scclResult_t scclTopoXmlLoadPci(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // A <pci> element nests other <pci> switches, <gpu> and <nic> devices.
    struct xmlHandler pciHandlers[] = {{"pci", scclTopoXmlLoadPci}, {"gpu", scclTopoXmlLoadGpu}, {"nic", scclTopoXmlLoadNic}};
    SCCLCHECK(xmlLoadSub(file, xml, head, pciHandlers, 3));
    return scclSuccess;
}
scclResult_t scclTopoXmlLoadCpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // A <cpu> root complex contains <pci> trees and directly-attached <nic>s.
    struct xmlHandler cpuHandlers[] = {{"pci", scclTopoXmlLoadPci}, {"nic", scclTopoXmlLoadNic}};
    SCCLCHECK(xmlLoadSub(file, xml, head, cpuHandlers, 2));
    return scclSuccess;
}
// Root handler: validate the topology file version, log the (optional)
// topology name, then descend into the <cpu> children.
scclResult_t scclTopoXmlLoadSystem(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    int version;
    SCCLCHECK(xmlGetAttrInt(head, "version", &version));
    if(version != SCCL_TOPO_XML_VERSION) {
        WARN("XML Topology has wrong version %d, %d needed", version, SCCL_TOPO_XML_VERSION);
        return scclInvalidUsage;
    }
    const char* topoName;
    SCCLCHECK(xmlGetAttr(head, "name", &topoName));
    if(topoName == NULL)
        INFO(SCCL_LOG_TOPO, "Loading unnamed topology");
    else
        INFO(SCCL_LOG_TOPO, "Loading topology %s", topoName);
    struct xmlHandler sysHandlers[] = {{"cpu", scclTopoXmlLoadCpu}};
    SCCLCHECK(xmlLoadSub(file, xml, head, sysHandlers, 1));
    return scclSuccess;
}
/**********************/
/* XML creation */
/* from autodetection */
/**********************/
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
// Copy 'size' bytes from src to dst, lower-casing each byte on the way.
// Used to normalize PCI bus IDs before building sysfs paths.
static void memcpylower(char* dst, const char* src, const size_t size) {
    // size_t index avoids the signed/unsigned comparison the old 'int i' had
    // against the size_t 'size' parameter.
    for(size_t i = 0; i < size; i++)
        dst[i] = tolower(src[i]);
    return;
}
// Resolve the sysfs device directory for a PCI busId by following the
// /sys/class/pci_bus symlink. On success the caller owns the returned
// realpath() allocation and must free() it; on failure *path is NULL.
static scclResult_t getPciPath(const char* busId, char** path) {
    // Template patched in place below: first the "0000:00" bus segment, then
    // the full "0000:00:00.0" device segment, both lower-cased.
    char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
    memcpylower(busPath + sizeof("/sys/class/pci_bus/") - 1, busId, BUSID_REDUCED_SIZE - 1);
    memcpylower(busPath + sizeof("/sys/class/pci_bus/0000:00/../../") - 1, busId, BUSID_SIZE - 1);
    *path = realpath(busPath, NULL);
    if(*path == NULL) {
        WARN("Could not find real path of %s", busPath);
        return scclSystemError;
    }
    return scclSuccess;
}
// Read path/fileName from sysfs and store its contents as attrName on the
// node. An unreadable or empty file leaves the attribute untouched.
scclResult_t scclTopoSetAttrFromSys(struct scclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) {
    char strValue[MAX_STR_LEN];
    SCCLCHECK(scclTopoGetStrFromSys(path, fileName, strValue));
    const bool haveValue = (strValue[0] != '\0');
    if(haveValue) {
        SCCLCHECK(xmlSetAttr(pciNode, attrName, strValue));
    }
    INFO(SCCL_LOG_TOPO, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
    return scclSuccess;
}
// Fill in missing CPU attributes on cpuNode: the NUMA affinity mask (read
// from sysfs), the CPU architecture, and on x86_64 the vendor string and
// family/model ids obtained via the CPUID instruction.
scclResult_t scclTopoGetXmlFromCpu(struct scclXmlNode* cpuNode, struct scclXml* xml) {
    int index;
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index));
    if(index == -1) {
        const char* numaId;
        SCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId));
        if(numaId == NULL) {
            WARN("GetXmlFromCpu : could not find CPU numa ID.");
            return scclInternalError;
        }
        // Set affinity
        // NOTE(review): the template only leaves room for a 4-digit node id;
        // confirm numaid cannot exceed that before trusting this sprintf.
        char cpumaskPath[] = "/sys/devices/system/node/node0000";
        sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId);
        SCCLCHECK(scclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
    }
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index));
    if(index == -1) {
        // Fill CPU type / vendor / model
#if defined(__PPC__)
        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64"));
#elif defined(__aarch64__)
        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64"));
#elif defined(__x86_64__)
        SCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64"));
#endif
    }
#if defined(__x86_64__)
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index));
    if(index == -1) {
        // CPUID leaf 0 returns the 12-byte vendor string in EBX/EDX/ECX.
        union {
            struct {
                // CPUID 0 String register order
                uint32_t ebx;
                uint32_t edx;
                uint32_t ecx;
            };
            char vendor[12];
        } cpuid0;
        asm volatile("cpuid" : "=b"(cpuid0.ebx), "=c"(cpuid0.ecx), "=d"(cpuid0.edx) : "a"(0) : "memory");
        // Copy into a NUL-terminated buffer before storing as an attribute.
        char vendor[13];
        strncpy(vendor, cpuid0.vendor, 12);
        vendor[12] = '\0';
        SCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor));
    }
    SCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index));
    if(index == -1) {
        // CPUID leaf 1: EAX carries stepping/model/family plus the extended
        // model/family fields.
        union {
            struct {
                unsigned steppingId : 4;
                unsigned modelId : 4;
                unsigned familyId : 4;
                unsigned processorType : 2;
                unsigned resv0 : 2;
                unsigned extModelId : 4;
                unsigned extFamilyId : 8;
                unsigned resv1 : 4;
            };
            uint32_t val;
        } cpuid1;
        asm volatile("cpuid" : "=a"(cpuid1.val) : "a"(1) : "memory");
        // Combine base and extended fields into the effective family/model.
        int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4);
        int modelId = cpuid1.modelId + (cpuid1.extModelId << 4);
        SCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId));
        SCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId));
    }
#endif
    return scclSuccess;
}
// Look up the <pci> node for busId, creating a detached one (its parent is
// attached later by scclTopoGetXmlFromSys) when it does not exist yet.
scclResult_t scclTopoGetPciNode(struct scclXml* xml, const char* busId, struct scclXmlNode** pciNode) {
    SCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));
    if(*pciNode != NULL)
        return scclSuccess;
    SCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
    SCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
    return scclSuccess;
}
// Check whether a string is in BDF format or not.
// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits.
// There can be trailing chars.
int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }
int checkBDFFormat(char* bdf) {
    if(bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.')
        return 0;
    // Positions of the hex digits in "BBBB:BB:DD.F" (4, 7 and 10 are the
    // separators checked above).
    static const int hexPos[] = {0, 1, 2, 3, 5, 6, 8, 9, 11};
    // The previous code wrote isHex(bdf[i] == 0) for every digit but the
    // first, placing the comparison inside the call, so only bdf[0] was
    // actually validated.
    for(size_t i = 0; i < sizeof(hexPos) / sizeof(hexPos[0]); i++) {
        if(isHex(bdf[hexPos[i]]) == 0)
            return 0;
    }
    return 1;
}
// Populate a <pci> node's attributes (class, vendor/device ids, link speed
// and width) from its sysfs directory, then discover its parent — an upper
// PCI switch or the owning CPU root complex — and recurse upward, building
// the topology tree bottom-up.
scclResult_t scclTopoGetXmlFromSys(struct scclXmlNode* pciNode, struct scclXml* xml) {
    // Fill info, then parent
    const char* busId;
    SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
    char* path = NULL;
    // Best effort: a missing sysfs path is tolerated (getPciPath logs the
    // warning); every attribute below then falls back to an empty value.
    getPciPath(busId, &path);
    if(path) {
        SCCLCHECK(scclTopoSetAttrFromSys(pciNode, path, "class", "class"));
    }
    int index;
    SCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
    if(index == -1) {
        if(path)
            scclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor");
    }
    SCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
    if(index == -1) {
        if(path)
            scclTopoSetAttrFromSys(pciNode, path, "device", "device");
    }
    SCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
    if(index == -1) {
        if(path)
            scclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor");
    }
    SCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
    if(index == -1) {
        if(path)
            scclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device");
    }
    SCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
    if(index == -1) {
        if(path) {
            // Effective speed is the min of the device's and the upstream
            // port's max_link_speed.
            char deviceSpeedStr[MAX_STR_LEN];
            float deviceSpeed;
            SCCLCHECK(scclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr));
            sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed);
            char portSpeedStr[MAX_STR_LEN];
            float portSpeed;
            SCCLCHECK(scclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr));
            if(portSpeedStr[0])
                sscanf(portSpeedStr, "%f GT/s", &portSpeed);
            else
                portSpeed = deviceSpeed;
            SCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? portSpeedStr : deviceSpeedStr));
        } else {
            SCCLCHECK(xmlSetAttr(pciNode, "link_speed", ""));
        }
    }
    SCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index));
    if(index == -1) {
        if(path) {
            // Same min-of-device-and-port rule for the link width.
            char strValue[MAX_STR_LEN];
            SCCLCHECK(scclTopoGetStrFromSys(path, "max_link_width", strValue));
            int deviceWidth = strtol(strValue, NULL, 0);
            SCCLCHECK(scclTopoGetStrFromSys(path, "../max_link_width", strValue));
            int portWidth;
            if(strValue[0])
                portWidth = strtol(strValue, NULL, 0);
            else
                portWidth = deviceWidth;
            SCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth, portWidth)));
        } else {
            SCCLCHECK(xmlSetAttr(pciNode, "link_width", ""));
        }
    }
    struct scclXmlNode* parent = pciNode->parent;
    if(parent == NULL) {
        if(path) {
            // Save that for later in case next step is a CPU
            char numaIdStr[MAX_STR_LEN];
            SCCLCHECK(scclTopoGetStrFromSys(path, "numa_node", numaIdStr));
            // Workaround kernel bug for now
            if(strcmp(numaIdStr, "-1") == 0)
                strcpy(numaIdStr, "0");
            // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI
            // switch, or stop if we reach a CPU root complex.
            int slashCount = 0;
            int parentOffset;
            for(parentOffset = strlen(path) - 1; parentOffset > 0; parentOffset--) {
                if(path[parentOffset] == '/') {
                    slashCount++;
                    // Truncate path in place at this '/' so path now names the
                    // parent directory.
                    path[parentOffset] = '\0';
                    int start = parentOffset - 1;
                    while(start > 0 && path[start] != '/')
                        start--;
                    // Check whether the parent path looks like "BBBB:BB:DD.F" or not.
                    if(checkBDFFormat(path + start + 1) == 0) {
                        // This a CPU root complex. Create a CPU tag and stop there.
                        struct scclXmlNode* topNode;
                        SCCLCHECK(xmlFindTag(xml, "system", &topNode));
                        SCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
                        if(parent == NULL) {
                            SCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
                            SCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
                        }
                    } else if(slashCount == 2) {
                        // Continue on the upper PCI switch
                        for(int i = strlen(path) - 1; i > 0; i--) {
                            if(path[i] == '/') {
                                SCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path + i + 1));
                                if(parent == NULL) {
                                    SCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
                                    SCCLCHECK(xmlSetAttr(parent, "busid", path + i + 1));
                                }
                                break;
                            }
                        }
                    }
                }
                if(parent)
                    break;
            }
        } else {
            // No information on /sys, attach GPU to unknown CPU
            SCCLCHECK(xmlFindTagKv(xml, "cpu", &parent, "numaid", "-1"));
            if(parent == NULL) {
                struct scclXmlNode* topNode;
                SCCLCHECK(xmlFindTag(xml, "system", &topNode));
                SCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
                SCCLCHECK(xmlSetAttr(parent, "numaid", "-1"));
                SCCLCHECK(scclTopoGetXmlFromCpu(parent, xml));
            }
        }
        // Link this node under the parent we just found/created.
        pciNode->parent = parent;
        parent->subs[parent->nSubs++] = pciNode;
    }
    // Recurse upward: keep filling parent PCI switches until the CPU is hit.
    if(strcmp(parent->name, "pci") == 0) {
        SCCLCHECK(scclTopoGetXmlFromSys(parent, xml));
    } else if(strcmp(parent->name, "cpu") == 0) {
        SCCLCHECK(scclTopoGetXmlFromCpu(parent, xml));
    }
    free(path);
    return scclSuccess;
}
// Create/complete the <gpu> child of a <pci> node: device index, compute
// capability ("sm"), GCN arch name, device arch bits, and the XGMI link list
// discovered via rocm-smi. Returns the gpu node in *gpuNodeRet, or NULL when
// no usable HIP device index could be determined.
scclResult_t scclTopoGetXmlFromGpu(struct scclXmlNode* pciNode, uint32_t rocmDev, struct scclXml* xml, struct scclXmlNode** gpuNodeRet) {
    struct scclXmlNode* gpuNode = NULL;
    SCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode));
    if(gpuNode == NULL)
        SCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode));
    int index = -1;
    int dev = -1;
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
    if(index == -1) {
        // rocmDev == -1 relies on the unsigned/signed comparison converting
        // -1 to UINT32_MAX; it flags "index unknown, resolve via bus id".
        if(rocmDev == -1) {
            const char* busId;
            SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
            if(busId == NULL || hipDeviceGetByPCIBusId(&dev, busId) != hipSuccess)
                dev = -1;
        } else {
            dev = rocmDev;
        }
        SCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
    }
    SCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
    if(dev == -1) {
        // Device not visible to HIP: report "no gpu" to the caller.
        *gpuNodeRet = NULL;
        return scclSuccess;
    }
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
    if(index == -1) {
        int hipMajor, hipMinor;
        hipDeviceProp_t devProp;
        // NOTE(review): properties are queried for device 0, not 'dev' —
        // confirm this is intentional for the sm/gcn/arch attributes below.
        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
        hipMajor = devProp.major;
        hipMinor = devProp.minor;
        SCCLCHECK(xmlSetAttrInt(gpuNode, "sm", hipMajor * 10 + hipMinor));
    }
    int sm;
    SCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm));
    const char* gcn;
    const char* gcnArchName;
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "gcn", &index));
    if(index == -1) {
        hipDeviceProp_t devProp;
        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
        // extract only the releveant info from the gcnArchName attribute
        // e.g.: convert "gfx908:sramecc+:xnack-" to "gfx908"
        char gcnArchNameSubstr[6];
        GcnArchNameFormat(devProp.gcnArchName, gcnArchNameSubstr);
        gcn = gcnArchNameSubstr;
        SCCLCHECK(xmlSetAttr(gpuNode, "gcn", gcn));
    }
    // Normalize the stored value to the canonical arch name.
    SCCLCHECK(xmlGetAttr(gpuNode, "gcn", &gcn));
    convertGcnArchToGcnArchName(gcn, &gcnArchName);
    SCCLCHECK(xmlSetAttr(gpuNode, "gcn", gcnArchName));
    scclHipDeviceArch_t arch;
    SCCLCHECK(xmlGetAttrIndex(gpuNode, "arch", &index));
    if(index == -1) {
        hipDeviceProp_t devProp;
        HIPCHECK(hipGetDeviceProperties(&devProp, 0));
        // Store the arch feature bits as a single int attribute (see the
        // scclHipDeviceArch_t union).
        memcpy(&arch.arch, &devProp.arch, sizeof(hipDeviceArch_t));
        SCCLCHECK(xmlSetAttrInt(gpuNode, "arch", arch.value));
    }
    SCCLCHECK(xmlGetAttrInt(gpuNode, "arch", &arch.value));
    struct scclXmlNode* nvlNode = NULL;
    SCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode));
    if(nvlNode == NULL) {
        // Discover XGMI peers via rocm-smi and add one <xgmi> child per
        // reachable device, keyed by the peer's lower-cased bus id.
        const char* busId;
        SCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
        uint32_t deviceCnt;
        SCCLCHECK(rocm_smi_getNumDevice(&deviceCnt));
        for(int i = 0; i < deviceCnt; i++) {
            if(i != dev) {
                RSMI_IO_LINK_TYPE rsmi_type;
                int hops, count;
                if(rocm_smi_getLinkInfo(dev, i, &rsmi_type, &hops, &count) == scclSuccess) {
                    if(rsmi_type >= RSMI_IOLINK_TYPE_XGMI && hops >= 1) {
                        char busIdStr[] = "00000000:00:00.0";
                        SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
                        char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
                        for(int c = 0; c < NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
                            lowerId[c] = tolower(busIdStr[c]);
                            if(busIdStr[c] == 0)
                                break;
                        }
                        SCCLCHECK(xmlGetSubKv(gpuNode, "xgmi", &nvlNode, "target", lowerId));
                        if(nvlNode == NULL) {
                            SCCLCHECK(xmlAddNode(xml, gpuNode, "xgmi", &nvlNode));
                            SCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
                            SCCLCHECK(xmlSetAttrInt(nvlNode, "count", count));
                        }
                    }
                }
            }
        }
    }
    // Fill target classes
    for(int s = 0; s < gpuNode->nSubs; s++) {
        struct scclXmlNode* sub = gpuNode->subs[s];
        if(strcmp(sub->name, "xgmi") != 0)
            continue;
        int index;
        SCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index));
        if(index == -1) {
            const char* busId;
            SCCLCHECK(xmlGetAttr(sub, "target", &busId));
            char* path;
            getPciPath(busId, &path);
            if(path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) {
                // Remote NVLink device is not visible inside this VM. Assume NVSwitch.
                SCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000"));
            } else {
                SCCLCHECK(scclTopoSetAttrFromSys(sub, path, "class", "tclass"));
                free(path);
            }
        }
    }
    *gpuNodeRet = gpuNode;
    return scclSuccess;
}
// Returns the subsystem name of a path, i.e. the end of the path
// where sysPath/subsystem points to.
scclResult_t scclTopoGetSubsystem(const char* sysPath, char* subSys) {
    char subSysPath[PATH_MAX];
    // Bounded formatting: sysPath comes from device discovery and may
    // approach PATH_MAX, where the old sprintf() could overflow the buffer.
    snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath);
    char* path = realpath(subSysPath, NULL);
    if(path == NULL) {
        // No subsystem link: report an empty name rather than failing.
        subSys[0] = '\0';
    } else {
        // Keep only the final path component ("pci", "virtual", ...).
        int offset;
        for(offset = strlen(path); offset > 0 && path[offset] != '/'; offset--)
            ;
        strcpy(subSys, path + offset + 1);
        free(path);
    }
    return scclSuccess;
}
// Recursively trim the tree under 'node': a node marked keep="1" survives
// (and the marker is consumed); otherwise its children are trimmed first and
// the node itself is removed when it ends up childless.
scclResult_t scclTopoTrimXmlRec(struct scclXmlNode* node) {
    const char* str;
    SCCLCHECK(xmlGetAttr(node, "keep", &str));
    if(str && strcmp(str, "1") == 0) {
        SCCLCHECK(xmlUnsetAttr(node, "keep"));
    } else {
        // Copy nSubs and subs as they could change as we trim recursively.
        struct scclXmlNode* subs[MAX_SUBS];
        int nSubs = node->nSubs;
        memcpy(subs, node->subs, node->nSubs * sizeof(struct scclXmlNode*));
        for(int s = 0; s < nSubs; s++) {
            SCCLCHECK(scclTopoTrimXmlRec(subs[s]));
        }
        // All children removed and no keep marker: drop this node too.
        if(node->nSubs == 0)
            SCCLCHECK(xmlRemoveNode(node));
    }
    return scclSuccess;
}
/**************************************************/
/* Parser rules for the user-defined graph search */
/**************************************************/
scclResult_t scclTopoXmlGraphLoadGpu(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // Graph <gpu> elements are leaves; consume without child handlers.
    SCCLCHECK(xmlLoadSub(file, xml, head, /*handlers=*/NULL, /*nHandlers=*/0));
    return scclSuccess;
}
scclResult_t scclTopoXmlGraphLoadNet(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // Graph <net> elements are leaves; consume without child handlers.
    SCCLCHECK(xmlLoadSub(file, xml, head, /*handlers=*/NULL, /*nHandlers=*/0));
    return scclSuccess;
}
scclResult_t scclTopoXmlGraphLoadChannel(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // A <channel> lists the <net> and <gpu> endpoints it traverses.
    struct xmlHandler channelHandlers[] = {{"net", scclTopoXmlGraphLoadNet}, {"gpu", scclTopoXmlGraphLoadGpu}};
    SCCLCHECK(xmlLoadSub(file, xml, head, channelHandlers, 2));
    return scclSuccess;
}
scclResult_t scclTopoXmlGraphLoadGraph(FILE* file, struct scclXml* xml, struct scclXmlNode* head) {
    // A <graph> contains one or more <channel> elements.
    struct xmlHandler graphHandlers[] = {{"channel", scclTopoXmlGraphLoadChannel}};
    SCCLCHECK(xmlLoadSub(file, xml, head, graphHandlers, 1));
    return scclSuccess;
}
// Root handler for graph files: validate the version, log the optional
// topology name, then descend into the <graph> children.
scclResult_t scclTopoXmlGraphLoadGraphs(FILE* file, struct scclXml* xmlGraph, struct scclXmlNode* head) {
    int version;
    SCCLCHECK(xmlGetAttrInt(head, "version", &version));
    if(version != SCCL_GRAPH_XML_VERSION) {
        WARN("XML Graph has wrong version %d, %d needed", version, SCCL_GRAPH_XML_VERSION);
        return scclInvalidUsage;
    }
    const char* topoName;
    SCCLCHECK(xmlGetAttr(head, "name", &topoName));
    if(topoName == NULL)
        INFO(SCCL_LOG_TOPO, "Loading graphs");
    else
        INFO(SCCL_LOG_TOPO, "Loading graphs for topology %s", topoName);
    struct xmlHandler graphsHandlers[] = {{"graph", scclTopoXmlGraphLoadGraph}};
    SCCLCHECK(xmlLoadSub(file, xmlGraph, head, graphsHandlers, 1));
    return scclSuccess;
}
} // namespace xml
// Parse an XML topology file into 'xml'. A missing/unreadable file is not an
// error: we optionally warn (per the 'warn' flag) and return success with an
// empty document.
scclResult_t scclTopoGetXmlFromFile(const char* xmlTopoFile, struct scclXml* xml, int warn) {
    FILE* file = fopen(xmlTopoFile, "r");
    if(file == NULL) {
        if(warn) {
            WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
        }
        return scclSuccess;
    }
    INFO(SCCL_LOG_TOPO, "Loading topology file %s", xmlTopoFile);
    struct xml::xmlHandler handlers[] = {{"system", xml::scclTopoXmlLoadSystem}};
    xml->maxIndex = 0;
    // Close the file before propagating parse errors; wrapping the call in
    // SCCLCHECK directly returned early on failure and leaked the FILE.
    scclResult_t loadRes = xml::xmlLoadSub(file, xml, NULL, handlers, 1);
    fclose(file);
    SCCLCHECK(loadRes);
    return scclSuccess;
}
// Write the XML tree rooted at xml->nodes to xmlTopoFile. An unwritable file
// only warns; it is not treated as an error.
scclResult_t scclTopoDumpXmlToFile(const char* xmlTopoFile, struct scclXml* xml) {
    FILE* file = fopen(xmlTopoFile, "w");
    if(file == NULL) {
        WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
        return scclSuccess;
    }
    // Close the file before propagating dump errors; the previous SCCLCHECK
    // returned early on failure and leaked the FILE handle.
    scclResult_t dumpRes = xml::scclTopoDumpXmlRec(0, file, xml->nodes);
    fclose(file);
    SCCLCHECK(dumpRes);
    return scclSuccess;
}
// Build/complete the XML subtree for the GPU at busId: its <pci> chain up to
// the CPU (from sysfs), then the <gpu> node itself (via HIP/rocm-smi).
scclResult_t scclTopoFillGpu(struct scclXml* xml, const char* busId, struct scclXmlNode** gpuNode) {
    struct scclXmlNode* node;
    SCCLCHECK(xml::scclTopoGetPciNode(xml, busId, &node));
    SCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03"));
    SCCLCHECK(xml::scclTopoGetXmlFromSys(node, xml));
    // Default to "unknown device": previously devIndex was passed
    // uninitialized to scclTopoGetXmlFromGpu when rocm-smi failed to
    // initialize (rocmsmiInit == 2).
    uint32_t devIndex = (uint32_t)-1;
    // NOTE(review): this lazy one-shot init is not thread-safe — confirm all
    // callers run on a single thread.
    static int rocmsmiInit = 0;
    if(rocmsmiInit == 0) {
        rocmsmiInit = (rocm_smi_init() != scclSuccess) ? 2 : 1;
    }
    if(rocmsmiInit == 1) {
        if(rocm_smi_getDeviceIndexByPciBusId(busId, &devIndex) != scclSuccess)
            devIndex = -1;
    }
    SCCLCHECK(xml::scclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode));
    return scclSuccess;
}
// Create (or find) the <net> node for netName and attach it under the right
// <nic>/<pci>/<cpu> chain, depending on whether the device has a real PCI
// sysfs path.
scclResult_t scclTopoFillNet(struct scclXml* xml, const char* pciPath, const char* netName, struct scclXmlNode** netNode) {
    SCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
    if(*netNode != NULL)
        return scclSuccess;
    const char* pciSysPath = pciPath;
    if(pciSysPath) {
        char subSystem[PATH_MAX];
        SCCLCHECK(xml::scclTopoGetSubsystem(pciSysPath, subSystem));
        // This is not a PCI device (virtual, usb, ...).
        if(strcmp(subSystem, "pci") != 0) {
            INFO(SCCL_LOG_TOPO, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
            pciSysPath = NULL;
        }
    }
    struct scclXmlNode* parent = NULL;
    if(pciSysPath) {
        // Extract the trailing path component (the device's PCI bus id).
        // The offset > 0 guard keeps a slash-less path from walking off the
        // front of the buffer.
        int offset;
        for(offset = strlen(pciSysPath) - 1; offset > 0 && pciSysPath[offset] != '/'; offset--)
            ;
        char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
        // Bounded copy: a malformed path with a long final component must not
        // overflow the 16-byte busId buffer (the old strcpy was unbounded).
        snprintf(busId, sizeof(busId), "%s", pciSysPath + offset + 1);
        SCCLCHECK(xml::scclTopoGetPciNode(xml, busId, &parent));
        SCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02"));
        SCCLCHECK(xml::scclTopoGetXmlFromSys(parent, xml));
    } else {
        // Virtual NIC, no PCI device, attach to first CPU
        SCCLCHECK(xmlFindTag(xml, "cpu", &parent));
    }
    struct scclXmlNode* nicNode = NULL;
    SCCLCHECK(xmlGetSub(parent, "nic", &nicNode));
    if(nicNode == NULL) {
        SCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode));
    }
    // We know that this net does not exist yet (we searched for it at the
    // beginning of this function), so we can add it.
    SCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode));
    SCCLCHECK(xmlSetAttr(*netNode, "name", netName));
    return scclSuccess;
}
// Parse an XML graph description file into 'xml'. Unlike topology loading, a
// missing graph file is a hard error.
scclResult_t scclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct scclXml* xml) {
    FILE* file = fopen(xmlGraphFile, "r");
    if(file == NULL) {
        WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno));
        return scclSystemError;
    }
    struct xml::xmlHandler handlers[] = {{"graphs", xml::scclTopoXmlGraphLoadGraphs}};
    xml->maxIndex = 0;
    // Close the file before propagating parse errors; the previous SCCLCHECK
    // returned early on failure and leaked the FILE handle.
    scclResult_t loadRes = xml::xmlLoadSub(file, xml, NULL, handlers, 1);
    fclose(file);
    SCCLCHECK(loadRes);
    return scclSuccess;
}
// Trim the whole document starting from the root node (nodes[0]): subtrees
// without a keep="1" marker are removed (see scclTopoTrimXmlRec).
scclResult_t scclTopoTrimXml(struct scclXml* xml) {
    SCCLCHECK(xml::scclTopoTrimXmlRec(xml->nodes));
    return scclSuccess;
}
// Read the contents of path/fileName into strValue (at most MAX_STR_LEN
// bytes). On success the final byte (sysfs values end with '\n') is replaced
// by the NUL terminator; an unreadable or empty file yields "".
scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
    char filePath[PATH_MAX];
    // Bounded formatting; the old sprintf() could overflow filePath when the
    // combined path approached PATH_MAX.
    snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName);
    int offset = 0;
    FILE* file;
    if((file = fopen(filePath, "r")) != NULL) {
        while(feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
            int len = fread(strValue + offset, 1, MAX_STR_LEN - offset, file);
            offset += len;
        }
        fclose(file);
    }
    if(offset == 0) {
        strValue[0] = '\0';
        INFO(SCCL_LOG_TOPO, "Topology detection : could not read %s, ignoring", filePath);
    } else {
        // Overwrite the trailing character (usually '\n') with the terminator.
        strValue[offset - 1] = '\0';
    }
    return scclSuccess;
}
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef XML_H_
#define XML_H_
#include <stdlib.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace topo {
///////////////////////////////////////// Basic structs /////////////////////////////////////////
// A few constraints to make the implementation easy
#define MAX_STR_LEN 255
#define MAX_ATTR_COUNT 16
#define MAX_SUBS 32
#define MAX_NODES 1024
// Kind of XML element as classified by the parser (see xmlGetNode()).
typedef enum node_type {
    NODE_TYPE_NONE = 0,   // no element: EOF, or a node removed from the tree
    NODE_TYPE_OPEN = 1,   // opening tag: <name ...>
    NODE_TYPE_CLOSE = 2,  // closing tag: </name>
    NODE_TYPE_SINGLE = 3  // self-closing tag: <name .../>
} node_type_t;
// One XML element: its name, attributes, and links to parent/children.
struct scclXmlNode {
    char name[MAX_STR_LEN + 1]; // element name
    struct {
        char key[MAX_STR_LEN + 1];   // attribute key
        char value[MAX_STR_LEN + 1]; // attribute value
    } attrs[MAX_ATTR_COUNT + 1]; // one extra slot so overflow attributes can be consumed
    int nAttrs;                  // number of attributes in use
    int type;                    // element kind (node_type_t)
    struct scclXmlNode* parent;  // parent element, NULL when detached/root
    struct scclXmlNode* subs[MAX_SUBS]; // child elements
    int nSubs;                   // number of children in use
};
// A parsed XML document: a fixed pool of element nodes. The dump/trim entry
// points treat nodes[0] as the document root.
struct scclXml {
    struct scclXmlNode nodes[MAX_NODES]; // node pool, one entry per XML element
    int maxIndex;                        // number of pool entries in use
};
// One string<->int mapping entry. Tables end with a {NULL, fallback}
// sentinel; kvConvertToInt reads the sentinel's value as its fallback.
struct kvDict {
    const char* str;
    int value;
};
// Lets the hipDeviceArch_t feature bitfield be stored and compared as a
// plain int XML attribute value; the static_assert pins the size match the
// punning relies on.
typedef union {
    hipDeviceArch_t arch;
    int value;
    static_assert(sizeof(hipDeviceArch_t) == sizeof(int), "value must be the same size of hipDeviceArch_t.");
} scclHipDeviceArch_t;
///////////////////////////////////////// File functions /////////////////////////////////////////
#define SCCL_TOPO_XML_VERSION 2
#define SCCL_GRAPH_XML_VERSION 1
// Load an XML topology structure from a file.
scclResult_t scclTopoGetXmlFromFile(const char* xmlTopoFile, struct scclXml* xml, int warn);
// Save an XML topology structure to a file.
scclResult_t scclTopoDumpXmlToFile(const char* xmlTopoFile, struct scclXml* xml);
// Load an XML graph structure from a file.
scclResult_t scclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct scclXml* xml);
/* Auto-detection helpers */
// Fill GPU information into the XML structure for the given PCI bus id.
scclResult_t scclTopoFillGpu(struct scclXml* xml, const char* busId, struct scclXmlNode** gpuNode);
// Fill network information into the XML structure from a PCI path and NIC name.
scclResult_t scclTopoFillNet(struct scclXml* xml, const char* pciPath, const char* netName, struct scclXmlNode** netNode);
/* Pruning */
// Trim the XML structure, removing parts that are not needed.
scclResult_t scclTopoTrimXml(struct scclXml* xml);
// Read a string value from a file under a system path (path/fileName).
scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue);
/**************/
/* XML Struct */
/* Functions */
/**************/
// Find the index of an attribute on a node.
scclResult_t xmlGetAttrIndex(struct scclXmlNode* node, const char* attrName, int* index);
// Get a node attribute value as a string.
scclResult_t xmlGetAttr(struct scclXmlNode* node, const char* attrName, const char** value);
// Get a node attribute value as a string (variant of xmlGetAttr).
scclResult_t xmlGetAttrStr(struct scclXmlNode* node, const char* attrName, const char** value);
// Get a node attribute value as an integer.
scclResult_t xmlGetAttrInt(struct scclXmlNode* node, const char* attrName, int* value);
// Get a node attribute value as an integer, falling back to defaultValue when the attribute is absent.
scclResult_t xmlGetAttrIntDefault(struct scclXmlNode* node, const char* attrName, int* value, int defaultValue);
// Initialize an integer attribute on a node.
scclResult_t xmlInitAttrInt(struct scclXmlNode* node, const char* attrName, const int value);
// Initialize an unsigned 64-bit integer attribute on a node.
scclResult_t xmlInitAttrUint64(struct scclXmlNode* node, const char* attrName, const uint64_t value);
// Get a node attribute value as a float.
scclResult_t xmlGetAttrFloat(struct scclXmlNode* node, const char* attrName, float* value);
// Initialize a float attribute on a node.
scclResult_t xmlInitAttrFloat(struct scclXmlNode* node, const char* attrName, const float value);
// Find a node with the given tag name.
scclResult_t xmlFindTag(struct scclXml* xml, const char* tagName, struct scclXmlNode** node);
// Find a node with the given tag name and a matching attribute key/value pair.
scclResult_t xmlFindTagKv(struct scclXml* xml, const char* tagName, struct scclXmlNode** node, const char* attrName, const char* attrValue);
// Set a node attribute value.
scclResult_t xmlSetAttr(struct scclXmlNode* node, const char* attrName, const char* value);
// Set a node attribute value only if it is not already set.
scclResult_t xmlSetAttrIfUnset(struct scclXmlNode* node, const char* attrName, const char* value);
// Set a node attribute value from an integer.
scclResult_t xmlSetAttrInt(struct scclXmlNode* node, const char* attrName, const int value);
// Set a node attribute value from a float.
scclResult_t xmlSetAttrFloat(struct scclXmlNode* node, const char* attrName, const float value);
// Remove an attribute from a node.
scclResult_t xmlUnsetAttr(struct scclXmlNode* node, const char* attrName);
// Get a child of a node by sub-tag name.
scclResult_t xmlGetSub(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub);
// Get a child of a node matching a string attribute value.
scclResult_t xmlGetSubKv(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const char* attrValue);
// Get a child of a node matching an integer attribute value.
scclResult_t xmlGetSubKvInt(struct scclXmlNode* node, const char* subName, struct scclXmlNode** sub, const char* attrName, const int attrValue);
// Append a new node under the given parent.
scclResult_t xmlAddNode(struct scclXml* xml, struct scclXmlNode* parent, const char* subName, struct scclXmlNode** sub);
// Detach a node from the document.
scclResult_t xmlRemoveNode(struct scclXmlNode* node);
// kvDict-based conversion; the dictionary's last entry has str == NULL.
// Convert a string to its integer mapping.
scclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict);
// Convert an integer to its string mapping.
scclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict);
} // namespace topo
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#include <unistd.h>
#include <sys/types.h>
#include <string.h>
#include <sys/resource.h>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <chrono>
#include <ctime>
#include <cstdint>
#include "bootstrap.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Upper bound on worker threads in the bootstrap thread pool.
#define MAX_THREADS (128)
// Arguments handed to the bootstrap root thread.
struct bootstrapRootArgs {
scclSocket_t* listenSock; // presumably the socket the root listens/accepts on — confirm against the root thread
uint64_t magic; // session magic used to match peers
};
// Constructor: create the worker thread pool and bring up the bootstrap
// network environment.  Constructors cannot report errors via return value,
// so failures are logged and the object is left in a safe state; the
// destructor releases whatever was allocated.
scclBootstrap::scclBootstrap(struct scclRankInfo* rank_info, struct scclBootstrapComm* comm) {
// Cap the pool size so very large jobs do not spawn an unbounded thread count.
int thread_cnt = ::std::min(MAX_THREADS, rank_info->nRanks);
pthread_pool = new ThreadPool(thread_cnt);
scclResult_t res;
// Allocate and zero the bootstrap handle.
SCCLCHECKGOTO(scclCalloc(&handle, 1), res, failure);
// Initialize the bootstrap network environment.
SCCLCHECKGOTO(bootstrapInit(rank_info, comm), res, failure);
return;
failure:
// BUGFIX: the previous message ("bootstrap not implemented yet") was a
// leftover placeholder that misdescribed the failure; report the actual
// failing result code instead.
WARN("scclBootstrap initialization failed, res=%d", res);
return;
}
// Destructor: release all bootstrap resources.  free(NULL) and delete on a
// null pointer are both no-ops, so no guards are needed; members are nulled
// out defensively after release.
scclBootstrap::~scclBootstrap() {
free(handle);
handle = nullptr;
delete bootstrap_net;
bootstrap_net = nullptr;
delete pthread_pool;
pthread_pool = nullptr;
}
/**
 * Initialize the bootstrap communication environment.
 *
 * @param rank_info rank / nRanks identification for this process
 * @param comm      bootstrap communication structure to initialize
 * @return scclSuccess on success
 *
 * Responsibilities:
 *  1. publish WarpSize into comm
 *  2. initialize the unique_info structure
 *  3. create and initialize the bootstrap socket handle
 *
 * NOTE(review): the SCCLCHECK early returns inside the critical section leave
 * initLock held and leak the strdup'd netName on failure.  Also,
 * bootstrapBasicInit() and bootstrapUniqueInfoInit() run BEFORE bootstrap_net
 * is constructed below, yet both dereference bootstrap_net — verify the
 * intended call ordering.
 */
scclResult_t scclBootstrap::bootstrapInit(const struct scclRankInfo* rank_info, struct scclBootstrapComm* comm) {
// Fast path: already initialized (acquire-load pairs with the release-store below).
if(asm_ops::ld_acquire_sys_global(&initialized))
return scclSuccess;
// Lock to make initialization thread-safe.
pthread_mutex_lock(&initLock);
// Double-checked locking: re-test under the lock.
if(!initialized) {
// Publish the warp size into the communication structure.
comm->WarpSize = warpSize;
// Record the caller's CPU affinity.
sched_getaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// Allocate and zero the scclUniqueInfo structure.
SCCLCHECK(scclCalloc(&(comm->unique_info), 1));
// Network backend name comes from SCCL_NET_NAME, defaulting to "IB".
const char* envNetName = getenv("SCCL_NET_NAME");
char* netName = (envNetName != NULL) ? strdup(envNetName) : strdup("IB");
// Debug output of the chosen network name.
printf("netName=%s\n", netName);
// Initialize the network backend.
SCCLCHECK(net::scclNetInit(netName, comm->scclNet));
// Release the duplicated network-name string.
free(netName);
// CPU hardware / environment checks.
SCCLCHECK(bootstrapBasicInit());
// Fill the unique-info structure.
SCCLCHECK(bootstrapUniqueInfoInit(rank_info, comm->scclNet, comm->unique_info));
// Create the network helper used for bootstrap-phase socket communication.
bootstrap_net = new bootstrapNet(comm);
// Publish this rank's socket identity into the handle.
SCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
memcpy(&handle->addr, &bootstrap_net->bootstrapNetIfAddr, sizeof(scclSocketAddress_t));
// SCCLCHECK(getIpcSocketAddr(&handle->peerIpcAddr));
#if 0
// char line[100];
// sprintf(line, "pos 55: rank=%d", rank_info->rank);
// SCCLCHECK(net::printSocketAddr(&handle->addr, line));
#endif
// NOTE(review): return value of bootstrapAllGather is ignored here.
bootstrapAllGather(comm->unique_info);
// Mark initialization complete (release-store pairs with the acquire above).
asm_ops::st_release_sys_global(&initialized, true);
}
// Unlock.
pthread_mutex_unlock(&initLock);
return scclSuccess;
}
/**
 * @brief Basic bootstrap initialization.
 *
 * Initializes the bootstrap network and performs system environment checks.
 *
 * @note Warns if NUMA automatic balancing is enabled (may hurt performance).
 * @note Reads and logs the kernel version string.
 *
 * NOTE(review): bootstrap_net is created in bootstrapInit() only AFTER this
 * function is called, so it appears to be nullptr at the dereference below —
 * verify the intended ordering.
 *
 * @return scclResult_t scclSuccess on success
 */
scclResult_t scclBootstrap::bootstrapBasicInit() {
// Always initialize the bootstrap network.
SCCLCHECK(bootstrap_net->bootstrapNetInit());
// SCCLCHECK(scclNetPluginInit()); // used by collnet
char strValue[1024];
// Check whether NUMA automatic balancing is enabled.
SCCLCHECK(scclTopoGetStrFromSys("/proc/sys/kernel", "numa_balancing", strValue));
if(strcmp(strValue, "1") == 0)
WARN("NUMA自动平衡已启用,这可能导致RCCL性能的不稳定性!通过\"sudo sysctl kernel.numa_balancing=0\"禁用");
// Read the kernel version string.
SCCLCHECK(scclTopoGetStrFromSys("/proc", "version", strValue));
char *verStr, *state;
verStr = strtok_r(strValue, " ", &state);
// NOTE(review): verStr may be NULL if /proc/version was empty; passing NULL
// for %s is undefined behavior — confirm the producer always yields a token.
INFO(SCCL_LOG_BOOTSTRAP, "内核版本: %s", verStr);
// Skip two more space-separated tokens to position verStr on the version field.
for(int i = 0; i < 2; i++) {
verStr = strtok_r(NULL, " ", &state);
if(verStr == NULL)
break;
}
// TODO: decide whether the version information check below is needed.
#if 0
// Cray system check.
if(strstr(verStr, "cray") == NULL) {
// Read the BIOS version.
SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
if(strncmp("Hyper-V UEFI Release", strValue, 20) != 0) {
FILE* file;
// Read the kernel command line.
if((file = fopen("/proc/cmdline", "r")) != NULL) {
if(feof(file) == 0 && ferror(file) == 0) {
int len = fread(strValue, 1, 1024, file);
strValue[len] = '\0';
}
fclose(file);
}
// Check for a missing "iommu=pt" parameter.
if(strstr(strValue, "iommu=pt") == NULL)
WARN("内核命令行中缺少\"iommu=pt\"参数,这可能导致系统不稳定或挂起!");
}
#ifndef HIP_UNCACHED_MEMORY
// Check the "HSA_FORCE_FINE_GRAIN_PCIE" environment variable.
char* env = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
printf("HSA env=%s\n", env);
if(env == NULL || strcmp(env, "1") != 0)
WARN("环境变量中缺少\"HSA_FORCE_FINE_GRAIN_PCIE=1\",这可能导致RCCL性能低下,系统不稳定或挂起!");
#endif
float* ptr;
// Probe whether fine-grained PCIe memory can be allocated.
hipError_t err = hipExtMallocWithFlags((void**)&ptr, 128, hipDeviceMallocFinegrained);
if(err != hipSuccess)
hsaFineGrainFlag = false;
}
#endif
return scclSuccess;
}
/**
 * @brief Initialize the scclUniqueInfo structure.
 *
 * Fills rank identity, device selection, host/pid hashes, and local hardware
 * information (GPU / CPU / RDMA / PCI).
 *
 * @param rank_info   input: rank-related information
 * @param scclNet     network backend used to query RDMA properties
 * @param unique_info output: structure to initialize (zero-initialized by the caller)
 * @return scclResult_t scclSuccess on success
 */
scclResult_t scclBootstrap::bootstrapUniqueInfoInit(const struct scclRankInfo* rank_info, scclNet_t* scclNet, struct scclUniqueInfo* unique_info) {
////////////////// identity info //////////////////
unique_info->rank = rank_info->rank;
unique_info->nRanks = rank_info->nRanks;
unique_info->localRanks = rank_info->localRanks;
unique_info->localRank = rank_info->localRank;
LECHECK(unique_info->localRanks, unique_info->localRank); // localRank must be < localRanks
uint32_t devices_num;
SCCLCHECK(rocm_smi_init()); // bring up the ROCm SMI library
SCCLCHECK(rocm_smi_getNumDevice(&devices_num)); // query the device count
LTCHECK(devices_num, 0); // at least one device is required
unique_info->deviceCnt = static_cast<int>(devices_num);
// BUGFIX: hipDev must be copied from rank_info BEFORE it is range-checked and
// used for hipSetDevice().  Previously the zero-initialized field was checked
// and device 0 was always selected, with the real value assigned only
// afterwards.
unique_info->hipDev = rank_info->hipDev;
LECHECK(unique_info->deviceCnt, unique_info->hipDev); // hipDev must be < deviceCnt
HIPCHECK(hipSetDevice(unique_info->hipDev)); // select the calling rank's device
// Other base identity info.
unique_info->hostHash = getHostHash();
unique_info->pidHash = getPidHash();
////////////////// hardware info //////////////////
struct topoLocalNode* p_localNode = &unique_info->localNode;
// GPU info.
p_localNode->gpu.dev = rank_info->hipDev;
hipDeviceProp_t deviceProp;
HIPCHECK(hipGetDeviceProperties(&deviceProp, rank_info->hipDev));
snprintf(p_localNode->gpu.name, sizeof(p_localNode->gpu.name), "%s", deviceProp.name);
snprintf(p_localNode->gpu.gcn, sizeof(p_localNode->gpu.gcn), "%s", deviceProp.gcnArchName);
p_localNode->gpu.compCap = deviceProp.major * 10 + deviceProp.minor;
// CPU info.
// NOTE(review): in bootstrapInit() bootstrap_net is constructed only after
// this call, so bootstrapNetIfAddr may not be valid here — verify ordering.
memcpy(&p_localNode->cpu.socketAddr, &bootstrap_net->bootstrapNetIfAddr, sizeof(scclSocketAddress_t));
// RDMA info.
SCCLCHECK(scclNet->getProperties(rank_info->hipDev, &p_localNode->net.props));
SCCLCHECK(scclNet->devices(&p_localNode->net.count));
// PCI info.
SCCLCHECK(getBusId(rank_info->hipDev, &p_localNode->pci.busId));
#if 1
printf("topoLocalNode size=%ld\n", sizeof(struct topoLocalNode));
SCCLCHECK(net::printNetProps(&p_localNode->net.props, rank_info->rank, rank_info->localRank));
#endif
return scclSuccess;
}
/**
 * Verify that bootstrap initialization completed successfully.
 *
 * Reads the handle pointer and the initialized flag under initLock so the
 * answer is consistent with a concurrently running bootstrapInit().
 *
 * @return scclSuccess when initialized, scclSystemError otherwise
 */
scclResult_t scclBootstrap::bootstrapInitCheck() {
pthread_mutex_lock(&initLock);
const bool ready = (handle != nullptr) && initialized;
pthread_mutex_unlock(&initLock);
return ready ? scclSuccess : scclSystemError;
}
// Gather node information across all ranks.
// TODO: stub — not implemented yet; currently always returns scclSuccess.
scclResult_t scclBootstrap::bootstrapAllGather(struct scclUniqueInfo* unique_info) {
// Planned phases:
// 1. intra-node allgather
// 2. inter-node ring allgather
// 3. intra-node allgather
return scclSuccess;
}
/////////////////////////////////////////////////////////////////////////////////////////////
// // 将本地socket地址写入到/tmp/文件夹的文件中,通过nfs共享存储,其他rank可见
// scclResult_t bootstrapGetAllNodes(const struct scclUniqueInfo* unique_info, struct scclBootstrapComm* comm) {
// // // 分配并初始化IPC套接字
// // struct scclIpcSocket ipcSock = {0};
// // // Create a UDS socket to receive the converted fd
// // SCCLCHECK(scclIpcSocketInit(&ipcSock, unique_info->rank, /*hash*/ handle->magic, /*abortFlag*/ NULL));
// // printf("fd=%d, socketName=%s\n", ipcSock.fd, ipcSock.socketName);
// return scclInProgress;
// }
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <string.h>
#include "base.h"
#include "archinfo.h"
#include "socket.h"
#include "bootstrap_utils.h"
#include "bootstrap_net.h"
#include "thread_pool.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
/// Bootstrap coordinator: owns the bootstrap handle, the bootstrap-phase
/// network helper, and a worker thread pool used for intra-node communication.
class scclBootstrap {
public:
scclBootstrap(struct scclRankInfo* rank_info, struct scclBootstrapComm* comm);
~scclBootstrap();
// Initialize the bootstrap communication environment (thread-safe, idempotent).
scclResult_t bootstrapInit(const struct scclRankInfo* rank_info, struct scclBootstrapComm* comm);
// Check whether bootstrap has initialized successfully.
scclResult_t bootstrapInitCheck();
// Gather node information across ranks.
scclResult_t bootstrapAllGather(struct scclUniqueInfo* unique_info);
private:
// Basic environment / hardware checks for bootstrap.
scclResult_t bootstrapBasicInit();
// Initialize the unique-id information structure.
scclResult_t bootstrapUniqueInfoInit(const struct scclRankInfo* rank_info, scclNet_t* scclNet, struct scclUniqueInfo* unique_info);
// scclResult_t bootstrapGetAllNodes(const struct scclUniqueInfo* unique_info, struct scclBootstrapComm* comm);
private:
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; // guards the initialization state below
bool initialized = false; // set once bootstrapInit completes
bool hsaFineGrainFlag = true; // cleared when fine-grained allocation probing fails
// Bootstrap handle, heap-allocated in the constructor, freed in the destructor.
struct scclBootstrapHandle* handle = nullptr;
// Network helper used during the bootstrap phase.
class bootstrapNet* bootstrap_net = nullptr;
int max_pthreads = 0;
class ThreadPool* pthread_pool = nullptr; // worker pool for intra-node communication
};
} // namespace bootstrap
} // namespace topology
} // namespace hardware
......
#include <unistd.h>
#include <sys/types.h>
#include <string.h>
#include <sys/resource.h>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <chrono>
#include <ctime>
#include <cstdint>
#include "bootstrap_net.h"
namespace sccl {
......@@ -8,18 +16,26 @@ namespace hardware {
namespace topology {
namespace bootstrap {
namespace bootstrap_net {
/* Init functions */
static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1];
static scclSocketAddress_t bootstrapNetIfAddr;
static int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
bootstrapNet::bootstrapNet(struct scclBootstrapComm* bootstrap_comm) {
auto unique_info = bootstrap_comm->unique_info;
// 设置节点内socket通信工具
ipcsocket = new scclIpcSocket_t(unique_info->localRank, unique_info->nRanks, unique_info->hostHash, bootstrap_comm->abortFlag);
}
bootstrapNet::~bootstrapNet() {
if(ipcsocket) {
delete ipcsocket;
}
}
/**
* @brief 初始化引导网络
*
* 该函数用于初始化SCCL的引导网络。它会检查环境变量"SCCL_COMM_ID"来获取远程地址,
* 如果没有设置则自动查找可用的网络接口。函数使用互斥锁确保线程安全。
* 该函数用于初始化SCCL的引导网络。
* 如果设置了 NCCL_COMM_ID 环境变量,则查找一个和该环境变量中指定的 IP 地址处于同一子网的网卡作为 booststrap 网络通信所使用的网卡 bootstrapNetIfAddr
* 否则,使用 ncclFindInterfaces 函数选择一个合适的网卡
*
* 函数使用互斥锁确保线程安全。
*
* @return scclResult_t 返回操作结果:
* - scclSuccess: 初始化成功
......@@ -27,35 +43,33 @@ pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
* - scclSystemError: 找不到匹配的网络接口
* - scclInternalError: 找不到可用的网络接口
*/
scclResult_t bootstrapNetInit() {
scclResult_t bootstrapNet::bootstrapNetInit() {
if(bootstrapNetInitDone == 0) {
pthread_mutex_lock(&bootstrapNetLock);
if(bootstrapNetInitDone == 0) {
char* env = getenv("SCCL_COMM_ID");
if(env) {
scclSocketAddress_t remoteAddr;
if(net::host::scclSocketGetAddrFromString(&remoteAddr, env) != scclSuccess) {
if(net::net_socket::scclSocketGetAddrFromString(&remoteAddr, env) != scclSuccess) {
WARN("Invalid SCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return scclInvalidArgument;
}
if(net::host::scclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
if(net::net_socket::scclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return scclSystemError;
}
} else {
int nIfs = net::host::scclFindSocketInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
int nIfs = net::net_socket::scclFindSocketInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
if(nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
return scclInternalError;
}
}
char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
sprintf(line, " %s:", bootstrapNetIfName);
net::host::scclSocketToString(&bootstrapNetIfAddr, line + strlen(line));
INFO(SCCL_LOG_BOOTSTRAP, "Bootstrap : Using%s", line);
sprintf(line, "%s:", bootstrapNetIfName);
net::net_socket::scclSocketToString(&bootstrapNetIfAddr, line + strlen(line));
INFO(SCCL_LOG_BOOTSTRAP, "Bootstrap : Using %s", line);
bootstrapNetInitDone = 1;
printf("line=%s\n", line);
}
pthread_mutex_unlock(&bootstrapNetLock);
}
......@@ -73,9 +87,9 @@ scclResult_t bootstrapNetInit() {
*
* @note 先发送数据大小(sizeof(int)),再发送实际数据
*/
scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size) {
SCCLCHECK(net::host::scclSocketSend(sock, &size, sizeof(int)));
SCCLCHECK(net::host::scclSocketSend(sock, data, size));
scclResult_t bootstrapNet::bootstrapNetSend(scclSocket_t* sock, void* data, int size) {
SCCLCHECK(net::net_socket::scclSocketSend(sock, &size, sizeof(int)));
SCCLCHECK(net::net_socket::scclSocketSend(sock, data, size));
return scclSuccess;
}
......@@ -89,413 +103,17 @@ scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size) {
*
* @note 如果接收到的数据大小超过缓冲区大小,会截断数据并返回scclInternalError
*/
scclResult_t bootstrapNetRecv(scclSocket_t* sock, void* data, int size) {
scclResult_t bootstrapNet::bootstrapNetRecv(scclSocket_t* sock, void* data, int size) {
int recvSize;
SCCLCHECK(net::host::scclSocketRecv(sock, &recvSize, sizeof(int)));
SCCLCHECK(net::net_socket::scclSocketRecv(sock, &recvSize, sizeof(int)));
if(recvSize > size) {
WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
return scclInternalError;
}
SCCLCHECK(net::host::scclSocketRecv(sock, data, std::min(recvSize, size)));
return scclSuccess;
}
} // namespace bootstrap_net
/**
 * Queue a connection that arrived before anyone asked for it.
 *
 * @param state bootstrap state holding the pending-connection list
 * @param peer  peer rank the connection came from
 * @param tag   message tag carried by the connection
 * @param sock  socket to store (copied by value into the entry)
 * @return scclSuccess on success
 *
 * @note The entry is appended at the tail so connections are replayed in
 *       arrival order by unexpectedDequeue().
 */
scclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock) {
// Build the new list entry.
struct unexConn* entry;
SCCLCHECK(scclCalloc(&entry, 1));
entry->peer = peer;
entry->tag = tag;
memcpy(&entry->sock, sock, sizeof(scclSocket_t));
// Walk to the tail with a pointer-to-pointer and link the entry in; this
// handles the empty-list case without a special branch.
struct unexConn** tail = &state->unexpectedConnections;
while(*tail)
tail = &(*tail)->next;
*tail = entry;
return scclSuccess;
}
/**
 * Find and remove the first queued connection matching (peer, tag).
 *
 * @param state bootstrap state holding the pending-connection list
 * @param peer  peer rank to match
 * @param tag   tag to match
 * @param sock  output: the stored socket, copied out when a match is found
 * @param found output: 1 when a matching entry was removed, 0 otherwise
 * @return always scclSuccess
 *
 * @note A matching entry is unlinked from the list and its memory released;
 *       the socket contents survive via the copy into *sock.
 */
scclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock, int* found) {
*found = 0;
// Pointer-to-pointer walk: *link is the current node and link doubles as the
// slot to rewrite when unlinking, so no separate "prev" pointer is needed.
struct unexConn** link = &state->unexpectedConnections;
while(*link) {
struct unexConn* cur = *link;
if(cur->peer == peer && cur->tag == tag) {
*link = cur->next; // unlink
memcpy(sock, &cur->sock, sizeof(scclSocket_t));
free(cur);
*found = 1;
break;
}
link = &cur->next;
}
return scclSuccess;
}
/**
 * Release every entry in the unexpected-connection list.
 *
 * @param state structure holding the list head (unexpectedConnections)
 *
 * @note BUGFIX: the head pointer is now reset to NULL after freeing, so the
 *       state no longer holds a dangling pointer and the function is safe to
 *       call twice.
 */
static void unexpectedFree(struct bootstrapState* state) {
struct unexConn* elem = state->unexpectedConnections;
while(elem) {
// Grab the successor before freeing the current node.
struct unexConn* next = elem->next;
free(elem);
elem = next;
}
state->unexpectedConnections = NULL;
return;
}
/**
* 执行基于环的AllGather操作
*
* @param commState 通信状态指针
* @param allData 用于收集所有rank数据的缓冲区
* @param size 每个rank数据块的大小(字节)
* @return 成功返回scclSuccess,失败返回错误码
*
* @note 该函数实现了一个简单的基于环的AllGather算法:
* 1. 每个rank在步骤i从(rank-i-1)接收数据
* 2. 将前一步骤从(rank-i)接收的数据发送给右侧rank
* 3. 共进行nranks-1次步骤完成全收集
*/
scclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
struct bootstrapState* state = (struct bootstrapState*)commState;
char* data = (char*)allData;
int rank = state->rank;
int nranks = state->nranks;
INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d size %d", rank, nranks, size);
/* Simple ring based AllGather
* At each step i receive data from (rank-i-1) from left
* and send previous step's data from (rank-i) to right
*/
for(int i = 0; i < nranks - 1; i++) {
size_t rslice = (rank - i - 1 + nranks) % nranks;
size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
SCCLCHECK(bootstrap_net::bootstrapNetSend(&state->ringSendSocket, data + sslice * size, size));
// Recv slice from the left
SCCLCHECK(bootstrap_net::bootstrapNetRecv(&state->ringRecvSocket, data + rslice * size, size));
}
INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d size %d - DONE", rank, nranks, size);
SCCLCHECK(net::net_socket::scclSocketRecv(sock, data, std::min(recvSize, size)));
return scclSuccess;
}
/**
 * Send a tagged message to a peer over a one-shot bootstrap socket.
 *
 * @param commState communication state (a bootstrapState*)
 * @param peer      destination peer rank
 * @param tag       message tag
 * @param data      payload to send
 * @param size      payload size in bytes
 * @return scclSuccess on success, otherwise the failing step's result
 *
 * @note Wire format: sender rank (int), tag (int), then the payload.
 *       The socket is closed on every path, success or failure.
 */
scclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
struct bootstrapState* state = (struct bootstrapState*)commState;
scclResult_t ret = scclSuccess;
scclSocket_t sock;
// Open a one-shot connection to the peer's listening address.
SCCLCHECKGOTO(net::host::scclSocketInit(&sock, state->peerCommAddresses + peer, state->magic, net::host::scclSocketTypeBootstrap), ret, cleanup);
SCCLCHECKGOTO(net::host::scclSocketConnect(&sock), ret, cleanup);
// Frame the message: sender rank, then tag, then the payload.
SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, cleanup);
SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, &tag, sizeof(int)), ret, cleanup);
SCCLCHECKGOTO(bootstrap_net::bootstrapNetSend(&sock, data, size), ret, cleanup);
cleanup:
// Success falls through here too, so the socket is always closed.
SCCLCHECK(net::host::scclSocketClose(&sock));
return ret;
}
/**
 * @brief Receive data from a specific peer.
 *
 * First checks the unexpected-connection queue for a matching (peer, tag) and
 * receives directly from it when found.  Otherwise it accepts new connections
 * in a loop, reading each one's peer and tag: a match receives the payload,
 * a mismatch is stored in the unexpected queue for a later bootstrapRecv call.
 *
 * @param commState communication state (a bootstrapState*)
 * @param peer      expected peer rank
 * @param tag       expected message tag
 * @param data      receive buffer
 * @param size      receive buffer size in bytes
 * @return scclSuccess on success, otherwise the failing step's result
 */
scclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
scclResult_t ret = scclSuccess;
struct bootstrapState* state = (struct bootstrapState*)commState;
scclSocket_t sock;
int newPeer, newTag;
// Search unexpected connections first
int found;
SCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found));
if(found) {
SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
goto exit;
}
// Then look for new connections
while(1) {
SCCLCHECKGOTO(net::host::scclSocketInit(&sock), ret, fail);
SCCLCHECKGOTO(net::host::scclSocketAccept(&sock, &state->listenSock), ret, fail);
// Each connection announces its sender rank and tag first (see bootstrapSend).
SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail);
SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail);
if(newPeer == peer && newTag == tag) {
SCCLCHECKGOTO(bootstrap_net::bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
goto exit;
}
// Unexpected connection. Save for later.
SCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail);
}
exit:
// The socket is closed on every path; fail funnels through exit below.
SCCLCHECK(net::host::scclSocketClose(&sock));
return ret;
fail:
goto exit;
}
// TODO: unimplemented placeholder for the free-function bootstrap entry point.
// BUGFIX: the empty body fell off the end of a non-void function, which is
// undefined behavior in C++; return a status explicitly.
scclResult_t bootstrapInit() {
return scclSuccess;
}
// /**
// * @brief 初始化bootstrap网络通信
// *
// * 该函数负责初始化bootstrap网络通信环境,包括:
// * 1. 创建监听socket供其他rank连接
// * 2. 与root节点交换连接信息
// * 3. 建立环形通信拓扑
// * 4. 收集所有peer的通信地址
// * 5. 创建并收集代理服务地址
// *
// * @param handle bootstrap句柄
// * @param comm bootstrap通信上下文
// * @return scclResult_t 返回操作结果,scclSuccess表示成功
// */
// scclResult_t bootstrapInit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm) {
// int rank = comm->rank; // 当前进程的排名
// int nranks = comm->nRanks; // 进程的总数
// struct bootstrapState* state; // 引导状态结构体
// scclSocket_t* proxySocket; // 代理套接字
// scclSocketAddress_t nextAddr; // 下一个地址
// scclSocket_t sock, listenSockRoot; // 套接字和根监听套接字
// struct extInfo info = {0}; // 扩展信息结构体
// SCCLCHECK(scclCalloc(&state, 1)); // 分配引导状态结构体
// state->rank = rank; // 设置当前进程的排名
// state->nranks = nranks; // 设置进程的总数
// state->abortFlag = comm->abortFlag; // 设置中止标志
// comm->bootstrap = state; // 将引导状态结构体赋值给通信结构体
// comm->magic = state->magic = handle->magic; // 设置魔术值
// INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d", rank, nranks); // 打印日志信息
// info.rank = rank; // 设置扩展信息结构体中的排名
// info.nranks = nranks; // 设置扩展信息结构体中的进程总数
// // 创建套接字供其他进程联系
// SCCLCHECK(
// net::host::scclSocketInit(&state->listenSock, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketListen(&state->listenSock)); // 监听套接字
// SCCLCHECK(net::host::scclSocketGetAddr(&state->listenSock, &info.extAddressListen)); // 获取监听套接字地址
// // 创建套接字供根进程联系
// SCCLCHECK(net::host::scclSocketInit(&listenSockRoot, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap,
// comm->abortFlag)); SCCLCHECK(net::host::scclSocketListen(&listenSockRoot)); // 监听根进程套接字
// SCCLCHECK(net::host::scclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); // 获取根进程监听套接字地址
// // // 分散连接时间以避免根进程过载
// // if(nranks > 128) {
// // long msec = rank;
// // struct timespec tv;
// // tv.tv_sec = msec / 1000;
// // tv.tv_nsec = 1000000 * (msec % 1000);
// // TRACE(SCCL_LOG_BOOTSTRAP, "rank %d delaying connection to root by %ld msec", rank, msec);
// // (void)nanosleep(&tv, NULL);
// // }
// // 向根进程发送我的监听套接字信息
// SCCLCHECK(net::host::scclSocketInit(&sock, &handle->addr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketConnect(&sock)); // 连接套接字
// SCCLCHECK(bootstrap_net::bootstrapNetSend(&sock, &info, sizeof(info))); // 发送扩展信息
// SCCLCHECK(net::host::scclSocketClose(&sock)); // 关闭套接字
// // 从根进程获取我在引导环中的“下一个”进程的信息
// SCCLCHECK(net::host::scclSocketInit(&sock)); // 初始化套接字
// SCCLCHECK(net::host::scclSocketAccept(&sock, &listenSockRoot)); // 接受根进程的连接
// SCCLCHECK(bootstrap_net::bootstrapNetRecv(&sock, &nextAddr, sizeof(scclSocketAddress_t))); // 接收下一个地址
// SCCLCHECK(net::host::scclSocketClose(&sock)); // 关闭套接字
// SCCLCHECK(net::host::scclSocketClose(&listenSockRoot)); // 关闭根监听套接字
// SCCLCHECK(net::host::scclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketConnect(&state->ringSendSocket)); // 连接环发送套接字
// // 接受引导环中前一个进程的连接请求
// SCCLCHECK(net::host::scclSocketInit(&state->ringRecvSocket)); // 初始化环接收套接字
// SCCLCHECK(net::host::scclSocketAccept(&state->ringRecvSocket, &state->listenSock)); // 接受连接
// // 全部收集所有监听处理器
// SCCLCHECK(scclCalloc(&state->peerCommAddresses, nranks)); // 分配对等通信地址
// SCCLCHECK(net::host::scclSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank)); // 获取监听套接字地址
// SCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(scclSocketAddress_t))); // 全部收集地址
// // 创建服务代理
// SCCLCHECK(scclCalloc(&state->peerProxyAddresses, nranks)); // 分配对等代理地址
// // 代理通过消息中止;不要设置中止标志
// SCCLCHECK(scclCalloc(&proxySocket, 1)); // 分配代理套接字
// SCCLCHECK(net::host::scclSocketInit(proxySocket, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeProxy, comm->abortFlag));
// SCCLCHECK(net::host::scclSocketListen(proxySocket)); // 监听代理套接字
// SCCLCHECK(net::host::scclSocketGetAddr(proxySocket, state->peerProxyAddresses + rank)); // 获取代理套接字地址
// SCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(scclSocketAddress_t))); // 全部收集代理地址
// // SCCLCHECK(scclProxyInit(comm, proxySocket, state->peerProxyAddresses));
// INFO(SCCL_LOG_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks); // 打印完成日志信息
// return scclSuccess; // 返回成功
// }
// /**
// * @brief 在bootstrap通信中创建新的子通信域
// *
// * 该函数用于将当前通信域按照指定颜色和键值拆分为子通信域,并建立相应的环状通信拓扑。
// *
// * @param handle bootstrap句柄
// * @param comm 新创建的子通信域
// * @param parent 父通信域
// * @param color 用于划分通信域的颜色值
// * @param key 用于确定新通信域中进程排名的键值
// * @param parentRanks 父通信域中的进程排名映射
// *
// * @return scclResult_t 返回操作结果,成功返回scclSuccess
// *
// * @note 函数会建立环状通信拓扑,包括:
// * 1. 初始化监听socket和环形接收socket
// * 2. 与前后节点交换地址信息
// * 3. 执行AllGather收集所有节点的通信地址
// * 4. 根据配置决定是否共享代理状态或创建新的代理服务
// */
// scclResult_t
// bootstrapSplit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm, struct scclBootstrapComm* parent, int color, int key, int* parentRanks) {
// scclResult_t ret = scclSuccess;
// int rank = comm->rank;
// int nranks = comm->nRanks;
// int prev, next;
// scclSocketAddress_t listenAddr, tmpAddr;
// scclSocket_t* proxySocket;
// struct bootstrapState* state;
// // SCCLCHECKGOTO(scclCalloc(&state, 1), ret, fail);
// // state->rank = rank;
// // state->nranks = nranks;
// // state->abortFlag = comm->abortFlag;
// // comm->bootstrap = state;
// // comm->magic = state->magic = handle->magic;
// // prev = parentRanks[(rank - 1 + nranks) % nranks];
// // next = parentRanks[(rank + 1) % nranks];
// // // Setup my sockets for the allgather ring and other p2p connections
// // SCCLCHECKGOTO(
// // net::host::scclSocketInit(&state->listenSock, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeBootstrap,
// comm->abortFlag,
// // 0), ret, fail);
// // SCCLCHECKGOTO(net::host::scclSocketInit(&state->ringRecvSocket, NULL, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag, 0), ret,
// fail);
// // // Create socket for other ranks to contact me
// // SCCLCHECKGOTO(net::host::scclSocketListen(&state->listenSock), ret, fail);
// // // Get addr from next rank
// // SCCLCHECKGOTO(net::host::scclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail);
// // SCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(scclSocketAddress_t)), ret, fail);
// // SCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(scclSocketAddress_t)), ret, fail);
// // SCCLCHECKGOTO(net::host::scclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, net::host::scclSocketTypeBootstrap, comm->abortFlag, 0), ret,
// // fail); SCCLCHECKGOTO(net::host::scclSocketConnect(&state->ringSendSocket), ret, fail);
// // // Accept the connect request from the previous rank in the AllGather ring
// // SCCLCHECKGOTO(net::host::scclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail);
// // // AllGather all listen handlers
// // SCCLCHECKGOTO(scclCalloc(&state->peerCommAddresses, nranks), ret, fail);
// // memcpy(state->peerCommAddresses + rank, &listenAddr, sizeof(scclSocketAddress_t));
// // SCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(scclSocketAddress_t)), ret, fail);
// // if(parent->splitShare) {
// // /* map local rank to top parent local rank. */
// // for(int i = 0; i < nranks; ++i) {
// // comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
// // }
// // comm->proxyState = parent->sharedRes->proxyState;
// // scclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
// // } else {
// // // Create the service proxy
// // SCCLCHECKGOTO(scclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
// // SCCLCHECKGOTO(scclCalloc(&proxySocket, 1), ret, fail);
// // SCCLCHECKGOTO(
// // net::host::scclSocketInit(proxySocket, &bootstrap_net::bootstrapNetIfAddr, comm->magic, net::host::scclSocketTypeProxy, comm->abortFlag, 0),
// // ret,
// // fail);
// // SCCLCHECKGOTO(net::host::scclSocketListen(proxySocket), ret, fail);
// // SCCLCHECKGOTO(net::host::scclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
// // memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(scclSocketAddress_t));
// // SCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(scclSocketAddress_t)), ret, fail);
// // // SCCLCHECKGOTO(scclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
// // }
// // INFO(sccl_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
// exit:
// return ret;
// fail:
// goto exit;
// }
} // namespace bootstrap
} // namespace topology
} // namespace hardware
......
......@@ -4,47 +4,40 @@
#include "base.h"
#include "socket.h"
#include "bootstrap_utils.h"
#include "ipc_socket.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Atomically bump a reference count. Relaxed ordering is sufficient: the
// increment publishes no other memory, it only keeps the count accurate.
template <typename Int>
inline void scclAtomicRefCountIncrement(Int* refs) {
    __atomic_add_fetch(refs, 1, __ATOMIC_RELAXED);
}
////////////////////////////////////////////////////////////////////////////////////////////////
namespace bootstrap_net {
// Send `size` bytes over a bootstrap socket.
scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size);
// Receive `size` bytes from a bootstrap socket.
scclResult_t bootstrapNetRecv(scclSocket_t* sock, void* data, int size);
// Initialize the bootstrap network (interface discovery / address setup).
scclResult_t bootstrapNetInit();
} // namespace bootstrap_net
// Enqueue a connection that arrived before the matching receive was posted.
scclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock);
// Dequeue a stored unexpected connection matching (peer, tag); *found reports whether one existed.
scclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, scclSocket_t* sock, int* found);
// Release all resources held by the unexpected-connection queue.
static void unexpectedFree(struct bootstrapState* state);
// All-gather `size` bytes per rank across every rank of the communicator.
scclResult_t bootstrapAllGather(void* commState, void* allData, int size);
// Send data to a specific peer rank.
scclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
// Receive data from a specific peer rank.
scclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
////////////////////
scclResult_t bootstrapInit(struct scclUniqueId* unique_id, struct scclBootstrapComm* comm);
// scclResult_t bootstrapInit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm);
// scclResult_t
// bootstrapSplit(struct scclBootstrapHandle* handle, struct scclBootstrapComm* comm, struct scclBootstrapComm* parent, int color, int key, int* parentRanks);
typedef class sccl::hardware::net::ipc_socket::scclIpcSocket scclIpcSocket_t;
// Bootstrap network helper: owns the bootstrap interface name/address and an
// IPC socket used for intra-node communication.
class bootstrapNet {
public:
// Bind the helper to the given bootstrap communicator.
bootstrapNet(struct scclBootstrapComm* bootstrap_comm);
virtual ~bootstrapNet();
// Initialize the bootstrap network (guarded by bootstrapNetLock / bootstrapNetInitDone).
scclResult_t bootstrapNetInit();
// Send `size` bytes over a bootstrap socket.
scclResult_t bootstrapNetSend(scclSocket_t* sock, void* data, int size);
// Receive `size` bytes from a bootstrap socket.
scclResult_t bootstrapNetRecv(scclSocket_t* sock, void* data, int size);
public:
/* Init functions */
char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1];
scclSocketAddress_t bootstrapNetIfAddr;
private:
int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
// UNIX-domain IPC socket used for intra-node communication.
scclIpcSocket_t* ipcsocket = nullptr;
};
} // namespace bootstrap
} // namespace topology
......
#pragma once
#include <string.h>
#include "base.h"
#include "bootstrap_utils.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
//
/**
 * Hash a byte string with the DJB2a algorithm.
 *
 * @param string input bytes (need not be NUL-terminated)
 * @param n      number of bytes to hash
 * @return 64-bit hash value
 *
 * @note Recurrence: result = result * 33 ^ byte, seeded with 5381.
 */
static uint64_t getHash(const char* string, int n) {
    uint64_t hash = 5381;
    for(const char* p = string; p != string + n; ++p) {
        hash = (hash * 33) ^ *p; // (h << 5) + h == h * 33
    }
    return hash;
}
/**
 * @brief Get the host name, truncated at the first occurrence of `delim`.
 *
 * Fills `hostname` via gethostname() and cuts the string at the first
 * `delim` (or leaves it whole if none is found before the terminator).
 * On failure "unknown" is stored instead.
 *
 * @param hostname buffer receiving the NUL-terminated host name
 * @param maxlen   size of the buffer in bytes (must be >= 1)
 * @param delim    delimiter character at which to truncate
 * @return scclSuccess on success, scclSystemError if gethostname() failed
 */
static scclResult_t getHostName(char* hostname, int maxlen, const char delim) {
    if(gethostname(hostname, maxlen) != 0) {
        // snprintf always NUL-terminates, unlike the old strncpy when
        // maxlen was smaller than sizeof("unknown").
        snprintf(hostname, maxlen, "unknown");
        return scclSystemError;
    }
    // POSIX leaves it unspecified whether a truncated gethostname() result
    // is NUL-terminated, so terminate explicitly before scanning.
    hostname[maxlen - 1] = '\0';
    int i = 0;
    // Bounds check first so hostname[i] is never read out of range.
    while((i < maxlen - 1) && (hostname[i] != delim) && (hostname[i] != '\0'))
        i++;
    hostname[i] = '\0';
    return scclSuccess;
}
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
/**
 * Hash uniquely identifying the host, stable across bare-metal and
 * container instances.
 *
 * The identity string is, in order of preference:
 *  1. the SCCL_HOSTID environment variable, if set;
 *  2. "$(hostname)$(cat /proc/sys/kernel/random/boot_id)";
 *  3. the hostname alone, if the boot_id file cannot be read.
 *
 * @return 64-bit hash of the host identity string
 */
uint64_t getHostHash(void) {
    char hostHash[1024];
    char* hostId;
    // Fall back is the full hostname if something fails
    (void)getHostName(hostHash, sizeof(hostHash), '\0');
    int offset = strlen(hostHash);
    if((hostId = getenv("SCCL_HOSTID")) != NULL) {
        INFO(SCCL_LOG_BOOTSTRAP, "SCCL_HOSTID set by environment to %s", hostId);
        // Termination is guaranteed by the explicit NUL write below.
        strncpy(hostHash, hostId, sizeof(hostHash));
    } else {
        FILE* file = fopen(HOSTID_FILE, "r");
        if(file != NULL) {
            char* p;
            // %ms lets fscanf allocate the token buffer (glibc extension).
            if(fscanf(file, "%ms", &p) == 1) {
                strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
                free(p);
            }
            // BUG fix: fclose() used to run unconditionally, crashing with a
            // NULL stream whenever HOSTID_FILE could not be opened.
            fclose(file);
        }
    }
    // Make sure the string is terminated
    hostHash[sizeof(hostHash) - 1] = '\0';
    INFO(SCCL_LOG_BOOTSTRAP, "unique hostname '%s'", hostHash);
    return getHash(hostHash, strlen(hostHash));
}
/**
 * Hash uniquely identifying this process, stable across bare-metal and
 * container instances.  Equivalent to hashing:
 *   "$$ $(readlink /proc/self/ns/pid)"
 * i.e. the pid combined with the pid-namespace link target.
 *
 * @return 64-bit hash of the process identity string
 * @note If the namespace link cannot be read, only the pid is hashed.
 */
uint64_t getPidHash(void) {
    char ident[1024];
    // Begin with our own pid ($$).
    int used = snprintf(ident, sizeof(ident), "%ld", (long)getpid());
    // Append the pid-namespace link target, bounded to the remaining space.
    ssize_t linkLen = readlink("/proc/self/ns/pid", ident + used, sizeof(ident) - 1 - used);
    if(linkLen < 0)
        linkLen = 0; // readlink failed: hash the pid alone
    ident[used + linkLen] = '\0';
    INFO(SCCL_LOG_BOOTSTRAP, "unique PID '%s' ", ident);
    return getHash(ident, strlen(ident));
}
/**
 * @brief Fill a buffer with random bytes read from /dev/urandom.
 *
 * @param buffer destination for the random data
 * @param bytes  number of random bytes requested
 * @return scclSuccess on success, scclSystemError on any failure
 *
 * @note A request for zero bytes succeeds immediately without touching
 *       the buffer or opening the device.
 */
scclResult_t getRandomData(void* buffer, size_t bytes) {
    if(bytes == 0)
        return scclSuccess;
    scclResult_t status = scclSuccess;
    FILE* urandom = fopen("/dev/urandom", "r");
    // One fread of `bytes` items of size... actually one item of `bytes`
    // bytes; anything short of a full read is an error.
    if(buffer == NULL || urandom == NULL || fread(buffer, bytes, 1UL, urandom) != 1UL)
        status = scclSystemError;
    if(urandom != NULL)
        fclose(urandom);
    return status;
}
// Convert a logical hipDev index to its PCI bus ID, encoded as an int64.
/**
 * Look up the PCI bus ID of the given HIP device and convert it to a
 * 64-bit integer.
 *
 * @param hipDev logical HIP device index
 * @param busId  out parameter receiving the encoded 64-bit bus ID
 * @return scclSuccess on success; HIPCHECK/SCCLCHECK return early on error
 *
 * @note The bus ID usually has the form "0000:00:00.0"; the buffer reserves
 *       extra digits in case the PCI domain is wider.
 */
scclResult_t getBusId(int hipDev, int64_t* busId) {
    // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
    // format. Still need to allocate proper space in case PCI domain goes higher.
    char busIdStr[] = "00000000:00:00.0";
    HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev));
    SCCLCHECK(busIdToInt64(busIdStr, busId));
    return scclSuccess;
}
// Print the contents of a scclUniqueInfo structure (local rank 0 only).
void printUniqueInfo(struct scclUniqueInfo* info) {
    if(info->localRank == 0) {
        // %llu with an explicit cast keeps the format portable for uint64_t;
        // plain %lu is undefined behavior where long is 32 bits.
        printf("\n==========================================\nTotal Rank: %d/%d, Local Rank: %d/%d, CUDA Device ID/Cnt: %d/%d, \n"
               "Host Hash: %llu, PID Hash: %llu, gpu.name=%s, gcn=%s\n"
               "\n==========================================\n",
               info->rank,
               info->nRanks,
               info->localRank,
               info->localRanks,
               info->hipDev,
               info->deviceCnt,
               (unsigned long long)info->hostHash,
               (unsigned long long)info->pidHash,
               info->localNode.gpu.name,
               info->localNode.gpu.gcn);
        SCCLCHECK(net::printNetProps(&info->localNode.net.props, info->rank, info->localRank));
    }
    return;
}
} // namespace bootstrap
} // namespace topology
} // namespace hardware
......
......@@ -2,16 +2,18 @@
#include <string.h>
#include "base.h"
#include "ipcsocket.h"
#include "proxy.h"
#include "topo_utils.h"
#include "comm.h"
#include "rocm_smi_wrap.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
typedef net::host::scclSocketAddress scclSocketAddress_t;
typedef net::host::scclSocket scclSocket_t;
typedef net::net_socket::scclSocketAddress scclSocketAddress_t;
typedef net::net_socket::scclSocket scclSocket_t;
typedef net::scclNet_t scclNet_t;
// scclBootstrapHandle 结构体定义,用于存储引导句柄
struct scclBootstrapHandle {
......@@ -19,77 +21,93 @@ struct scclBootstrapHandle {
scclSocketAddress_t addr; // 地址,用于网络通信
};
struct scclProxyState {
int refCount; // 引用计数
int tpRank; // 当前线程的排名
int tpnRanks; // 线程组中线程的总数
int tpLocalnRanks; // 本地线程组中线程的总数
int cudaDev; // CUDA设备编号
int p2pnChannels; // 点对点通信的通道数
int p2pChunkSize; // 点对点通信的数据块大小
int nChannels; // 通道总数
int buffSizes[SCCL_NUM_PROTOCOLS]; // 各种协议的缓冲区大小
// 服务线程
pthread_t thread; // 线程ID
scclSocket_t* listenSock; // 监听套接字
int stop; // 停止标志
// 由主线程使用
scclSocketAddress_t* peerAddresses; // 对等体地址
scclSocket_t* peerSocks; // 对等体套接字
struct scclIpcSocket peerIpcSock; // cuMEM API支持(UDS)
// 进展线程
struct scclProxyProgressState progressState; // 进展状态
// 从代理预期的响应队列
struct scclExpectedProxyResponse* expectedResponses; // 预期的代理响应
// 定义硬件拓扑类型枚举
// Hardware topology node types.
typedef enum {
GPU = 0, // graphics processing unit
PCI = 1, // PCI interconnect node
XGMI = 2, // AMD XGMI link (the NVLink equivalent on NV hardware)
CPU = 3, // CPU node (in practice, a NUMA domain)
NIC = 4, // network interface controller
NET = 5 // network node
} topoNodeType_t;
// 定义每个rank所持有的所有拓扑节点
// All topology nodes owned by a single rank.
struct topoLocalNode {
struct {
int dev; // NVML device number
char name[8]; // device name
char gcn[7]; // GCN architecture name
int compCap; // CUDA compute capability
} gpu; // GPU node
struct {
scclSocketAddress_t socketAddr; // network address
} cpu; // CPU node
struct {
net::scclNetProperties_t props; // network device properties
int count; // number of network devices
} net; // network node
struct {
int64_t busId; // PCI bus ID encoded as int64_t
} pci; // PCI node
};
// scclUniqueInfo: per-rank description of one communication endpoint.
struct scclUniqueInfo {
struct topoLocalNode localNode; // topology nodes owned by this rank
int rank; // global rank of this endpoint
int nRanks; // total number of ranks
int localRank; // rank within the local compute node
int localRanks; // number of ranks on the local compute node
int deviceCnt; // number of devices
int hipDev; // CUDA device ID
uint64_t hostHash; // host hash value
uint64_t pidHash; // process ID hash value
};
// Collection of scclUniqueInfo entries for all communication endpoints.
struct scclUniqueInfoSet {
int nUniqueInfo; // number of communication endpoints
std::vector<struct scclUniqueInfo*> unique_info_vec;
};
// scclBootstrapComm 结构体定义,用于存储引导通信信息
struct scclBootstrapComm {
struct scclUniqueInfo unique_info; // 每个通信节点的基础信息
scclNet_t* scclNet;
struct scclUniqueInfo* unique_info; // 每个通信节点的基础信息
cpu_set_t cpuAffinity; // CPU亲和性
int WarpSize;
void* bootstrap; // 引导信息
uint64_t magic; // 魔术数,用于验证结构体
volatile uint32_t* abortFlag; // 中止标志
int splitShare; // 是否使用共享内存进行分割
int* topParentRanks; // 顶级父节点的rank
/* 与代理相关的共享资源 */
struct scclProxyState* proxyState;
};
volatile uint32_t* abortFlag; // 中止标志,非阻塞套接字设置
// extInfo: socket bootstrap exchange info sent from each rank to the root.
struct extInfo {
int rank; // rank of the sending process
int nranks; // total number of processes
scclSocketAddress_t extAddressListenRoot; // address listening for the root
scclSocketAddress_t extAddressListen; // address listening for peers
// int splitShare; // whether split communicators share resources
// int* topParentRanks; // ranks of the top-level parent
// /* shared resources related to the proxy */
// struct scclProxyState* proxyState;
};
// Node of the unexpected-connection list: a socket that connected before the
// matching receive was posted, keyed by (peer, tag).
struct unexConn {
int peer; // peer rank identifier
int tag; // tag distinguishing concurrent connections
scclSocket_t sock; // socket carrying the connection
struct unexConn* next; // next element of the singly linked list
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// 获取主机唯一标识的哈希值,该哈希值在裸机和容器实例中都是唯一的
uint64_t getHostHash(void);
// bootstrapState: per-communicator state used by the bootstrap protocol.
struct bootstrapState {
scclSocket_t listenSock; // listening socket
scclSocket_t ringRecvSocket; // socket receiving from the previous rank in the ring
scclSocket_t ringSendSocket; // socket sending to the next rank in the ring
scclSocketAddress_t* peerCommAddresses; // per-peer communication addresses
scclSocketAddress_t* peerProxyAddresses; // per-peer proxy addresses
struct unexConn* unexpectedConnections; // queue of early-arriving connections
int cudaDev; // CUDA device number
int rank; // rank of this process
int nranks; // total number of processes
uint64_t magic; // magic number validating the structure
volatile uint32_t* abortFlag; // abort flag
};
// 获取当前进程的唯一哈希标识符
uint64_t getPidHash(void);
// 从/dev/urandom设备获取随机数据填充缓冲区
scclResult_t getRandomData(void* buffer, size_t bytes);
// 获取指定CUDA设备的PCI总线ID并转换为64位整数
scclResult_t getBusId(int hipDev, int64_t* busId);
// 获取当前HIP设备的计算能力版本号
int scclCudaCompCap(void);
// 打印唯一的拓扑信息
void printUniqueInfo(struct scclUniqueInfo* info);
} // namespace bootstrap
} // namespace topology
......
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "ipcsocket.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET
#define SCCL_IPC_SOCKNAME_STR "/tmp/sccl-socket-%d-%lx"
/**
 * @brief Initialize an IPC socket.
 *
 * Creates a UNIX-domain datagram socket and binds it to a name derived from
 * (rank, hash).  Supports both Linux abstract sockets and regular
 * filesystem sockets, selected by USE_ABSTRACT_SOCKET.
 *
 * @param handle    scclIpcSocket to populate with the socket state
 * @param rank      rank used to build the unique socket name
 * @param hash      hash combined with `rank` to build the unique name
 * @param abortFlag if non-NULL, the socket is switched to non-blocking mode
 *                  so callers can poll the flag while retrying
 * @return scclSuccess on success, scclInternalError / scclSystemError on failure
 */
scclResult_t scclIpcSocketInit(scclIpcSocket* handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) {
    int fd = -1;
    struct sockaddr_un cliaddr;
    char temp[SCCL_IPC_SOCKNAME_LEN] = "";
    if(handle == NULL) {
        return scclInternalError;
    }
    handle->fd = -1;
    handle->socketName[0] = '\0';
    if((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
        WARN("UDS: Socket creation error : %d", errno);
        return scclSystemError;
    }
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    // Create unique name for the socket.
    int len = snprintf(temp, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, rank, hash);
    // Reject encoding errors (len < 0) explicitly; the old signed/unsigned
    // comparison silently converted them to "too large".
    if(len < 0 || (size_t)len > (sizeof(cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot bind provided name to socket. Name too large");
        close(fd); // BUG fix: the freshly created fd used to leak on this path
        return scclInternalError;
    }
#ifndef USE_ABSTRACT_SOCKET
    // Filesystem sockets: remove any stale node left by a previous run.
    unlink(temp);
#endif
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Creating socket %s", temp);
    // sun_path was zeroed by bzero, so the missing NUL from strncpy is fine.
    strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
    if(bind(fd, (struct sockaddr*)&cliaddr, sizeof(cliaddr)) < 0) {
        WARN("UDS: Binding to socket %s failed : %d", temp, errno);
        close(fd);
        return scclSystemError;
    }
    handle->fd = fd;
    strcpy(handle->socketName, temp);
    handle->abortFlag = abortFlag;
    // Mark socket as non-blocking so recv/send loops can honor abortFlag.
    if(handle->abortFlag) {
        int flags;
        EQCHECK(flags = fcntl(fd, F_GETFL), -1);
        SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
    }
    return scclSuccess;
}
/**
 * Close an IPC socket and release its resources.
 *
 * @param handle scclIpcSocket describing the socket to close
 * @return scclSuccess on success (including when there is nothing to close),
 *         scclInternalError when handle is NULL
 *
 * @note With USE_ABSTRACT_SOCKET defined there is no filesystem node to
 *       remove; otherwise the socket file is unlinked before closing.
 *       An invalid descriptor (fd <= 0) is treated as "already closed".
 */
scclResult_t scclIpcSocketClose(scclIpcSocket* handle) {
    if(handle == NULL)
        return scclInternalError;
    if(handle->fd <= 0)
        return scclSuccess; // nothing was ever opened
#ifndef USE_ABSTRACT_SOCKET
    // Filesystem-bound sockets leave a node behind; remove it first.
    if(handle->socketName[0] != '\0')
        unlink(handle->socketName);
#endif
    close(handle->fd);
    return scclSuccess;
}
/**
 * Receive a file descriptor over the IPC socket.
 *
 * @param handle scclIpcSocket carrying the connection state
 * @param recvFd out parameter receiving the duplicated file descriptor
 * @return scclSuccess on success,
 *         scclSystemError on a system-call or protocol failure,
 *         scclInternalError when the abort flag interrupts the wait
 *
 * @note Blocks (or spins, in non-blocking mode) until a message arrives or
 *       an error occurs.  The descriptor travels as SCM_RIGHTS ancillary
 *       data attached to a 1-byte dummy payload.
 * @warning The caller must pass a valid pointer in recvFd.
 */
scclResult_t scclIpcSocketRecvFd(scclIpcSocket* handle, int* recvFd) {
    struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
    struct iovec iov[1];
    // Union to guarantee alignment requirements for control array
    union {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    char dummy_buffer[1];
    int ret;
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    iov[0].iov_base = (void*)dummy_buffer;
    iov[0].iov_len = sizeof(dummy_buffer);
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    // Retry on EAGAIN/EWOULDBLOCK (non-blocking mode) and EINTR; bail out
    // if the abort flag is raised while waiting.
    while((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Receiving data over socket failed : %d", errno);
            return scclSystemError;
        }
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
    }
    // Validate the ancillary header: exactly one int, SOL_SOCKET/SCM_RIGHTS.
    if(((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
        if((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
            WARN("UDS: Receiving data over socket failed");
            return scclSystemError;
        }
        memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
    } else {
        WARN("UDS: Receiving data over socket %s failed", handle->socketName);
        return scclSystemError;
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
    return scclSuccess;
}
/**
 * Send a file descriptor to a peer over a UNIX-domain datagram socket.
 *
 * @param handle IPC socket used as the sending endpoint
 * @param sendFd file descriptor to pass to the peer
 * @param rank   destination rank (used to derive the peer socket name)
 * @param hash   hash combined with `rank` to build the peer socket name
 * @return scclSuccess on success,
 *         scclInternalError on internal error (name too long or aborted),
 *         scclSystemError on system-call failure
 *
 * @note The descriptor travels as SCM_RIGHTS ancillary data attached to a
 *       1-byte payload; with USE_ABSTRACT_SOCKET the Linux abstract socket
 *       namespace is used.
 */
scclResult_t scclIpcSocketSendFd(scclIpcSocket* handle, const int sendFd, int rank, uint64_t hash) {
    struct msghdr msg;
    struct iovec iov[1];
    char temp[SCCL_IPC_SOCKNAME_LEN];
    union {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    struct sockaddr_un cliaddr;
    // Zero the header up front (the recv side already does this) so no
    // field or implementation-specific padding is left indeterminate.
    memset(&msg, 0, sizeof(msg));
    // Construct client address to send this shareable handle to
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    int len = snprintf(temp, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, rank, hash);
    // Reject encoding errors (len < 0) explicitly; the old signed/unsigned
    // comparison silently converted them to "too large".
    if(len < 0 || (size_t)len > (sizeof(cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot connect to provided name for socket. Name too large");
        return scclInternalError;
    }
    // sun_path was zeroed by bzero, so the missing NUL from strncpy is fine.
    (void)strncpy(cliaddr.sun_path, temp, len);
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
#ifdef USE_ABSTRACT_SOCKET
    cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    cmptr = CMSG_FIRSTHDR(&msg);
    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
    cmptr->cmsg_level = SOL_SOCKET;
    cmptr->cmsg_type = SCM_RIGHTS;
    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(struct sockaddr_un);
    iov[0].iov_base = (void*)"";
    iov[0].iov_len = 1;
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    msg.msg_flags = 0;
    ssize_t sendResult;
    // Retry while the non-blocking send would block or is interrupted;
    // honor the abort flag while spinning.
    while((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        if(errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
            return scclSystemError;
        }
        if(handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
    }
    return scclSuccess;
}
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#pragma once
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <memory.h>
#include <sys/un.h>
#include <inttypes.h>
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
#define SCCL_IPC_SOCKNAME_LEN 64
// IPC socket handle: one UNIX-domain datagram socket used to pass file
// descriptors between processes on the same node.
struct scclIpcSocket {
int fd; // socket file descriptor
char socketName[SCCL_IPC_SOCKNAME_LEN]; // bound socket name
volatile uint32_t* abortFlag; // flag used to abort blocking operations
};
// Create and bind the IPC socket for (rank, hash); non-blocking if abortFlag is set.
scclResult_t scclIpcSocketInit(struct scclIpcSocket* handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
// Close the IPC socket and remove its filesystem node if one exists.
scclResult_t scclIpcSocketClose(struct scclIpcSocket* handle);
// Receive a file descriptor (SCM_RIGHTS) over the IPC socket.
scclResult_t scclIpcSocketRecvFd(struct scclIpcSocket* handle, int* fd);
// Send a file descriptor (SCM_RIGHTS) to the peer identified by (rank, hash).
scclResult_t scclIpcSocketSendFd(struct scclIpcSocket* handle, const int fd, int rank, uint64_t hash);
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#include <sys/syscall.h>
#include <assert.h>
#include "proxy.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {}
} // namespace topology
} // namespace hardware
} // namespace sccl
// static bool NeedProxy(int type, int pattern, int root, struct scclRing* ring, int nranks) {
// if(pattern == scclPatternRing || pattern == scclPatternRingTwice)
// return true;
// /* In chains, one rank does not need a proxy. Let's figure out which one it is */
// /* Which index in the reorganized rings should we compare root against */
// const int myrank = 0, nextrank = 1, prevrank = nranks - 1;
// int index = pattern == scclPatternPipelineFrom ?
// /* no recv / no send if root = */
// /* bcast */ (type == proxyRecv ? myrank : nextrank)
// :
// /* reduce */ (type == proxyRecv ? prevrank : myrank);
// int rank = ring->userRanks[index];
// return (root != rank);
// }
// #define PROXYARGS_ALLOCATE_SIZE SCCL_MAX_OPS
// struct scclProxyPool {
// struct scclProxyPool* next;
// struct scclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
// };
// static void expectedProxyResponseFree(struct scclProxyState* state) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// while(elem) {
// prev = elem;
// elem = elem->next;
// free(prev->respBuff);
// free(prev);
// }
// }
// static scclResult_t expectedProxyResponseStore(struct scclProxyState* state, void* opId, void* respBuff, int respSize) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// while(elem) {
// if(elem->opId == opId) {
// if(respSize != elem->respSize) {
// WARN("Mismatched response size for opId=%p", opId);
// return scclInternalError;
// }
// if(elem->done) {
// WARN("Storing response for already completed opId=%p", opId);
// return scclInternalError;
// }
// memcpy(elem->respBuff, respBuff, respSize);
// free(respBuff);
// elem->done = true;
// return scclSuccess;
// }
// elem = elem->next;
// }
// WARN("Proxy response for opId=%p doesn't match any expected response", opId);
// return scclInternalError;
// }
// static scclResult_t expectedProxyResponseEnqueue(struct scclProxyState* state, void* opId, int respSize) {
// struct scclExpectedProxyResponse* ex;
// scclCHECK(scclCalloc(&ex, 1));
// ex->opId = opId;
// // Pre-alloc response buffer
// ex->respBuff = malloc(respSize);
// ex->respSize = respSize;
// ex->done = false;
// // Enqueue
// struct scclExpectedProxyResponse* list = state->expectedResponses;
// if(list == NULL) {
// state->expectedResponses = ex;
// return scclSuccess;
// }
// while(list->next)
// list = list->next;
// list->next = ex;
// return scclSuccess;
// }
// static scclResult_t expectedProxyResponseDequeue(struct scclProxyState* state, void* opId, void* respBuff, int* found) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// *found = 0;
// while(elem) {
// if((elem->opId == opId) && elem->done) {
// if(prev == NULL) {
// state->expectedResponses = elem->next;
// } else {
// prev->next = elem->next;
// }
// memcpy(respBuff, elem->respBuff, elem->respSize);
// free(elem->respBuff);
// free(elem);
// *found = 1;
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// return scclSuccess;
// }
// static scclResult_t expectedProxyResponseRemove(struct scclProxyState* state, void* opId) {
// struct scclExpectedProxyResponse* elem = state->expectedResponses;
// struct scclExpectedProxyResponse* prev = NULL;
// while(elem) {
// if(elem->opId == opId) {
// if(prev == NULL) {
// state->expectedResponses = elem->next;
// } else {
// prev->next = elem->next;
// }
// free(elem->respBuff);
// free(elem);
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// WARN("Couldn't find opId=%p", opId);
// return scclInternalError;
// }
// static scclResult_t asyncProxyOpEnqueue(struct scclProxyLocalPeer* peer, scclProxyAsyncOp* op) {
// scclProxyAsyncOp* list = peer->asyncOps;
// if(list == NULL) {
// peer->asyncOps = op;
// return scclSuccess;
// }
// while(list->next)
// list = list->next;
// list->next = op;
// return scclSuccess;
// }
// static scclResult_t asyncProxyOpDequeue(struct scclProxyLocalPeer* peer, scclProxyAsyncOp* op) {
// struct scclProxyAsyncOp* elem = peer->asyncOps;
// struct scclProxyAsyncOp* prev = NULL;
// while(elem) {
// if(elem->opId == op->opId) {
// if(prev == NULL) {
// peer->asyncOps = elem->next;
// } else {
// prev->next = elem->next;
// }
// if(elem->reqBuff) {
// free(elem->reqBuff);
// }
// if(elem->respBuff) {
// free(elem->respBuff);
// }
// free(elem);
// return scclSuccess;
// }
// prev = elem;
// elem = elem->next;
// }
// if(op) {
// WARN("Attempting to dequeue nonexistent async opId=%p", op->opId);
// } else {
// WARN("Attempting to dequeue null operation");
// }
// return scclInternalError;
// }
// static scclResult_t allocateArgs(struct scclProxyProgressState* state, struct scclProxyArgs** argsptr) {
// struct scclProxyArgs* elem;
// if(state->pool == NULL) {
// // Allocate a new pool of elements. Make sure we allocate the memory close
// // to the network thread
// struct scclProxyPool* newPool;
// scclCHECK(scclCalloc(&newPool, 1));
// struct scclProxyArgs* newElems = newPool->elems;
// // Chain newly allocated elements
// for(int i = 0; i < PROXYARGS_ALLOCATE_SIZE; i++) {
// if(i + 1 < PROXYARGS_ALLOCATE_SIZE)
// newElems[i].next = newElems + i + 1;
// }
// // Add them all to the pool list
// state->pool = newElems;
// // Save the pool memory block for later resource release
// newPool->next = state->pools;
// state->pools = newPool;
// }
// elem = state->pool;
// state->pool = state->pool->next;
// elem->next = elem->nextPeer = NULL;
// *argsptr = elem;
// return scclSuccess;
// }
// // #define DEBUG_PROXY 1
// #ifdef DEBUG_PROXY
// #define DEBUG_PROXY_PRINT printf
// #else
// #define DEBUG_PROXY_PRINT(...)
// #endif
// #define OP_INDEX(op) ((op) ? (op) - state->pools->elems : -1)
// #define OP_SEEN 0x100000
// scclResult_t getOpIndex(struct scclProxyArgs* op, struct scclProxyProgressState* state, int* poolIndex, int* opIndex) {
// struct scclProxyPool* pool = state->pools;
// int p = 0;
// while(pool) {
// uint64_t o = op - pool->elems;
// if(o < PROXYARGS_ALLOCATE_SIZE) {
// *opIndex = o;
// *poolIndex = p;
// return scclSuccess;
// }
// pool = pool->next;
// p++;
// }
// WARN("Could not find pool of op %p", op);
// return scclInternalError;
// }
// scclResult_t printProxyOp(struct scclProxyArgs* op, int poolIndex, int opIndex) {
// printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == scclPatternSend ? "Send" : op->pattern == scclPatternRecv ? "Recv" : "Coll");
// for(int s = 0; s < op->nsubs; s++) {
// struct scclProxySubArgs* sub = op->subs + s;
// if(op->state == scclProxyOpProgress) {
// char status = ' ';
// if(op->pattern == scclPatternRecv) {
// if(sub->posted < sub->nsteps && sub->posted < sub->done + SCCL_STEPS)
// status = 'I'; // Init
// else if(sub->received < sub->posted)
// status = 'R'; // Receiving
// else if(sub->received < sub->transmitted)
// status = 'R'; // Receiving
// else if(sub->transmitted < sub->received)
// status = 'F'; // Flushing
// else if(sub->done < sub->transmitted)
// status = 'G'; // Waiting on GPU
// else
// status = 'D'; // Done
// } else if(op->pattern == scclPatternSend) {
// if(sub->posted < sub->nsteps && sub->posted < sub->done + SCCL_STEPS)
// status = 'I'; // Init
// else if(sub->transmitted < sub->posted)
// status = 'G'; // Waiting on GPU
// else if(sub->done < sub->transmitted)
// status = 'S'; // Sending
// else
// status = 'D'; // Done
// }
// printf(" %d%c/%d", sub->peer, status, sub->channelId);
// } else {
// printf(" %d/%d", sub->peer, sub->channelId);
// }
// }
// printf("]");
// return scclSuccess;
// }
// scclResult_t dumpProxyState(struct scclProxyProgressState* state) {
// struct scclProxyArgs* op = state->active;
// int poolIndex, opIndex;
// printf("ACTIVE OPS\n");
// while(op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if(op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// scclCHECK(printProxyOp(op, poolIndex, opIndex));
// op->state |= OP_SEEN;
// printf("\n");
// struct scclProxyArgs* nextOp = op->nextPeer;
// while(nextOp) {
// scclCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
// if(nextOp->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// printf("| `-> ");
// scclCHECK(printProxyOp(nextOp, poolIndex, opIndex));
// nextOp->state |= OP_SEEN;
// printf("\n");
// if(nextOp->next) {
// WARN("Inactive op has next set!");
// }
// nextOp = nextOp->nextPeer;
// }
// if(op->nextPeer == NULL)
// printf("|\n");
// op = op->next;
// printf("v\n");
// }
// printf("[X]\n");
// #if 0
// printf("FREE OPS\n");
// op = state->pool;
// while (op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if (op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// scclCHECK(printProxyOp(op, poolIndex, opIndex));
// op->state |= OP_SEEN;
// printf("->");
// op = op->next;
// }
// printf("[X]\n");
// #else
// op = state->pool;
// while(op) {
// scclCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
// if(op->state & OP_SEEN) {
// WARN("List loop at element %d-%d", poolIndex, opIndex);
// }
// op->state |= OP_SEEN;
// op = op->next;
// }
// #endif
// struct scclProxyPool* pool = state->pools;
// poolIndex = 0;
// while(pool) {
// struct scclProxyArgs* elem = pool->elems;
// for(int e = 0; e < PROXYARGS_ALLOCATE_SIZE; e++, elem++) {
// if((elem->state & OP_SEEN) == 0) {
// printf("Elem %d-%d is not in any list:\n", poolIndex, e);
// scclCHECK(printProxyOp(elem, poolIndex, e));
// printf("\n");
// } else {
// elem->state -= OP_SEEN;
// }
// }
// pool = pool->next;
// poolIndex++;
// }
// return scclSuccess;
// }
// static scclResult_t scclProxyOpToArgs(struct scclProxyOp* op, struct scclProxyArgs* args, int subIndex) {
// struct scclProxySubArgs* sub = args->subs + subIndex;
// if(subIndex >= SCCL_PROXY_MAX_SUBS) {
// WARN("Proxy append out of bounds");
// return scclInternalError;
// }
// // memset(sub, 0, sizeof(struct scclProxySubArgs));
// sub->connection = op->connection;
// sub->channelId = op->channelId;
// sub->nsteps = op->nsteps;
// sub->nbytes = op->nbytes;
// sub->peer = op->root;
// args->nsubs = subIndex + 1;
// if(subIndex) {
// if((args->sliceSteps != op->sliceSteps) || (args->chunkSteps != op->chunkSteps) || (args->protocol != op->protocol) || (args->dtype != op->dtype) ||
// (args->redOp != op->redOp)) {
// WARN("Proxy append mismatch");
// return scclInternalError;
// }
// if(args->state != scclProxyOpReady) {
// WARN("Proxy append on running operation");
// return scclInternalError;
// }
// return scclSuccess;
// }
// // memset(&args->progress, 0, sizeof(struct scclProxyArgs)-offsetof(struct scclProxyArgs, progress));
// args->done = 0;
// args->opCount = op->opCount;
// args->sliceSteps = op->sliceSteps;
// args->chunkSteps = op->chunkSteps;
// args->chunkSize = op->chunkSize;
// args->dtype = op->dtype;
// args->redOp = op->redOp;
// args->pattern = op->pattern;
// args->protocol = op->protocol;
// args->state = scclProxyOpReady;
// args->progress = op->connection->tcomm->proxyProgress;
// args->proxyAppendPtr = op->connection->proxyAppendPtr;
// return scclSuccess;
// }
// static scclResult_t ProxyAppend(struct scclProxyProgressState* state, struct scclProxyOp* op) {
// struct scclProxyConnection* connection = op->connection;
// int shared = connection->shared;
// struct scclProxyArgs* args = *connection->proxyAppendPtr;
// if(args) {
// if(shared && args->opCount == op->opCount) {
// scclCHECK(scclProxyOpToArgs(op, args, args->nsubs));
// DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args));
// } else {
// struct scclProxyArgs* prevArgs = args;
// scclCHECK(allocateArgs(state, &args));
// scclCHECK(scclProxyOpToArgs(op, args, 0));
// prevArgs->nextPeer = args;
// DEBUG_PROXY_PRINT(
// "Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs));
// *(args->proxyAppendPtr) = args;
// }
// } else {
// // Nothing running for that peer. Add to the list
// scclCHECK(allocateArgs(state, &args));
// scclCHECK(scclProxyOpToArgs(op, args, 0));
// if(state->active == NULL) {
// // Create the list
// DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount);
// state->active = args;
// } else {
// // Append element at the end of the list
// struct scclProxyArgs* last = state->active;
// while(last->next)
// last = last->next;
// last->next = args;
// DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount);
// }
// *(args->proxyAppendPtr) = args;
// }
// return scclSuccess;
// }
// scclResult_t scclProxyPost(struct scclProxyOpsPool* pool, int nextOps, int nextOpsEnd) {
// pthread_mutex_lock(&pool->mutex);
// if(pool->nextOps == -1) {
// pool->nextOps = nextOps;
// pthread_cond_signal(&pool->cond);
// } else {
// pool->ops[pool->nextOpsEnd].next = nextOps;
// }
// pool->nextOpsEnd = nextOpsEnd;
// pthread_mutex_unlock(&pool->mutex);
// return scclSuccess;
// }
// static scclResult_t scclLocalOpAppend(struct scclComm* comm, struct scclProxyConnector* proxyConn, struct scclProxyOp* proxyOp) {
// int tpLocalRank = comm->topParentLocalRanks[comm->localRank];
// struct scclProxyOps* proxyOps = comm->proxyState->proxyOps;
// if(proxyOps == NULL)
// return scclInternalError;
// proxyOps += proxyConn->tpLocalRank;
// struct scclProxyOpsPool* pool = proxyOps->pool;
// TIME_START(0);
// int opIndex = proxyOps->freeOp;
// struct scclProxyOp* op;
// if(opIndex != -1) {
// op = pool->ops + opIndex;
// proxyOps->freeOp = op->next;
// } else {
// int freeOp;
// while((freeOp = pool->freeOps[tpLocalRank]) == -1)
// sched_yield();
// int freeOpNew;
// while((freeOpNew = __sync_val_compare_and_swap(pool->freeOps + tpLocalRank, freeOp, -1)) != freeOp)
// freeOp = freeOpNew;
// opIndex = freeOp;
// op = pool->ops + opIndex;
// proxyOps->freeOp = op->next;
// }
// if(op->next != -1)
// __builtin_prefetch(pool->ops + op->next); // Prefetch next free op
// memcpy(op, proxyOp, sizeof(struct scclProxyOp));
// op->next = -1;
// op->connection = proxyConn->connection;
// if(proxyOps->nextOps == -1) {
// proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex;
// } else {
// pool->ops[proxyOps->nextOpsEnd].next = opIndex;
// proxyOps->nextOpsEnd = opIndex;
// }
// if(++proxyOps->count == MAX_OPS_PER_PEER) {
// // Post what we have so far to free some ops in the pool
// // Do not post last operations as we could have more coming with the same opCount, and posting
// // them in different batches would break proxyArgs aggregation with subs.
// uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount;
// int lastOp = -1;
// int toSend = 0;
// int ops = 0;
// for(int op = proxyOps->nextOps; op != proxyOps->nextOpsEnd; op = pool->ops[op].next) {
// ops++;
// if(pool->ops[op].opCount != lastOpCount) {
// lastOp = op;
// toSend = ops;
// }
// }
// if(lastOp == -1) {
// WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
// return scclInternalError;
// }
// // Cut chain at lastOp
// int nextOps = proxyOps->nextOps;
// proxyOps->nextOps = pool->ops[lastOp].next;
// pool->ops[lastOp].next = -1;
// scclCHECK(scclProxyPost(proxyOps->pool, nextOps, lastOp));
// proxyOps->count -= toSend;
// }
// TIME_STOP(0);
// return scclSuccess;
// }
// static scclResult_t
// SaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex, bool* justInquire) {
// if(peer < 0)
// return scclSuccess;
// struct scclChannelPeer* peerComm = channel->peers[peer];
// struct scclConnector* connector = type == proxyRecv ? peerComm->recv + connIndex : peerComm->send + connIndex;
// if(connector->transportComm == NULL) {
// WARN("Rank %d has no transport for %s peer %d on channel %d/%d", comm->rank, type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
// return scclInternalError;
// }
// if(connector->transportComm->proxyProgress == NULL)
// return scclSuccess;
// if(justInquire)
// *justInquire = true;
// else {
// scclCHECK(scclLocalOpAppend(comm, &connector->proxyConn, op));
// }
// return scclSuccess;
// }
// scclResult_t mscclSaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex) {
// scclCHECK(SaveProxy(comm, channel, type, peer, op, connIndex, nullptr));
// return scclSuccess;
// }
// // justInquire != nullptr means don't actually do anything, just ascertain need of
// // scclProxySaveOp for this op.
// scclResult_t scclProxySaveOp(struct scclComm* comm, struct scclProxyOp* op, bool* justInquire) {
// struct scclChannel* channel = &comm->channels[op->channelId];
// if(justInquire)
// *justInquire = false;
// switch(op->pattern) {
// case scclPatternRing:
// case scclPatternRingTwice:
// case scclPatternPipelineFrom:
// case scclPatternPipelineTo: {
// struct scclRing* ring = &channel->ring;
// if(NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
// }
// if(NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
// scclCHECK(SaveProxy(comm, channel, proxySend, ring->next, op, op->connIndex, justInquire));
// }
// } break;
// case scclPatternTreeUp:
// case scclPatternTreeDown:
// case scclPatternTreeUpDown: {
// if(op->pattern != scclPatternTreeDown) { // Tree up
// struct scclTree* tree = &channel->tree;
// for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++) {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, tree->down[i], op, 0, justInquire));
// }
// scclCHECK(SaveProxy(comm, channel, proxySend, tree->up, op, 0, justInquire));
// }
// if(op->pattern != scclPatternTreeUp) { // Tree down
// struct scclTree* tree = &channel->tree;
// for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++) {
// scclCHECK(SaveProxy(comm, channel, proxySend, tree->down[i], op, 0, justInquire));
// }
// scclCHECK(SaveProxy(comm, channel, proxyRecv, tree->up, op, 0, justInquire));
// }
// } break;
// case scclPatternCollnetChain: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->collnetChain.up, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetChain.up, op, 0, justInquire));
// } break;
// case scclPatternCollnetDirect: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->collnetDirect.out, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->collnetDirect.out, op, 0, justInquire));
// } break;
// case scclPatternNvls: {
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.out, op, 1, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.out, op, 0, justInquire));
// } break;
// case scclPatternNvlsTree: {
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[1], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeDown[2], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeUp, op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[1], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
// scclCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
// } break;
// case scclPatternSend:
// case scclPatternRecv: {
// if(op->root == comm->rank)
// return scclSuccess;
// scclCHECK(SaveProxy(comm, channel, op->pattern == scclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
// } break;
// }
// return scclSuccess;
// }
// SCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
// scclResult_t scclProxyComputeP2p(struct scclInfo* info, struct scclProxyOp* op) {
// memset(op, 0, sizeof(struct scclProxyOp));
// int channelId = info->channelId;
// struct scclChannel* channel = info->comm->channels + channelId;
// op->channelId = channelId;
// op->sliceSteps = P2P_SLICESTEPS;
// op->chunkSteps = P2P_CHUNKSTEPS;
// op->dtype = info->datatype;
// op->protocol = info->protocol;
// int stepSize = info->comm->buffSizes[op->protocol] / SCCL_STEPS;
// if(op->protocol == SCCL_PROTO_SIMPLE)
// stepSize = info->comm->p2pChunkSize;
// #ifdef HCU_SDMA_FEATURE
// info->chunkSize = info->comm->p2pRealChunkSize;
// #else
// info->chunkSize = stepSize;
// #endif
// op->root = info->root;
// struct scclChannelPeer* peer = channel->peers[op->root];
// if(info->coll == scclFuncSend) {
// op->pattern = scclPatternSend;
// if(op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
// // Tune chunk size for the network
// if(info->count < stepSize)
// info->chunkSize /= 4;
// else if(info->count < 8 * stepSize)
// info->chunkSize /= 2;
// }
// } else if(info->coll == scclFuncRecv) {
// op->pattern = scclPatternRecv;
// if(op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
// // Tune chunk size for the network
// if(info->count < stepSize)
// info->chunkSize /= 4;
// else if(info->count < 8 * stepSize)
// info->chunkSize /= 2;
// }
// } else {
// WARN("P2p operation is neither send or recv");
// return scclInternalError;
// }
// if(scclParamChunkSize() != 0) {
// info->chunkSize = scclParamChunkSize();
// }
// op->chunkSize = info->chunkSize;
// // Compute nSteps for proxies
// int chunkEffectiveSize = op->chunkSize;
// if(op->protocol == SCCL_PROTO_LL) {
// chunkEffectiveSize /= 2;
// }
// op->nbytes = stepSize;
// op->nsteps = DIVUP(info->count, chunkEffectiveSize);
// if(op->nsteps == 0)
// op->nsteps = 1;
// return scclSuccess;
// }
// static scclResult_t removeOp(struct scclProxyProgressState* state, struct scclProxyArgs** opPtr, struct scclProxyArgs** prevOpPtr) {
// struct scclProxyArgs* freeOp = *opPtr;
// struct scclProxyArgs* next = freeOp->next;
// DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next));
// *opPtr = next;
// if(freeOp->nextPeer) {
// // replace op by nextPeer
// struct scclProxyArgs* nextPeer = freeOp->nextPeer;
// if(*prevOpPtr) {
// (*prevOpPtr)->next = nextPeer;
// } else {
// state->active = nextPeer;
// }
// nextPeer->next = next;
// *(prevOpPtr) = nextPeer;
// } else {
// *(freeOp->proxyAppendPtr) = NULL;
// if(*prevOpPtr) {
// (*prevOpPtr)->next = next;
// } else {
// state->active = next;
// }
// }
// freeOp->next = state->pool;
// state->pool = freeOp;
// DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
// #ifdef DEBUG_PROXY
// scclCHECK(dumpProxyState(state));
// #endif
// return scclSuccess;
// }
// static scclResult_t progressOps(struct scclProxyState* proxyState, struct scclProxyProgressState* state, struct scclProxyArgs* opStart, int* idle) {
// struct scclProxyArgs* prevOp = NULL;
// struct scclProxyArgs* op = opStart;
// while(op) {
// if(op->state == scclProxyOpNone)
// return scclInternalError;
// TIME_START(0);
// TIME_START(1);
// scclCHECK(op->progress(proxyState, op));
// if(op->idle) {
// TIME_STOP(1);
// TIME_CANCEL(0);
// } else {
// TIME_CANCEL(1);
// TIME_STOP(0);
// }
// *idle &= op->idle;
// if(op->state == scclProxyOpNone) {
// TIME_START(2);
// scclCHECK(removeOp(state, &op, &prevOp));
// TIME_STOP(2);
// } else {
// prevOp = op;
// op = op->next;
// }
// }
// return scclSuccess;
// }
// SCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
// static scclResult_t scclProxyGetPostedOps(struct scclProxyState* proxyState, int* added) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(state->opsPool == NULL)
// return scclInternalError;
// struct scclProxyOpsPool* pool = state->opsPool;
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// if(state->nextOps != -1)
// goto process_nextops;
// // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
// // to be available. Exit, continue progress, and come back later.
// if(state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0))
// return scclSuccess;
// if(state->active == NULL) {
// pthread_mutex_lock(&pool->mutex);
// while(pool->nextOps == -1 && !state->stop) {
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileSleep);
// pthread_cond_wait(&pool->cond, &pool->mutex);
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileWakeup);
// }
// if(state->stop) { // We might have been woken up to stop.
// pthread_mutex_unlock(&pool->mutex);
// return scclSuccess;
// }
// }
// state->nextOps = pool->nextOps;
// pool->nextOps = pool->nextOpsEnd = -1;
// pthread_mutex_unlock(&pool->mutex);
// if(state->nextOps == -1)
// return scclInternalError;
// process_nextops:
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileAppend);
// TIME_START(2);
// int freeOp[SCCL_MAX_LOCAL_RANKS];
// int freeOpEnd[SCCL_MAX_LOCAL_RANKS];
// for(int i = 0; i < proxyState->tpLocalnRanks; i++)
// freeOp[i] = -1;
// uint64_t lastOpCount = 0;
// int lastPeer = -1;
// int count = 0;
// for(int opIndex = state->nextOps; opIndex != -1;) {
// struct scclProxyOp* peerOp = pool->ops + opIndex;
// int peer = opIndex / MAX_OPS_PER_PEER;
// if((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer))
// count++;
// if(count == scclParamProxyAppendBatchSize() + 1)
// break;
// lastOpCount = peerOp->opCount;
// lastPeer = peer;
// if(peerOp->connection == NULL)
// return scclInternalError;
// if(peerOp->next != -1)
// __builtin_prefetch(pool->ops + peerOp->next);
// scclCHECK(ProxyAppend(state, peerOp));
// (*added)++;
// int lastOpIndex = opIndex;
// opIndex = peerOp->next;
// // Return op to peer pool
// if(freeOp[peer] == -1) {
// freeOpEnd[peer] = lastOpIndex;
// } else {
// peerOp->next = freeOp[peer];
// }
// freeOp[peer] = lastOpIndex;
// state->nextOps = opIndex;
// }
// for(int i = 0; i < proxyState->tpLocalnRanks; i++) {
// if(freeOp[i] == -1)
// continue;
// int newFree = freeOp[i];
// int oldFree = pool->freeOps[i];
// pool->ops[freeOpEnd[i]].next = oldFree;
// if(oldFree == -1) {
// // Nothing for the main thread to consume, we can set it.
// pool->freeOps[i] = newFree;
// } else {
// // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked.
// int swap = __sync_val_compare_and_swap(pool->freeOps + i, oldFree, newFree);
// if(swap != oldFree) {
// if(swap != -1)
// return scclInternalError;
// // Ops were recycled while we were trying to swap, just set the value directly now.
// pool->ops[freeOpEnd[i]].next = -1;
// pool->freeOps[i] = newFree;
// }
// }
// }
// profArgs.opCount = *added;
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileAppendEnd);
// TIME_STOP(2);
// return scclSuccess;
// }
// #include <signal.h>
// static scclProxyProgressState* scclLastProxyState;
// void scclDumpProxyState(int signal) { dumpProxyState(scclLastProxyState); }
// SCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);
// static int setProxyThreadContext(struct scclProxyState* proxyState) {
// #if CUDART_VERSION >= 11030
// static int createThreadContext = -1;
// if(createThreadContext == -1) {
// createThreadContext = scclParamCreateThreadContext();
// if(createThreadContext) {
// if(CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
// WARN("Unable to create thread context due to old driver, disabling.");
// createThreadContext = 0;
// }
// }
// }
// if(createThreadContext) {
// if(proxyState->cudaCtx == NULL) {
// if(CUPFN(cuCtxCreate(&proxyState->cudaCtx, CU_CTX_SCHED_SPIN | CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
// WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
// createThreadContext = 0;
// }
// } else {
// if(CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
// WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
// return 0;
// }
// return 1;
// }
// }
// #endif
// return 0;
// }
// // Set to SIGUSR1 or SIGUSR2 to help debug proxy state during hangs
// SCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1);
// SCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
// void* scclProxyProgress(void* proxyState_) {
// struct scclProxyState* proxyState = (struct scclProxyState*)proxyState_;
// if(setProxyThreadContext(proxyState)) {
// INFO(SCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
// } else if(cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
// WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
// }
// // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// struct scclProxyProgressState* state = &proxyState->progressState;
// state->nextOps = -1;
// const int sig = scclParamProxyDumpSignal();
// if(sig != -1)
// signal(sig, scclDumpProxyState);
// scclLastProxyState = state;
// char threadName[SCCL_THREAD_NAMELEN];
// snprintf(threadName, SCCL_THREAD_NAMELEN, "sccl Progress%2d", proxyState->cudaDev);
// nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
// int lastIdle = 0;
// /* Too frequent call of scclProxyGetPostedOps() will result in perf regression for small message
// * communication. proxyOpAppendCounter is a counter that helps us decide if we need to append proxy ops.
// * After each progress, proxyOpAppendCounter will increase by 1 and compare with environment variable
// * scclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the
// * frequency of calling scclProxyGetPostedOps() and reduce the perf impact. */
// int proxyOpAppendCounter = 0;
// struct scclProxyArgs profArgs; // Only used for profiling purposes
// while((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) {
// int idle = 1;
// scclResult_t ret = progressOps(proxyState, state, state->active, &idle);
// if(ret != scclSuccess) {
// INFO(SCCL_ALL, "%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
// return NULL;
// }
// if(lastIdle == 0 && idle == 1)
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileIdle);
// if(lastIdle == 1 && idle == 0)
// scclProfilingRecord(&profArgs, 0, 0, scclProxyProfileActive);
// if(idle || (++proxyOpAppendCounter == scclParamProgressAppendOpFreq())) {
// int added = 0;
// proxyOpAppendCounter = 0;
// TIME_START(3);
// if(state->stop == false)
// ret = scclProxyGetPostedOps(proxyState, &added);
// if(added) {
// TIME_STOP(3);
// } else {
// TIME_CANCEL(3);
// }
// if(ret != scclSuccess) {
// INFO(SCCL_ALL, "%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
// }
// if(added == 0) {
// sched_yield(); // No request progressed. Let others run.
// }
// }
// lastIdle = idle;
// }
// return NULL;
// }
// scclResult_t scclProxyStart(struct scclComm* comm) {
// struct scclProxyOps* proxyOps = comm->proxyState->proxyOps;
// if(proxyOps == NULL)
// return scclSuccess;
// TIME_START(1);
// for(int r = 0; r < comm->sharedRes->tpNLocalRanks; r++) {
// struct scclProxyOps* ops = proxyOps + r;
// if(ops->pool == NULL || ops->nextOps == -1)
// continue;
// scclCHECK(scclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd));
// ops->nextOps = ops->nextOpsEnd = -1;
// ops->count = 0;
// }
// comm->opCount++;
// TIME_STOP(1);
// return scclSuccess;
// }
// static scclResult_t scclProxyProgressCreate(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(!state->thread) {
// pthread_create(&state->thread, NULL, scclProxyProgress, proxyState);
// scclSetThreadName(state->thread, "sccl Progress%2d", proxyState->tpLocalnRanks);
// }
// return scclSuccess;
// }
// scclResult_t scclProxyProgressDestroy(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// // Request the proxy to stop and then wake it
// if(state->opsPool) {
// pthread_mutex_lock(&state->opsPool->mutex);
// state->stop = true;
// pthread_cond_signal(&state->opsPool->cond);
// pthread_mutex_unlock(&state->opsPool->mutex);
// pthread_join(state->thread, NULL);
// }
// // Free off any memory allocated for the proxy arg pools
// while(state->pools != NULL) {
// struct scclProxyPool* next = state->pools->next;
// free(state->pools);
// state->pools = next;
// }
// scclProfilingDump();
// TIME_PRINT("Proxy");
// return scclSuccess;
// }
// #define SCCL_PROXY_CONN_POOL_SIZE_POW2 7
// #define SCCL_PROXY_CONN_POOL_SIZE (1 << (SCCL_PROXY_CONN_POOL_SIZE_POW2))
// #define SCCL_PROXY_CONN_POOL_MASK ((SCCL_PROXY_CONN_POOL_SIZE) - 1)
// struct scclProxyConnectionPool {
// struct scclProxyConnection** pools;
// int banks;
// int offset;
// };
// static scclResult_t scclProxyNewConnection(struct scclProxyConnectionPool* pool, int* id) {
// if(pool->offset == SCCL_PROXY_CONN_POOL_SIZE) {
// scclCHECK(scclRealloc(&pool->pools, pool->banks, pool->banks + 1));
// scclCHECK(scclCalloc(pool->pools + pool->banks, SCCL_PROXY_CONN_POOL_SIZE));
// pool->banks++;
// pool->offset = 0;
// }
// *id = ((pool->banks - 1) << SCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset;
// pool->offset++;
// return scclSuccess;
// }
// static scclResult_t scclProxyGetConnection(struct scclProxyConnectionPool* pool, int id, struct scclProxyConnection** conn) {
// int bank = id >> SCCL_PROXY_CONN_POOL_SIZE_POW2;
// int offset = id & SCCL_PROXY_CONN_POOL_MASK;
// if((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL))
// return scclInternalError;
// *conn = pool->pools[bank] + offset;
// return scclSuccess;
// }
// static scclResult_t proxyFree(struct scclProxyConnection* connection, struct scclProxyState* proxyState) {
// if(connection->send) {
// if(scclTransports[connection->transport]->send.proxyFree) {
// scclCHECK(scclTransports[connection->transport]->send.proxyFree(connection, proxyState));
// }
// } else {
// if(scclTransports[connection->transport]->recv.proxyFree) {
// scclCHECK(scclTransports[connection->transport]->recv.proxyFree(connection, proxyState));
// }
// }
// return scclSuccess;
// }
// static scclResult_t scclProxyFreeConnections(struct scclProxyConnectionPool* pool, struct scclProxyState* proxyState) {
// for(int b = 0; b < pool->banks; b++) {
// int max = b == pool->banks - 1 ? pool->offset : SCCL_PROXY_CONN_POOL_SIZE;
// for(int i = 0; i < max; i++) {
// scclProxyConnection* connection = pool->pools[b] + i;
// if(connection->state != connUninitialized) {
// scclCHECK(proxyFree(connection, proxyState));
// }
// }
// free(pool->pools[b]);
// }
// free(pool->pools);
// return scclSuccess;
// }
// #include "transport.h"
// struct scclProxyInitReq {
// int transport;
// int send;
// int tpLocalRank;
// int tpRank;
// int sameProcess;
// };
// struct scclProxyInitResp {
// scclProxyConnection* connection;
// char devShmPath[6]; // "XXXXXX" - May or may not be set
// };
// scclResult_t scclProxyConnect(struct scclComm* comm, int transport, int send, int tpProxyRank, struct scclProxyConnector* proxyConn) {
// struct scclSocket* sock;
// int ready, proxyRank = -1;
// struct scclProxyState* sharedProxyState = comm->proxyState;
// // Keep one connection per local rank
// for(int i = 0; i < comm->localRanks; ++i) {
// /* find the proxy rank in comm. */
// if(comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
// proxyRank = comm->localRankToRank[i];
// break;
// }
// }
// proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
// // Keep one connection per local rank
// proxyConn->connection = NULL;
// proxyConn->tpRank = tpProxyRank;
// if(sharedProxyState->peerSocks == NULL) {
// scclCHECK(scclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
// scclCHECK(scclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
// scclCHECK(scclCalloc(&sharedProxyState->sharedDevMems, comm->sharedRes->tpNLocalRanks));
// for(int i = 0; i < comm->sharedRes->tpNLocalRanks; ++i) {
// scclCHECK(scclSocketSetFd(-1, &sharedProxyState->peerSocks[i]));
// }
// }
// proxyConn->tpLocalRank = comm->sharedRes->tpRankToLocalRank[proxyConn->tpRank];
// sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// scclCHECK(scclSocketReady(sock, &ready));
// if(!ready) {
// scclCHECK(scclSocketInit(sock, sharedProxyState->peerAddresses + proxyConn->tpRank, comm->sharedRes->magic, scclSocketTypeProxy, comm->abortFlag));
// scclCHECK(scclSocketConnect(sock));
// }
// struct scclProxyInitReq req = {0};
// req.transport = transport;
// req.send = send;
// req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
// req.tpRank = comm->topParentRanks[comm->rank];
// req.sameProcess = proxyConn->sameProcess;
// struct scclProxyInitResp resp = {0};
// // This usually sends proxyConn->connection to identify which connection this is.
// // However, this is part of the response and therefore is ignored
// scclCHECK(scclProxyCallBlocking(comm, proxyConn, scclProxyMsgInit, &req, sizeof(req), &resp, sizeof(resp)));
// proxyConn->connection = resp.connection;
// // If we need proxy progress, map progress ops
// struct scclTransportComm* tcomm = send ? &scclTransports[transport]->send : &scclTransports[transport]->recv;
// if(tcomm->proxyProgress) {
// char poolPath[] = "/dev/shm/sccl-XXXXXX";
// strncpy(poolPath + sizeof("/dev/shm/sccl-") - 1, resp.devShmPath, sizeof("XXXXXX") - 1);
// struct scclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank;
// if(proxyOps->pool == NULL) {
// scclCHECK(scclShmOpen(poolPath, sizeof(struct scclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0, &proxyOps->handle));
// proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
// }
// }
// INFO(SCCL_NET | SCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
// return scclSuccess;
// }
// // cuMem API support
// // The response is sent out-of-band using scclIpcSocket for this specific command
// /**
// * 通过代理连接将文件描述符转换为跨进程可用的描述符
// *
// * @param comm sccl通信器
// * @param proxyConn 代理连接器
// * @param fd 待转换的文件描述符
// * @param convertedFd 输出参数,存储转换后的文件描述符
// * @return 操作结果(scclSuccess表示成功)
// *
// * 该函数会阻塞直到转换完成或失败。首先创建UDS socket接收转换后的fd,
// * 然后通过代理请求转换,最后轮询代理响应直到操作完成。
// * 出错时会关闭socket并返回错误信息。
// */
// scclResult_t scclProxyClientConvertFdBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int fd, int* convertedFd) {
// scclResult_t ret = scclSuccess;
// scclResult_t res = scclInProgress;
// struct scclIpcSocket ipcSock = {0};
// void* opId = malloc(1);
// // Create a UDS socket to receive the converted fd
// scclCHECK(scclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
// // Request the conversion of the fd over sockets
// scclCHECKGOTO(scclProxyCallAsync(comm, proxyConn, scclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error);
// // Receive converted fd over UDS
// scclCHECK(scclIpcSocketRecvFd(&ipcSock, convertedFd));
// TRACE(SCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd);
// scclCHECK(scclIpcSocketClose(&ipcSock));
// while(res == scclInProgress) {
// res = scclPollProxyResponse(comm, proxyConn, NULL, opId);
// }
// free(opId);
// return res;
// error:
// scclCHECK(scclIpcSocketClose(&ipcSock));
// WARN("scclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank);
// return ret;
// }
// const char* scclProxyMsgTypeStr[] = {"Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd"};
// scclResult_t scclProxyCallAsync(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId)
// {
// struct scclSocket* sock;
// scclResult_t ret = scclSuccess;
// struct scclProxyState* sharedProxyState = comm->proxyState;
// if(sharedProxyState->peerSocks == NULL)
// return scclInternalError;
// sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// if(sock == NULL)
// return scclInternalError;
// scclCHECKGOTO(scclSocketSend(sock, &type, sizeof(int)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
// scclCHECKGOTO(scclSocketSend(sock, &respSize, sizeof(int)), ret, error);
// if(reqSize)
// scclCHECKGOTO(scclSocketSend(sock, reqBuff, reqSize), ret, error);
// // Send opId to proxy
// scclCHECKGOTO(scclSocketSend(sock, &opId, sizeof(opId)), ret, error);
// // Add proxyOp to expected response queue
// scclCHECK(expectedProxyResponseEnqueue(sharedProxyState, opId, respSize));
// return scclSuccess;
// error:
// return ret;
// }
// scclResult_t scclPollProxyResponse(struct scclComm* comm, struct scclProxyConnector* proxyConn, void* respBuff, void* opId) {
// struct scclProxyState* sharedProxyState = comm->proxyState;
// // Receive the connection pointer from the Proxy
// if(*comm->abortFlag) {
// WARN("Comm %p is in abort state", comm);
// return scclInternalError;
// }
// if(sharedProxyState->peerSocks == NULL)
// return scclInternalError;
// // Check response queue
// int found = 0;
// scclCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found));
// if(found == 0) {
// // Attempt to read in a new response header from the proxy thread
// struct scclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
// void* recvOpId;
// int offset = 0;
// if(scclSuccess != scclSocketProgress(SCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
// WARN("Socket recv failed while polling for opId=%p", opId);
// return scclInternalError;
// }
// if(offset == 0) {
// return scclInProgress;
// // If we've returned a partial response, block to receive the rest of it
// } else if(offset < sizeof(recvOpId)) {
// while(offset < sizeof(recvOpId))
// scclCHECK(scclSocketProgress(SCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
// }
// INFO(SCCL_PROXY, "scclPollProxyResponse Received new opId=%p", recvOpId);
// // Now do a blocking recv of the response size
// int respSize = 0;
// scclCHECK(scclSocketRecv(sock, &respSize, sizeof(respSize)));
// // If there's a respSize to recv
// if(respSize > 0) {
// if(recvOpId != opId) {
// // Unexpected response, need to buffer the socket data
// respBuff = malloc(respSize);
// }
// assert(respBuff != NULL);
// scclCHECK(scclSocketRecv(sock, respBuff, respSize));
// }
// if(recvOpId == opId) {
// INFO(SCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
// scclCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId));
// return scclSuccess;
// } else {
// INFO(SCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize);
// // Store the result and mark response as completed
// scclCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize));
// return scclInProgress;
// }
// } else {
// INFO(SCCL_PROXY, "scclPollProxyResponse Dequeued cached opId=%p", opId);
// }
// return scclSuccess;
// }
// scclResult_t
// scclProxyCallBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
// // Alloc some memory to act as a handle
// scclResult_t res = scclSuccess;
// void* opId = malloc(1);
// scclCHECKGOTO(scclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail);
// do {
// res = scclPollProxyResponse(comm, proxyConn, respBuff, opId);
// } while(res == scclInProgress);
// exit:
// free(opId);
// return res;
// fail:
// goto exit;
// }
// static scclResult_t proxyProgressInit(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(state->opsPool == NULL) {
// int size = sizeof(struct scclProxyOpsPool);
// struct scclProxyOpsPool* pool = NULL;
// char shmPath[sizeof("/dev/shm/sccl-XXXXXX")];
// shmPath[0] = '\0';
// scclCHECK(scclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks + 1, &state->handle));
// // Init pool
// pool->nextOps = -1;
// for(int r = 0; r < proxyState->tpLocalnRanks; r++) {
// pool->freeOps[r] = r * MAX_OPS_PER_PEER;
// for(int i = 0; i < MAX_OPS_PER_PEER - 1; i++)
// pool->ops[r * MAX_OPS_PER_PEER + i].next = r * MAX_OPS_PER_PEER + i + 1;
// pool->ops[(r + 1) * MAX_OPS_PER_PEER - 1].next = -1;
// }
// // Setup mutex/cond to work inter-process
// pthread_mutexattr_t mutexAttr;
// pthread_mutexattr_init(&mutexAttr);
// pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
// pthread_mutex_init(&pool->mutex, &mutexAttr);
// pthread_condattr_t condAttr;
// pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
// pthread_cond_init(&pool->cond, &condAttr);
// state->opsPool = pool;
// memcpy(state->opsPoolShmSuffix, shmPath + sizeof("/dev/shm/sccl-") - 1, sizeof("XXXXXX") - 1);
// // All ops structures are created, we can start the progress thread
// scclCHECK(scclProxyProgressCreate(proxyState));
// }
// return scclSuccess;
// }
// static void proxyOpsFree(struct scclProxyState* proxyState) {
// struct scclProxyProgressState* state = &proxyState->progressState;
// if(scclShmClose(state->handle) != scclSuccess) {
// WARN("[Service thread] shm close failed");
// }
// }
// scclResult_t scclProxyShmUnlink(struct scclComm* comm) {
// struct scclProxyProgressState* state = &comm->proxyState->progressState;
// if(state->opsPool == NULL)
// return scclSuccess;
// if(scclShmUnlink(state->handle) != scclSuccess) {
// WARN("[Service thread] proxy ops shm unlink failed");
// }
// return scclSuccess;
// }
// static scclResult_t proxyConnInit(struct scclProxyLocalPeer* peer,
// struct scclProxyConnectionPool* connectionPool,
// struct scclProxyState* proxyState,
// scclProxyInitReq* req,
// scclProxyInitResp* resp,
// struct scclProxyConnection** connection) {
// int id;
// scclCHECK(scclProxyNewConnection(connectionPool, &id));
// scclCHECK(scclProxyGetConnection(connectionPool, id, connection));
// (*connection)->sock = &peer->sock;
// (*connection)->transport = req->transport;
// (*connection)->send = req->send;
// (*connection)->tpLocalRank = req->tpLocalRank;
// (*connection)->sameProcess = req->sameProcess;
// peer->tpLocalRank = req->tpLocalRank;
// peer->tpRank = req->tpRank;
// resp->connection = *connection;
// (*connection)->tcomm = (*connection)->send ? &scclTransports[(*connection)->transport]->send : &scclTransports[(*connection)->transport]->recv;
// // If we need proxy progress, let's allocate ops and start the thread
// if((*connection)->tcomm->proxyProgress) {
// scclCHECK(proxyProgressInit(proxyState));
// struct scclProxyProgressState* state = &proxyState->progressState;
// strncpy(resp->devShmPath, state->opsPoolShmSuffix, sizeof(resp->devShmPath));
// }
// INFO(SCCL_NET | SCCL_PROXY,
// "New proxy %s connection %d from local rank %d, transport %d",
// (*connection)->send ? "send" : "recv",
// id,
// (*connection)->tpLocalRank,
// (*connection)->transport);
// __atomic_store_n(&(*connection)->state, connInitialized, __ATOMIC_RELEASE);
// return scclSuccess;
// }
// // cuMem API support
// static scclResult_t proxyConvertFd(struct scclProxyLocalPeer* peer, void* opId, struct scclProxyState* proxyState, int fd) {
// struct scclIpcSocket ipcSock = {0};
// uint64_t hash = (uint64_t)opId;
// INFO(SCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash);
// // Send back the converted fd using UDS
// scclCHECK(scclIpcSocketInit(&ipcSock, proxyState->tpRank, hash ^ 1, proxyState->abortFlag));
// scclCHECK(scclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash));
// scclCHECK(scclIpcSocketClose(&ipcSock));
// return scclSuccess;
// }
// static scclResult_t proxyProgressAsync(struct scclProxyAsyncOp* op,
// struct scclProxyState* proxyState,
// int* asyncOpCount,
// struct scclProxyLocalPeer* peer,
// struct scclProxyConnectionPool* connectionPool) {
// int done = 1;
// if(op->type == scclProxyMsgSetup) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
// scclCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
// } else if(op->type == scclProxyMsgConnect) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
// scclCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
// } else if(op->type == scclProxyMsgSharedInit) {
// int nChannels = (int)*op->reqBuff;
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
// if(op->connection->tcomm->proxySharedInit)
// scclCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels));
// __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
// } else if(op->type == scclProxyMsgConvertFd) {
// int fd = *(int*)op->reqBuff;
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd);
// scclCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support
// } else if(op->type == scclProxyMsgInit) {
// TRACE(SCCL_PROXY, "proxyProgressAsync::scclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
// scclCHECK(proxyConnInit(peer, connectionPool, proxyState, (scclProxyInitReq*)op->reqBuff, (scclProxyInitResp*)op->respBuff, &op->connection));
// } else
// return scclInternalError;
// if(done) {
// INFO(SCCL_PROXY, "proxyProgressAsync opId=%p op.type=%d op.reqBuff=%p op.respSize=%d done", op->opId, op->type, op->reqBuff, op->respSize);
// if(op->type == scclProxyMsgSetup)
// __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
// else if(op->type == scclProxyMsgConnect)
// __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
// /* if setup or connect is done, we should not return any error at this point since
// * scclSocketSend might already send the respBuff to the requester. If we still choose
// * to abort and close the connection, it can cause segfault if the requester is using
// * the respBuff. */
// // Send the opId for referencing async operation
// scclCHECK(scclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
// // Send the response size
// scclCHECK(scclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
// if(op->respSize) {
// // Send the response
// scclCHECK(scclSocketSend(op->connection->sock, op->respBuff, op->respSize));
// }
// asyncProxyOpDequeue(peer, op);
// (*asyncOpCount)--;
// return scclSuccess;
// } else if(*proxyState->abortFlag != 0) {
// return scclInternalError;
// }
// return scclInProgress;
// }
// static scclResult_t proxyServiceInitOp(
// int type, struct scclProxyLocalPeer* peer, struct scclProxyConnectionPool* connectionPool, struct scclProxyState* proxyState, int* asyncOpCount) {
// struct scclSocket* sock = &peer->sock;
// struct scclProxyAsyncOp* asyncOp;
// scclCHECK(scclCalloc(&asyncOp, 1));
// asyncOp->type = type;
// scclCHECK(scclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
// scclCHECK(scclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
// scclCHECK(scclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
// if(asyncOp->reqSize) {
// scclCHECK(scclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
// scclCHECK(scclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
// }
// // Store opId for completion response
// scclCHECK(scclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
// if(asyncOp->respSize)
// scclCHECK(scclCalloc(&asyncOp->respBuff, asyncOp->respSize));
// asyncProxyOpEnqueue(peer, asyncOp);
// (*asyncOpCount)++;
// scclCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
// return scclSuccess;
// }
// #include <poll.h>
// static bool proxyMatchOpType(int type) {
// switch(type) {
// case scclProxyMsgInit:
// case scclProxyMsgSharedInit:
// case scclProxyMsgSetup:
// case scclProxyMsgConnect:
// case scclProxyMsgConvertFd: return true;
// default: return false;
// }
// }
// void* scclProxyService(void* _args) {
// struct scclProxyState* proxyState = (struct scclProxyState*)_args;
// // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// if(setProxyThreadContext(proxyState)) {
// INFO(SCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev);
// } else if(cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
// WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev);
// }
// // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
// // Prepare poll descriptor
// struct scclProxyConnectionPool connectionPool;
// connectionPool.pools = NULL;
// connectionPool.banks = 0;
// connectionPool.offset = SCCL_PROXY_CONN_POOL_SIZE;
// struct pollfd pollfds[SCCL_MAX_LOCAL_RANKS + 1];
// struct scclProxyLocalPeer peers[SCCL_MAX_LOCAL_RANKS];
// memset(&peers, 0, sizeof(struct scclProxyLocalPeer) * SCCL_MAX_LOCAL_RANKS);
// for(int s = 0; s < SCCL_MAX_LOCAL_RANKS; s++) {
// pollfds[s].fd = -1;
// pollfds[s].events = POLLHUP | POLLIN;
// }
// if(scclSocketGetFd(proxyState->listenSock, &pollfds[SCCL_MAX_LOCAL_RANKS].fd) != scclSuccess) {
// WARN("[Proxy Service] Get listenSock fd fails");
// return NULL;
// };
// pollfds[SCCL_MAX_LOCAL_RANKS].events = POLLIN;
// int maxnpeers = 0;
// int npeers = 0;
// int stop = 0;
// int asyncOpCount = 0;
// while(stop == 0 || (stop == 1 && npeers > 0)) {
// /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
// * connections. Need to wait until all other related comms call abort and safely exit
// * together, or we could face segmentation fault. */
// if(*proxyState->abortFlag != 0)
// stop = 1;
// /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
// int ret;
// do {
// ret = poll(pollfds, SCCL_MAX_LOCAL_RANKS + 1, asyncOpCount ? 0 : 500);
// } while(ret < 0 && errno == EINTR);
// if(ret < 0) {
// WARN("[Proxy Service] Poll failed: %s", strerror(errno));
// return NULL;
// }
// if(pollfds[SCCL_MAX_LOCAL_RANKS].revents) {
// int s = 0;
// while(s < SCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0)
// s++;
// if(s == SCCL_MAX_LOCAL_RANKS) {
// WARN("[Proxy service] Too many connections (%d max)", SCCL_MAX_LOCAL_RANKS);
// return NULL;
// }
// if(maxnpeers < s + 1)
// maxnpeers = s + 1;
// if(scclSocketInit(&peers[s].sock) != scclSuccess) {
// WARN("[Service thread] Initialize peers[%d].sock fails", s);
// return NULL;
// }
// if(scclSocketAccept(&peers[s].sock, proxyState->listenSock) != scclSuccess) {
// WARN("[Service thread] Accept failed %s", strerror(errno));
// } else {
// if(scclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != scclSuccess) {
// WARN("[Service thread] Get peers[%d].sock fd fails", s);
// return NULL;
// }
// npeers++;
// peers[s].tpLocalRank = -1;
// }
// }
// for(int s = 0; s < maxnpeers; s++) {
// struct scclProxyLocalPeer* peer = peers + s;
// struct scclSocket* sock = &peer->sock;
// int closeConn = 0;
// int type = 0;
// scclResult_t res = scclSuccess;
// if(pollfds[s].fd == -1)
// continue;
// // Progress all ops for this scclProxyLocalPeer
// scclProxyAsyncOp* op = peer->asyncOps;
// while(op != nullptr) {
// scclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
// type = op->type;
// res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
// if(res == scclSuccess || res == scclInProgress) {
// op = opnext;
// } else {
// // Res is a bad result
// closeConn = 1;
// WARN("[Service thread] Error encountered progressing operation=%s, res=%d, closing connection", scclProxyMsgTypeStr[type], res);
// break;
// }
// }
// // Check for additional ops coming in
// if(pollfds[s].revents & POLLIN) {
// int closed;
// res = scclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
// if(res != scclSuccess && res != scclInProgress) {
// WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
// closeConn = 1;
// } else if(closed) {
// INFO(SCCL_INIT | SCCL_NET | SCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
// closeConn = 1;
// } else if(res == scclSuccess) { // We received something from the sock
// if(type == scclProxyMsgStop) {
// stop = 1;
// closeConn = 1;
// } else if(type == scclProxyMsgClose) {
// closeConn = 1;
// } else if(proxyMatchOpType(type)) {
// res = proxyServiceInitOp(type, peers + s, &connectionPool, proxyState, &asyncOpCount);
// } else {
// WARN("[Service thread] Unknown command %d from localRank %d", type, peer->tpLocalRank);
// closeConn = 1;
// }
// INFO(SCCL_PROXY, "Received and initiated operation=%s res=%d", scclProxyMsgTypeStr[type], res);
// }
// } else if(pollfds[s].revents & POLLHUP) {
// closeConn = 1;
// }
// if(res != scclSuccess && res != scclInProgress) {
// WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d",
// proxyState->tpRank,
// scclProxyMsgTypeStr[type],
// peer->tpRank,
// res);
// closeConn = 1;
// }
// if(closeConn) {
// scclSocketClose(sock);
// if(op != nullptr) {
// asyncProxyOpDequeue(peer, op);
// asyncOpCount--;
// }
// pollfds[s].fd = -1;
// npeers--;
// }
// }
// }
// // Wait for all operations to complete and stop progress thread before freeing any resource
// if(scclProxyProgressDestroy(proxyState) != scclSuccess) {
// WARN("[Proxy Service] proxyDestroy failed");
// }
// for(int s = 0; s < maxnpeers; s++) {
// scclSocketClose(&peers[s].sock);
// }
// scclProxyFreeConnections(&connectionPool, proxyState);
// scclSocketClose(proxyState->listenSock);
// free(proxyState->listenSock);
// proxyOpsFree(proxyState);
// return NULL;
// }
// scclResult_t scclProxyInit(struct scclComm* comm, struct scclSocket* sock, union scclSocketAddress* peerAddresses) {
// assert(comm->sharedRes->proxyState == NULL);
// scclCHECK(scclCalloc(&comm->sharedRes->proxyState, 1));
// comm->proxyState = comm->sharedRes->proxyState;
// comm->proxyState->refCount = 1;
// comm->proxyState->listenSock = sock;
// comm->proxyState->peerAddresses = peerAddresses;
// return scclSuccess;
// }
// scclResult_t scclProxyCreate(struct scclComm* comm) {
// /* proxyState is shared among parent comm and split comms. comm->proxyState->thread is
// * pthread_join()'d by commFree() in init.cc when the refCount reduces down to 0. */
// struct scclProxyState* proxyState = comm->proxyState;
// if(proxyState->refCount == 1) {
// /* we have to make sure all following fields in comm have been initialized. */
// proxyState->tpRank = comm->rank;
// proxyState->tpnRanks = comm->nRanks;
// proxyState->tpLocalnRanks = comm->localRanks;
// proxyState->cudaDev = comm->cudaDev;
// proxyState->abortFlag = comm->abortFlag;
// proxyState->p2pnChannels = comm->p2pnChannels;
// proxyState->p2pChunkSize = comm->p2pChunkSize;
// proxyState->nChannels = comm->nChannels;
// proxyState->allocP2pNetLLBuffers = comm->allocP2pNetLLBuffers;
// proxyState->dmaBufSupport = comm->dmaBufSupport;
// proxyState->scclNet = comm->scclNet;
// proxyState->scclCollNet = comm->scclCollNet;
// memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
// pthread_create(&comm->proxyState->thread, NULL, scclProxyService, comm->proxyState);
// scclSetThreadName(comm->proxyState->thread, "sccl Service %2d", comm->cudaDev);
// }
// return scclSuccess;
// }
// scclResult_t scclProxyStop(struct scclComm* comm) {
// if(comm->sharedRes && comm->sharedRes->proxyState) {
// struct scclProxyState* sharedProxyState = comm->sharedRes->proxyState;
// if((comm->proxyRefCountOld = scclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
// if(sharedProxyState->peerAddresses) {
// if(*comm->abortFlag == 0) {
// struct scclSocket sock;
// int type = scclProxyMsgStop;
// scclCHECK(scclSocketInit(&sock,
// sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank],
// comm->sharedRes->magic,
// scclSocketTypeProxy,
// comm->abortFlag));
// scclCHECK(scclSocketConnect(&sock));
// scclCHECK(scclSocketSend(&sock, &type, sizeof(int)));
// scclCHECK(scclSocketClose(&sock));
// }
// }
// if(sharedProxyState->peerSocks) {
// int tplocalRanks = comm->sharedRes->tpNLocalRanks;
// for(int i = 0; i < tplocalRanks; i++) {
// int fd;
// scclCHECK(scclSocketGetFd(sharedProxyState->peerSocks + i, &fd));
// if(fd >= 0) {
// if(sharedProxyState->proxyOps[i].pool) {
// scclCHECK(scclShmClose(sharedProxyState->proxyOps[i].handle));
// }
// if(sharedProxyState->sharedDevMems[i]) {
// if(!scclCuMemEnable()) {
// CUDACHECK(cudaIpcCloseMemHandle(sharedProxyState->sharedDevMems[i]));
// }
// }
// int type = scclProxyMsgClose;
// if(*comm->abortFlag == 0)
// scclCHECK(scclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
// scclCHECK(scclSocketClose(sharedProxyState->peerSocks + i));
// }
// }
// }
// }
// }
// return scclSuccess;
// }
// scclResult_t scclProxyDestroy(struct scclComm* comm) {
// struct scclProxyState* sharedProxyState = comm->sharedRes->proxyState;
// assert(sharedProxyState->refCount == 0);
// free(sharedProxyState->peerAddresses);
// free(sharedProxyState->peerSocks);
// free(sharedProxyState->proxyOps);
// free(sharedProxyState->sharedDevMems);
// expectedProxyResponseFree(sharedProxyState);
// free(sharedProxyState);
// return scclSuccess;
// }
#pragma once
#include <pthread.h>
#include "socket.h"
#include "ipcsocket.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Convenience aliases for the socket types provided by the net::host layer.
typedef net::host::scclSocketAddress scclSocketAddress_t;
typedef net::host::scclSocket scclSocket_t;
// Maximum number of sub-operations per proxy op (one per channel).
#define SCCL_PROXY_MAX_SUBS MAXCHANNELS
// Number of scclProxyArgs elements allocated per pool chunk.
#define PROXYARGS_ALLOCATE_SIZE SCCL_MAX_OPS
// Lifecycle states of a proxy connection; connections advance through these
// states in order during setup (see the connection state machine in proxy.cc).
enum proxyConnectState : uint8_t {
connUninitialized = 0,     // No initialization performed yet
connInitialized = 1,       // Connection object created and registered
connSharedInitialized = 2, // Shared per-transport resources initialized
connSetupDone = 3,         // Transport-level setup completed
connConnected = 4,         // Fully connected and ready for use
numConnStates = 5          // Count of states (not a real state)
};
// FIFO node for responses the main thread expects back from the proxy.
struct scclExpectedProxyResponse {
void* opId; // Opaque operation id identifying the async request
int respSize; // Size in bytes of the response payload
bool done; // Whether the response has been received and is ready to consume
void* respBuff; // Buffer holding the received response data
struct scclExpectedProxyResponse* next; // Next expected response (singly linked list)
};
// Per-channel sub-operation state tracked by the proxy progress engine.
struct scclProxySubArgs {
int channelId; // Channel id this sub-operation runs on
int nsteps; // Total number of steps for the operation
ssize_t nbytes; // Number of data bytes
int peer; // Peer rank id
int groupSize; // Sub-op group size — NOTE(review): original comment was empty; confirm semantics
uint64_t base; // Base step count
uint64_t posted; // Count of steps posted
uint64_t received; // Count of steps received
uint64_t flushed; // Count of steps flushed
uint64_t transmitted; // Count of steps transmitted
uint64_t done; // Count of steps completed
uint64_t end; // Final step count (completion target)
void* requests[SCCL_STEPS]; // Outstanding request pointer for each in-flight step
};
// Arguments describing one proxy operation, progressed by the proxy thread.
struct scclProxyArgs {
struct scclProxySubArgs subs[SCCL_PROXY_MAX_SUBS]; // Per-channel sub-operation array
int nsubs; // Number of active sub-operations in subs[]
int done; // Completion flag for the whole operation
uint64_t opCount; // Operation counter
int sliceSteps; // Number of steps per slice
int chunkSteps; // Number of steps per chunk
int chunkSize; // Chunk size in bytes
scclDataType_t dtype; // Data type of the operation
scclProtocolType_t protocol; // Protocol type used by this operation
int state; // Current progress state
char* sharedBuff[SCCL_STEPS]; // Shared buffer pointer for each step
int sharedSize[SCCL_STEPS]; // Shared buffer size for each step
int idle; // Idle flag (no progress made this pass)
// Element links
struct scclProxyArgs* next; // Next element in the active/free list
struct scclProxyArgs* nextPeer; // Next element queued for the same peer
struct scclProxyArgs** proxyAppendPtr; // Location where the next element for this connection is appended
};
// One chunk of pre-allocated scclProxyArgs elements; chunks form a linked list.
struct scclProxyPool {
struct scclProxyPool* next; // Next pool chunk in the list
struct scclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; // Pre-allocated proxy-args elements
};
// State shared between the main thread and the proxy progress thread.
struct scclProxyProgressState {
// Used by the main thread to hand work to the progress thread
// struct scclProxyOpsPool* opsPool;
// scclShmHandle_t handle;
char opsPoolShmSuffix[6]; // Suffix of the ops-pool shared-memory name
pthread_t thread; // Progress thread id
bool stop; // Stop flag used to request thread shutdown
// struct scclProxyPeer** localPeers;
// struct scclSharedNetComms* netComms[SCCL_MAX_NETDEVS];
struct scclProxyArgs* active; // List of currently active proxy args
struct scclProxyArgs* pool; // Free list of proxy args
struct scclProxyPool* pools; // Allocated pool chunks (for freeing)
int nextOps; // Index of the next operation to pick up
};
// struct scclProxyOp {
// struct scclProxyConnection* connection;
// int channelId;
// int nsteps;
// ssize_t nbytes;
// struct {
// int root : 30;
// uint32_t connIndex : 2;
// };
// int next;
// uint64_t opCount;
// int sliceSteps;
// int chunkSteps;
// int chunkSize;
// uint8_t /*scclDataType_t*/ dtype;
// uint8_t /*scclDevRedOp_t*/ redOp;
// uint8_t /*scclPattern_t*/ pattern;
// uint8_t protocol;
// union {
// uint64_t unused;
// // For use by enqueue.cc
// struct scclProxyOp* enqNext;
// };
// };
// struct scclProxyOpsPool {
// struct scclProxyOp ops[MAX_OPS_PER_PEER * SCCL_MAX_LOCAL_RANKS];
// volatile int nextOps;
// volatile int nextOpsEnd;
// volatile int freeOps[SCCL_MAX_LOCAL_RANKS];
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
////////////////////////////////////////////////////////////////////////////////////////////////
// scclResult_t scclProxyInit(struct scclComm* comm, scclSocket_t* sock, union scclSocketAddress* peerAddresses);
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
// enum scclProxyOpState {
// scclProxyOpNone,
// scclProxyOpReady,
// scclProxyOpProgress
// };
// enum {
// proxyRecv = 0,
// proxySend = 1
// };
// struct scclProxyArgs;
// typedef scclResult_t (*proxyProgressFunc_t)(struct scclProxyState*, struct scclProxyArgs*);
// static_assert(SCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
// struct scclProxyOp {
// struct scclProxyConnection* connection;
// int channelId;
// int nsteps;
// ssize_t nbytes;
// struct {
// int root : 30;
// uint32_t connIndex : 2;
// };
// int next;
// uint64_t opCount;
// int sliceSteps;
// int chunkSteps;
// int chunkSize;
// uint8_t /*scclDataType_t*/
// dtype;
// uint8_t /*scclDevRedOp_t*/ redOp;
// uint8_t /*scclPattern_t*/ pattern;
// uint8_t protocol;
// union {
// uint64_t unused;
// // For use by enqueue.cc
// struct scclProxyOp* enqNext;
// };
// }
// ;
// static_assert(sizeof(struct scclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
// #define SCCL_MAX_NETDEVS 128
// // ProxyOps are used to communicate between main thread and service thread
// // Make sure we have enough to store two full rounds of operations on all channels.
// // Otherwise we'd be unable to post half of them to free new elements.
// #define MAX_OPS_PER_PEER (2 * MAXCHANNELS * SCCL_MAX_WORK_ELEMENTS_P2P)
// #define SCCL_MAX_LOCAL_RANKS 64
// struct scclProxyOpsPool {
// struct scclProxyOp ops[MAX_OPS_PER_PEER * SCCL_MAX_LOCAL_RANKS];
// volatile int nextOps;
// volatile int nextOpsEnd;
// volatile int freeOps[SCCL_MAX_LOCAL_RANKS];
// pthread_mutex_t mutex;
// pthread_cond_t cond;
// };
// struct scclProxyOps {
// scclProxyOpsPool* pool;
// scclShmHandle_t handle;
// int count;
// int freeOp;
// int nextOps;
// int nextOpsEnd;
// };
// struct scclProxySharedP2p {
// int refcount;
// size_t size;
// char* cudaBuff;
// char* hostBuff;
// // CUDA IPC
// scclIpcDesc ipcDesc;
// struct scclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
// };
// struct scclProxyPeer {
// struct scclProxySharedP2p send;
// struct scclProxySharedP2p recv;
// };
// struct scclSharedNetComms {
// void* sendComm[MAXCHANNELS];
// void* recvComm[MAXCHANNELS];
// int sendRefCount[MAXCHANNELS];
// int recvRefCount[MAXCHANNELS];
// };
// struct scclProxyPool;
// struct scclProxyProgressState {
// // Used by main threads to send work to progress thread
// struct scclProxyOpsPool* opsPool;
// scclShmHandle_t handle;
// char opsPoolShmSuffix[6];
// pthread_t thread;
// bool stop;
// struct scclProxyPeer** localPeers;
// struct scclSharedNetComms* netComms[SCCL_MAX_NETDEVS];
// struct scclProxyArgs* active;
// struct scclProxyArgs* pool;
// struct scclProxyPool* pools;
// int nextOps;
// };
// struct scclProxyAsyncOp {
// int type;
// struct scclProxyConnection* connection;
// int reqSize, respSize;
// char *reqBuff, *respBuff;
// void* opId;
// scclProxyAsyncOp* next;
// };
// struct scclProxyLocalPeer {
// struct scclSocket sock;
// int tpRank;
// int tpLocalRank;
// scclProxyAsyncOp* asyncOps;
// int asyncOpCounter;
// };
// struct scclProxyState {
// int refCount;
// int tpRank;
// int tpnRanks;
// int tpLocalnRanks;
// int cudaDev;
// int p2pnChannels;
// int p2pChunkSize;
// int nChannels;
// int buffSizes[SCCL_NUM_PROTOCOLS];
// bool allocP2pNetLLBuffers;
// bool dmaBufSupport;
// scclNet_t* scclNet;
// scclCollNet_t* scclCollNet;
// volatile uint32_t* abortFlag;
// // Service thread
// pthread_t thread;
// struct scclSocket* listenSock;
// int stop;
// CUcontext cudaCtx;
// // Used by main thread
// union scclSocketAddress* peerAddresses;
// struct scclSocket* peerSocks;
// struct scclProxyOps* proxyOps;
// void** sharedDevMems;
// struct scclIpcSocket peerIpcSock; // cuMEM API support (UDS)
// // Progress thread
// struct scclProxyProgressState progressState;
// // Queue of expected responses from the proxy
// struct scclExpectedProxyResponse* expectedResponses;
// };
// enum proxyConnectState {
// connUninitialized = 0,
// connInitialized = 1,
// connSharedInitialized = 2,
// connSetupDone = 3,
// connConnected = 4,
// numConnStates = 5
// };
// struct scclProxyConnection {
// int send, transport, shared;
// int tpLocalRank, sameProcess;
// struct scclSocket* sock;
// struct scclTransportComm* tcomm;
// struct scclProxyArgs* proxyAppend;
// struct scclProxyArgs** proxyAppendPtr;
// void* transportResources;
// proxyConnectState state;
// struct scclCollNetSharedRes* collNet;
// };
// typedef scclResult_t (*threadFunc_t)(struct scclProxyArgs*);
// enum proxyMode {
// proxyRing = 0,
// proxyFrom = 1,
// proxyTo = 2
// };
// scclResult_t scclProxySaveOp(struct scclComm* comm, struct scclProxyOp* proxyOp, bool* justInquire);
// scclResult_t scclProxyComputeP2p(struct scclInfo* info, struct scclProxyOp* proxyOp);
// scclResult_t scclProxyStart(struct scclComm* comm);
// scclResult_t scclProxyCreate(struct scclComm* comm);
// scclResult_t scclProxyConnect(struct scclComm* comm, int transport, int send, int proxyRank, struct scclProxyConnector* proxyConn);
// enum scclProxyMsgType {
// scclProxyMsgInit = 1,
// scclProxyMsgSharedInit = 2,
// scclProxyMsgSetup = 3,
// scclProxyMsgConnect = 4,
// scclProxyMsgStart = 5,
// scclProxyMsgClose = 6,
// scclProxyMsgAbort = 7,
// scclProxyMsgStop = 8,
// scclProxyMsgConvertFd = 9, // cuMem API support (UDS)
// };
// // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
// // Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
// // scclPollProxyResponse(), supplying the same opId to confirm the operation has completed
// scclResult_t scclProxyCallAsync(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
// // This function will internally call scclProxyCallAsync() and spin until scclPollProxyResponse() confirms the result is received
// scclResult_t
// scclProxyCallBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
// scclResult_t scclPollProxyResponse(struct scclComm* comm, struct scclProxyConnector* proxyConn, void* respBuff, void* opId);
// scclResult_t scclProxyClientConvertFdBlocking(struct scclComm* comm, struct scclProxyConnector* proxyConn, int fd, int* convertedFd);
// scclResult_t scclProxyStop(struct scclComm* comm);
// scclResult_t scclProxyShmUnlink(struct scclComm* comm);
// scclResult_t scclProxyDestroy(struct scclComm* comm);
// scclResult_t mscclSaveProxy(struct scclComm* comm, struct scclChannel* channel, int type, int peer, struct scclProxyOp* op, int connIndex);
#include "rocm_smi_wrap.h"
#include "topo_utils.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
#define ROCMSMICHECK(cmd) \
do { \
......@@ -124,4 +128,7 @@ scclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE*
return scclSuccess;
}
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
/*
Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ROCM_SMI_WRAP_H_
#define ROCM_SMI_WRAP_H_
#include "rocm_smi/rocm_smi.h"
#pragma once
#include <rocm_smi/rocm_smi.h>
#ifdef USE_ROCM_SMI64CONFIG
#include "rocm_smi/rocm_smi64Config.h"
#include <rocm_smi/rocm_smi64Config.h>
#endif
#include "base.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
// Initialize the ROCm SMI library
scclResult_t rocm_smi_init();
......@@ -46,5 +26,7 @@ scclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* d
// Get the link information between two devices: link type, hop count, and number of links
scclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int* hops, int* count);
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#include <string.h>
#include "topo_utils.h"
// #include "net.h"
// #include "xml.h"
// #include "net.h"
namespace sccl {
namespace hardware {
......@@ -30,6 +27,45 @@ scclResult_t busIdToInt64(const char* busId, int64_t* id) {
return scclSuccess;
}
// Maximum number of characters read from a sysfs attribute file.
static constexpr int MAX_STR_LEN = 255;
/**
 * @brief Read a string value from a file under a sysfs-style directory.
 *
 * Builds "<path>/<fileName>", opens it, and reads up to MAX_STR_LEN bytes
 * into strValue. On failure (or an empty file) strValue is set to the empty
 * string and an informational message is logged.
 *
 * @param path Directory containing the file
 * @param fileName Name of the file to read
 * @param strValue Output buffer; must hold at least MAX_STR_LEN bytes
 * @return scclResult_t Always scclSuccess (missing files are not an error)
 *
 * @note Content beyond MAX_STR_LEN is truncated. The last character read is
 *       replaced with '\0' — sysfs values end with a trailing newline, which
 *       this strips while also guaranteeing NUL termination.
 */
scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
    char filePath[PATH_MAX];
    // snprintf instead of sprintf: path + fileName may exceed PATH_MAX, and an
    // unbounded sprintf here would overflow the stack buffer.
    snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName);
    int offset = 0;
    FILE* file;
    if((file = fopen(filePath, "r")) != NULL) {
        while(feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
            int len = fread(strValue + offset, 1, MAX_STR_LEN - offset, file);
            offset += len;
        }
        fclose(file);
    }
    if(offset == 0) {
        strValue[0] = '\0';
        INFO(SCCL_LOG_TOPO, "System detection : could not read %s, ignoring", filePath);
    } else {
        // Drop the final character (trailing '\n' in sysfs files) and terminate.
        strValue[offset - 1] = '\0';
    }
    return scclSuccess;
}
scclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
char* str = path + offset;
// Remove trailing "/"
......
......@@ -2,20 +2,16 @@
#include <string.h>
#include "base.h"
#include "net.h"
#include "hardware_utils.h"
namespace sccl {
namespace hardware {
namespace topology {
// Hardware topology node type enumeration.
typedef enum topoNodeType {
GPU = 0, // Graphics processing unit
PCI = 1, // Peripheral Component Interconnect (PCI switch/bridge)
NVS = 2, // NVSwitch — NOTE(review): original comment said "non-volatile storage"; in GPU topology NVS denotes NVLink switches, confirm
CPU = 3, // Central processing unit — effectively a NUMA domain
NIC = 4, // Network interface controller
NET = 5 // Network
} topoNodeType_t;
#define SCCL_TOPO_NODE_TYPES (6) // Number of hardware node types
#define SCCL_TOPO_MAX_NODE_PER_TYPE (4) // Max nodes per type; indirectly bounds the topology's depth
#define SCCL_TOPO_RANK_MAX_LINKS (8) // Max links from a node to other nodes within the same rank
// topoPathType_t enumerates the different kinds of paths between topology nodes.
enum topoPathType {
......@@ -33,6 +29,29 @@ enum topoPathType {
////////////////////////////////////////////////////////////////////////////////////////////////
// // 定义拓扑节点的结构体
// struct scclTopoNode;
// // 定义拓扑链接的结构体
// struct scclTopoLink {
// int type; // 链接类型
// float bw; // 带宽
// struct scclTopoNode* remNode; // 远程节点指针
// };
// // 用于表示一组拓扑节点
// struct scclTopoNodeSet {
// int count; // 节点数量
// struct scclTopoNode nodes[SCCL_TOPO_MAX_NODE_PER_TYPE]; //
// };
// // struct scclTopoNodeSet nodes[SCCL_TOPO_NODE_TYPES];
// //// 计算直接相连的nodes信息
// int nlinks; // 链接数量
// struct scclTopoLink links[SCCL_TOPO_RANK_MAX_LINKS]; // rank内部直接相连的其他node的链接列表
////////////////////////////////////////////////////////////////////////////////////////////////
scclResult_t scclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue);
// 将64位整数转换为总线ID字符串
scclResult_t int64ToBusId(int64_t id, char* busId);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment