#ifndef SCCL_GRAPH_H_ #define SCCL_GRAPH_H_ // #include "topo_utils.h" #include "devcomm.h" #include #include #include #include #include namespace sccl { namespace hardware { namespace topology { #define MAX_XGMI_INTER_GPUS 4 struct scclTopoGraph { // Input / output int id; // ring : 0, tree : 1, collnet : 2 int pattern; int crossNic; int collNet; int minChannels; int maxChannels; // Output int nChannels; float bwIntra; float bwInter; float latencyInter; int typeIntra; int typeInter; int sameChannels; int nHops; int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES]; int inter[MAXCHANNELS * 2]; int nIntraChannels; int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2]; char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4]; }; struct scclTopoRanks { int ringRecv[MAXCHANNELS]; int ringSend[MAXCHANNELS]; int ringPrev[MAXCHANNELS]; int ringNext[MAXCHANNELS]; int treeToParent[MAXCHANNELS]; int treeToChild0[MAXCHANNELS]; int treeToChild1[MAXCHANNELS]; int nvlsHeads[MAXCHANNELS]; }; // struct sccl::hardware::topology::topo::scclTopoSystem; // 对系统拓扑结构进行排序 scclResult_t scclTopoSortSystem(struct scclTopoSystem* system); // 打印系统拓扑结构 scclResult_t scclTopoPrint(struct scclTopoSystem* system); // 计算系统中的路径 scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm); // // 释放系统拓扑结构 // void scclTopoFree(struct scclTopoSystem* system); // // 裁剪系统拓扑结构 // scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm); // // 计算点对点通道 // scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm); // // 获取指定rank的Nvidia GPU信息 // scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks); // // 检查系统中是否所有路径都通过NVLink // int scclTopoPathAllNVLink(struct scclTopoSystem* system); // // 获取网络设备信息 // scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); // // 检查两个设备之间是否存在点对点连接 scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank); // // 检查是否使用GDR // scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); // // 获取内部网络设备信息 // scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev); // // 获取两个CUDA设备之间的连接类型 // scclResult_t scclTopoGetLinkType( // struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr); // // 检查是否需要刷新 // scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush); // // 检查两个设备是否在同一网络中 // scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net); // // 禁用PXE网络 // int scclPxnDisable(struct scclComm* comm); // // 获取PXE网络中的中间节点 // scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks); // // 获取本地节点的rank // scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank); // // 获取CPU亲和性 // scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity); // // 获取CPU类型信息 // scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model); // // 获取GPU数量 // scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count); // // 获取NVS数量 // scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count); // // 获取本地网络设备信息 // scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id); // // 获取本地GPU索引 // scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex); // // 初始化搜索,调用scclTopoCompute之前需要执行 // scclResult_t scclTopoSearchInit(struct scclTopoSystem* system); // // 计算拓扑图 // scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph); // // 打印拓扑图 // scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph); // // 导出拓扑图 // scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs); // // 设置预定义拓扑图 // scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks); // // 设置后处理拓扑图 // scclResult_t scclTopoPostset( // struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc); // // 设置基于树的后处理拓扑图 // scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph); // // 调整模型以适应计算能力 // scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs); // scclResult_t scclTopoCudaPath(int cudaDev, char** path); // #include "info.h" // scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time); } // namespace topology } // namespace hardware } // namespace sccl #endif