// graph.h

#ifndef SCCL_GRAPH_H_
#define SCCL_GRAPH_H_

// #include "topo_utils.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>

namespace sccl {
namespace hardware {
namespace topology {

#define MAX_XGMI_INTER_GPUS 4

// Parameters and results for one topology graph (ring / tree / collnet, see `id`).
//
// The fields are split, per the original grouping comments, into values that
// are both read and written ("Input / output") and values that are only
// produced ("Output").
// NOTE(review): the exact meaning of each field is defined by the code that
// fills this struct, which is not part of this header — the per-field notes
// below are limited to what the declarations show; confirm before relying on them.
//
// MAXCHANNELS and SCCL_TOPO_MAX_NODES are not defined in this header; they
// must be supplied by a header included before this point.
struct scclTopoGraph {
    // Input / output
    int id; // ring : 0, tree : 1, collnet : 2
    int pattern;
    int crossNic;
    int collNet;
    int minChannels; // presumably lower bound for nChannels — TODO confirm
    int maxChannels; // presumably upper bound for nChannels — TODO confirm
    // Output
    int nChannels;
    float bwIntra;
    float bwInter;
    float latencyInter;
    int typeIntra;
    int typeInter;
    int sameChannels;
    int nHops;
    // Flat array sized for SCCL_TOPO_MAX_NODES entries per channel.
    int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES];
    // Flat array sized for 2 entries per channel.
    int inter[MAXCHANNELS * 2];
    int nIntraChannels;
    // Flat array sized for 2 * SCCL_TOPO_MAX_NODES entries per channel.
    int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
    // SCCL_TOPO_MAX_NODES rows of SCCL_TOPO_MAX_NODES * 4 chars each.
    char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
};
// Per-channel rank tables: every member is an array of MAXCHANNELS ints,
// i.e. one slot per channel.
// NOTE(review): the member names suggest ring endpoints/neighbours, tree
// parent/children and NVLS heads, but the code that fills and reads these
// arrays is not in this header — confirm the exact meaning there.
// MAXCHANNELS is not defined in this header; it must come from an included header.
struct scclTopoRanks {
    int ringRecv[MAXCHANNELS];
    int ringSend[MAXCHANNELS];
    int ringPrev[MAXCHANNELS];
    int ringNext[MAXCHANNELS];
    int treeToParent[MAXCHANNELS];
    int treeToChild0[MAXCHANNELS];
    int treeToChild1[MAXCHANNELS];
    int nvlsHeads[MAXCHANNELS];
};

// struct sccl::hardware::topology::topo::scclTopoSystem;

// Sort the system topology structure.
// Takes a pointer to the topology system and returns a scclResult_t.
// NOTE(review): `struct scclTopoSystem` is not declared in this header (the
// forward declaration above is commented out). Unless an earlier include
// already declares it in a visible scope, this elaborated type specifier
// introduces a new incomplete type in sccl::hardware::topology — confirm it
// names the same type the implementation uses.
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
// Print the system topology structure.
// Takes a pointer to the topology system and returns a scclResult_t.
scclResult_t scclTopoPrint(struct scclTopoSystem* system);

// Compute the paths in the system.
// Takes the topology system and a communicator pointer; returns a scclResult_t.
// NOTE(review): how `comm` is used is not visible in this header — confirm
// against the implementation.
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
// // 释放系统拓扑结构
// void scclTopoFree(struct scclTopoSystem* system);
// // 裁剪系统拓扑结构
// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
// // 计算点对点通道
// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
// // 获取指定rank的Nvidia GPU信息
// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
// // 检查系统中是否所有路径都通过NVLink
// int scclTopoPathAllNVLink(struct scclTopoSystem* system);

// // 获取网络设备信息
// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
// Check whether a point-to-point connection exists between two devices.
// `id1` and `id2` are 64-bit identifiers of the two devices; returns a scclResult_t.
// NOTE(review): `p2p`, `read` and `intermediateRank` are int pointers and look
// like output parameters — confirm in the implementation which are written and
// whether any may be null.
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
// // 检查是否使用GDR
// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// // 获取内部网络设备信息
// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
// // 获取两个CUDA设备之间的连接类型
// scclResult_t scclTopoGetLinkType(
//     struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
// // 检查是否需要刷新
// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
// // 检查两个设备是否在同一网络中
// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
// // Disable PXN
// int scclPxnDisable(struct scclComm* comm);
// // Get the intermediate ranks used for PXN
// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
// // 获取本地节点的rank
// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // 获取CPU亲和性
// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// // 获取CPU类型信息
// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// // 获取GPU数量
// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// // 获取NVS数量
// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// // 获取本地网络设备信息
// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// // 获取本地GPU索引
// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// // 初始化搜索，调用scclTopoCompute之前需要执行
// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
// // 计算拓扑图
// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // 打印拓扑图
// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // 导出拓扑图
// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
// // 设置预定义拓扑图
// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
// // 设置后处理拓扑图
// scclResult_t scclTopoPostset(
//     struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
// // 设置基于树的后处理拓扑图
// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
// // 调整模型以适应计算能力
// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);

// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
// #include "info.h"
// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);

} // namespace topology
} // namespace hardware
} // namespace sccl

#endif