Commit d9d23f34 authored by lishen's avatar lishen
Browse files

Initial Code for SCCL_v1

parent 57df3737
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "base.h"
#include "alloc.h"
#include "topo.h"
#include "xml.h"
#include "mpi.h"
#include "net.h"
#include "comm.h"
#include "graph.h"
using namespace sccl;
int main(int argc, char** argv) {
// struct sccl::hardware::topology::topo::scclXml* xml;
// SCCLCHECK(sccl::scclCalloc(&xml, 1));
// std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// struct sccl::hardware::topology::topo::scclTopoSystem* topoSystem;
// SCCLCHECK(sccl::hardware::topology::topo::scclTopoGetSystemFromXml(xml, &topoSystem));
// printf("topoSystem net.gdrSupport:%d\n", topoSystem->nodes[0].nodes[0].net.gdrSupport);
int rank, nranks;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nranks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
struct sccl::hardware::scclComm* comm;
struct sccl::hardware::topology::topo::scclTopoSystem* topoSystem;
SCCLCHECK(sccl::hardware::topology::topo::scclTopoGetSystem(&topoSystem));
printf("topoSystem net.gdrSupport:%d\n", topoSystem->nodes[0].nodes[0].net.gdrSupport);
topoSystem->nRanks = nranks;
topoSystem->netGdrLevel = -2;
topoSystem->pivotA2AEnabled = false;
topoSystem->pivotA2ANumBiRings = 0;
topoSystem->ll128Enabled = false;
topoSystem->mscclEnabled = false;
topoSystem->treeDefined = false;
SCCLCHECK(sccl::hardware::topology::scclTopoComputePaths(topoSystem, comm));
return 0;
}
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include "base.h"
#include "alloc.h"
#include "xml.h"
using namespace sccl;
int main(int argc, char** argv) {
struct sccl::hardware::topology::topo::scclXml* xml;
SCCLCHECK(sccl::scclCalloc(&xml, 1));
std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
SCCLCHECK(scclTopoDumpXmlToFile("test_xml.xml", xml));
return 0;
} // main pass
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
# Build command for the rocm_smi topology demo: compiles the demo source plus
# the rocm_smi wrapper and topo utilities with hipcc (C++17, -O3, OpenMP,
# HIP platform define) and links against amdhip64 and rocm_smi64.
# NOTE(review): paths are hard-coded to one user's checkout — adjust per machine.
hipcc /public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/test_rocm_smi.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/rocm_smi_wrap.cc \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp \
-o test_topo \
-std=c++17 -g -O3 -fopenmp -D__HIP_PLATFORM_HCC__ \
-I ./ -I /usr/include -I /opt/dtk/include \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/ \
-L /usr/lib/x86_64-linux-gnu \
-L /usr/lib/ \
-lamdhip64 -lrocm_smi64
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include <string.h>
#include "base.h"
#include "rocm_smi_wrap.h"
#include "topo_utils.h"
using namespace std;
using namespace sccl;
int main(int argc, char** argv) {
    // Smoke test for the rocm_smi wrapper: enumerate devices, query a PCI bus
    // id string, and probe the link between two device indices.
    printf("hello world\n");
    (void)rocm_smi_init();
    // BUG FIX: initialize out-params so a failed query doesn't print garbage.
    uint32_t num_devs = 0;
    (void)rocm_smi_getNumDevice(&num_devs);
    // BUG FIX: num_devs is uint32_t; printing it with %d is undefined — use %u.
    printf("num_devs=%u\n", num_devs);
    uint32_t deviceIndex = 0;
    char bus0[100] = "bus0";
    (void)rocm_smi_getDevicePciBusIdString(deviceIndex, bus0, 100);
    printf("bus0=%s\n", bus0);
    RSMI_IO_LINK_TYPE rsmi_type{};  // value-initialized in case the query fails
    int hops = 0, count = 0;
    (void)rocm_smi_getLinkInfo(0, 8, &rsmi_type, &hops, &count);
    printf("rsmi_type=%d, hops=%d, count=%d\n", rsmi_type, hops, count);
    return 0;
}
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "mpi.h"
#include "net.h"
#include "bootstrap_net.h"
using namespace sccl;
int main(int argc, char* argv[]) {
    // Minimal bootstrap smoke test: init MPI, report this rank, then run the
    // bootstrap network initialization.
    // FIX: removed unused locals (tag1, src, dst, cnt, MPI_Status status).
    int rank, nranks;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("rank=%d, nranks=%d\n", rank, nranks);
    // ----------------------------------------------------------------------- //
    INFO(SCCL_LOG_TOPO, "Bootstrap ...\n");
    (void)sccl::hardware::topology::bootstrap::bootstrap_net::bootstrapNetInit();
    MPI_Finalize();
    return 0;  // was implicit; make the success status explicit
}
/*
单机执行
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root -np 2 1_mpi_init
SCCL_DEBUG_LEVEL=SCCL_LOG_INFO SCCL_DEBUG_POS=SCCL_LOG_CODEALL mpirun --allow-run-as-root -np 2 1_mpi_init
跨机执行
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root --hostfile hostfile -np 16 ./1_mpi_init
*/
# Build command for the MPI bootstrap demo: compiles 1_mpi_init.cpp together
# with the IB-verbs/socket network backends, bootstrap helpers, and utility
# sources using hipcc (C++17, -O3, OpenMP), linking against ibverbs, rdmacm
# and Open MPI.
# NOTE(review): paths are hard-coded to one user's checkout — adjust per machine.
hipcc ./1_mpi_init.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvsymbols.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvwrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/net_ib.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/socket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/net_socket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/ipcsocket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/proxy.cpp \
-o 1_mpi_init \
-std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ \
-I ./ -I /usr/include -I /opt/dtk/include \
-I /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/ \
-L /public/home/lishen/Code/rocSHMEM/SCCL_v1 \
-L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
-L /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib -lmpi
# gather功能
包括 `gather` 和 `all-gather`
# reduce功能
包括 `reduce` 和 `all-reduce`
# scatter功能
包括 `scatter` 和 `reduce-scatter`
#pragma once
#include <stdint.h>
#include "base.h"
#include "topo.h"
namespace sccl {
namespace hardware {
// Per-rank identity/capability record (scclUniqueInfo) describing one
// communication peer.
struct scclUniqueInfo {
int rank; // global rank of this peer
int nRanks; // total number of ranks
int localRank; // rank within the local compute node
int localRanks; // number of ranks on the local compute node
int cudaDev; // GPU device id (field keeps CUDA naming; project targets ROCm)
int gdrSupport; // whether GPU Direct RDMA (GDR) is supported
uint64_t hostHash; // hash identifying the host
uint64_t pidHash; // hash identifying the process
int64_t busId; // PCI bus id
};
// // 定义结构体 scclCommBase,用于存储通信基础信息
// struct scclCommBase {
// struct scclUniqueInfo* peerInfo; // 指向 peerInfo 结构体的指针,存储所有节点的信息
// sccl::hardware::net::scclNet_t* scclNet; // 指向网络结构体的指针,用于网络通信
// };
} // namespace hardware
} // namespace sccl
#include "comm.h"
#include "graph.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
/// @brief Per-channel preset pass: reset every channel's ring/tree/collnet
/// connectivity fields to "unset" (-1), then fill in this rank's intra-node
/// neighbors from the RING/TREE/COLLNET_CHAIN/NVLS graphs and record the
/// per-channel boundary ranks in topoRanks for the later inter-node
/// connection pass. Finally duplicates the channel block when room remains.
/// @param comm      communicator whose channels are initialized
/// @param graphs    per-algorithm topology graphs (indexed by SCCL_ALGO_*)
/// @param topoRanks out: per-channel ring endpoints, tree parent/children,
///                  and NVLS head ranks
scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
// Reset all connectivity fields of this channel to "not connected".
channel->ring.prev = channel->ring.next = -1;
channel->tree.up = -1;
channel->collnetChain.up = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->tree.down[i] = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->collnetChain.down[i] = -1;
channel->collnetDirect.out = -1;
channel->collnetDirect.headRank = -1;
channel->collnetDirect.nHeads = 0;
channel->collnetDirect.shift = 0;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.up[i] = -1;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.down[i] = -1;
// Row c of each graph's intra array lists this channel's intra-node rank order.
int* ringIntra = graphs[SCCL_ALGO_RING]->intra + c * localRanks;
int* treeIntra = graphs[SCCL_ALGO_TREE]->intra + c * localRanks;
int* collNetIntra = graphs[SCCL_ALGO_COLLNET_CHAIN]->intra + c * localRanks;
int* nvlsIntra = graphs[SCCL_ALGO_NVLS]->intra + c * localRanks;
for(int i = 0; i < localRanks; i++) {
if(ringIntra[i] == rank) {
// First/last rank of the intra ring are the node's recv/send endpoints.
topoRanks->ringRecv[c] = ringIntra[0];
topoRanks->ringSend[c] = ringIntra[localRanks - 1];
channel->ring.prev = (i == 0) ? -1 : ringIntra[i - 1];
channel->ring.next = (i == localRanks - 1) ? -1 : ringIntra[i + 1];
}
if(treeIntra[i] == rank) {
// Which intra positions talk to the parent / child subtrees depends
// on the tree pattern (TREE vs SPLIT_TREE).
int parentIndex = 0;
int child0Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
topoRanks->treeToChild1[c] = treeIntra[child1Index];
channel->tree.up = i == 0 ? -1 : treeIntra[i - 1];
channel->tree.down[0] = i == localRanks - 1 ? -1 : treeIntra[i + 1];
}
if(collNetIntra[i] == rank) {
// Chain head points "up" to the network root id (== nRanks).
channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i - 1];
channel->collnetChain.down[0] = i == localRanks - 1 ? -1 : collNetIntra[i + 1];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
topoRanks->nvlsHeads[c] = nvlsIntra[0];
}
// Duplicate channels rings/trees
// (only when the doubled channel count still fits within MAXCHANNELS).
struct scclChannel* channel0 = comm->channels;
struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
if(channel1)
memcpy(channel1, channel0, nChannels * sizeof(struct scclChannel));
return scclSuccess;
}
// Scan s[start, end) and report whether the decimal number `rank` occurs in
// it. Consecutive digits are accumulated into a single value; '(' and ')'
// terminate and reset the accumulator. Returns false for an empty or
// invalid range.
bool isRankHere(const char* s, int start, int end, int rank) {
    if(start < 0 || end < 0 || end <= start)
        return false;
    int value = 0;
    int pos = start;
    while(pos < end) {
        const char ch = s[pos];
        if(ch == '(' || ch == ')') {
            // A parenthesis ends the current number; nothing to compare yet.
            value = 0;
            pos++;
            continue;
        }
        if(isdigit(ch)) {
            value = value * 10 + (ch - '0');
            if(isdigit(s[pos + 1])) {
                pos++;
                continue; // the number continues on the next character
            }
        }
        if(value == rank)
            return true;
        pos++;
    }
    return false;
}
/// @brief Override each channel's tree links from the textual descriptions in
/// treeGraph->treeBase. Each base string encodes a tree as nested
/// parenthesized rank numbers (e.g. "(p(c0)(c1))"); the function locates the
/// current rank in the string and rewrites channel->tree.up and
/// channel->tree.down[] from its parent and children. No-op when no base
/// tree string is defined.
scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph) {
// x = number of non-empty base tree strings. NOTE(review): y is unused.
int x = 0, y = 0;
for(int i = 0; treeGraph->treeBase[i][0] != 0; i++) {
x = i + 1;
}
if(treeGraph->treeBase[0][0] == 0)
return scclSuccess;
int nChannels = comm->nChannels;
int localRanks = comm->topo->nodes[GPU].count;
// new tree
for(int c = 0; c < nChannels; c++) { // in here
// Channels cycle round-robin over the available base strings.
int buff = c % x;
char tempString[SCCL_TOPO_MAX_NODES * 4];
int ko = 0;
// Copy the NUL-terminated base string into a local working buffer.
while(treeGraph->treeBase[buff][ko] != 0) {
tempString[ko] = treeGraph->treeBase[buff][ko];
ko++;
}
tempString[ko] = 0;
int start = 0;
int curRank = comm->rank;
struct scclChannel* channel = comm->channels + c;
int end = 0;
while(tempString[end] != 0)
end++;
int parent = -1;
// constructing a number from the continuous digits
while(start < end) {
int num = 0, num_found = 0;
start++;
// Accumulate one rank number until a parenthesis or the end of range.
while(start < end && tempString[start] != '(' && tempString[start] != ')') {
int num_here = (int)(tempString[start] - '0');
num = num * 10 + num_here;
start = start + 1;
if(tempString[start] == '(' || tempString[start] == ')' || start == end)
num_found = 1;
}
if(num_found != 0 && num == curRank) {
// Found ourselves: the previously seen number is our parent.
channel->tree.up = parent;
int depth = 0;
// Each following parenthesized group at this level is one child subtree.
for(int childId = 0; childId < SCCL_MAX_TREE_ARITY; childId++) {
int or_start = start;
int child = -1;
channel->tree.down[childId] = -1;
if(or_start >= end - 1)
continue;
num = 0;
or_start++;
// The child's rank is the first number inside its group.
while(tempString[or_start] != 0 && tempString[or_start] != '(' && tempString[or_start] != ')') {
int num_here = (int)(tempString[or_start] - '0');
num = num * 10 + num_here;
or_start++;
}
child = num;
// find next child start
while(start < end) {
if(tempString[start] == '(')
depth++;
else if(tempString[start] == ')')
depth--;
if(depth == 0)
break; // next child
start++;
}
start++;
channel->tree.down[childId] = child;
// get kids, update numbers, get out of this string
}
break;
} else { // go to the next one
// Not us: descend into whichever child subtree contains curRank.
parent = num;
int start_c = start;
int end_c = start_c;
while(end_c < end) {
int depth = 0;
// Find the extent of the next balanced parenthesized group.
while(end_c < end) {
if(tempString[end_c] == '(')
depth++;
else if(tempString[end_c] == ')')
depth--;
if(depth == 0)
break; // next child
end_c++;
}
if(isRankHere(tempString, start_c, end_c, curRank)) {
// Narrow the search window to this subtree.
start = start_c;
end = end_c;
break;
} else {
end_c++;
start_c = end_c;
}
}
}
}
}
return scclSuccess;
}
// Stitch the per-node intra rings into global rings. For each channel, every
// node's recv rank is linked back to the previous node's send rank and every
// node's send rank forward to the next node's recv rank (wrapping around);
// when this rank is one of those endpoints its channel (and the duplicate
// channel, if present) is updated in place.
static scclResult_t connectRings(struct scclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
    const int nChannels = comm->nChannels;
    const int nNodes = comm->nNodes;
    for(int c = 0; c < nChannels; c++) {
        int* nodeRecv = ringRecv + c * comm->nNodes;
        int* nodeSend = ringSend + c * comm->nNodes;
        int* rankPrev = ringPrev + c * comm->nRanks;
        int* rankNext = ringNext + c * comm->nRanks;
        struct scclChannel* primary = comm->channels + c;
        // The duplicate channel only exists while the doubled count fits in MAXCHANNELS.
        struct scclChannel* duplicate = (nChannels > MAXCHANNELS / 2) ? 0 : primary + nChannels;
        for(int n = 0; n < nNodes; n++) {
            const int recvRank = nodeRecv[n];
            const int prevSendRank = nodeSend[(n - 1 + nNodes) % nNodes];
            rankPrev[recvRank] = prevSendRank;
            if(comm->rank == recvRank) {
                primary->ring.prev = prevSendRank;
                if(duplicate)
                    duplicate->ring.prev = prevSendRank;
            }
            const int sendRank = nodeSend[n];
            const int nextRecvRank = nodeRecv[(n + 1) % nNodes];
            rankNext[sendRank] = nextRecvRank;
            if(comm->rank == sendRank) {
                primary->ring.next = nextRecvRank;
                if(duplicate)
                    duplicate->ring.next = nextRecvRank;
            }
        }
    }
    return scclSuccess;
}
// Copy the first nNodes rank entries from `ranks` into `indexes`.
static scclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
    for(int idx = 0; idx < nNodes; ++idx) {
        indexes[idx] = ranks[idx];
    }
    return scclSuccess;
}
// Set the tree's parent to indexes[u]; u == -1 means "no parent" and leaves
// the tree untouched.
static scclResult_t setTreeUp(struct scclTree* tree, int* indexes, int u) {
    if(u != -1) {
        tree->up = indexes[u];
    }
    return scclSuccess;
}
// Append indexes[d] into the first free (< 0) slot of tree->down. A d of -1
// means "no child" and is a no-op. Errors out when all SCCL_MAX_TREE_ARITY
// child slots are already occupied.
static scclResult_t setTreeDown(struct scclTree* tree, int* indexes, int d) {
    if(d == -1)
        return scclSuccess;
    int slot = 0;
    while(slot < SCCL_MAX_TREE_ARITY && tree->down[slot] >= 0)
        slot++;
    if(slot == SCCL_MAX_TREE_ARITY) {
        WARN("Internal error : tree already has %d children (%d %d %d)", slot, tree->down[0], tree->down[1], tree->down[2]);
        return scclInternalError;
    }
    tree->down[slot] = indexes[d];
    return scclSuccess;
}
/// @brief Connect the intra-node trees across nodes using the double binary
/// tree layout from scclGetDtree. Channels are handled in pairs: when
/// comm->nChannels <= MAXCHANNELS/2 each pair lives at (c, c + nChannels);
/// otherwise tree 0 is applied to the first half of the channel range and
/// tree 1 to the second half.
/// NOTE(review): the treePatterns parameter is not read here — confirm
/// whether it is still needed by callers.
static scclResult_t connectTrees(struct scclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int nChannels = (comm->nChannels > MAXCHANNELS / 2) ? comm->nChannels / 2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node;
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks / nNodes - 1 + log2i(nNodes);
// t0*/t1* describe this node's parent/children in the two dual trees.
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
int *ttp, *ttc0, *ttc1;
SCCLCHECK(scclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
if(comm->nChannels <= MAXCHANNELS / 2) {
// Paired layout: channel c carries tree 0 and channel c + nChannels tree 1.
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
struct scclChannel* channel1 = channel0 + nChannels;
// Per-channel arrays of which rank on each node talks to parent/children.
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
} else {
// Split layout: tree 0 on channels [0, nChannels) ...
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
}
channel0->tree.depth = depth;
}
// ... and tree 1 on channels [nChannels, 2*nChannels).
for(int c = nChannels; c < nChannels * 2; c++) {
struct scclChannel* channel1 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel1->tree.depth = depth;
}
}
return scclSuccess;
}
/// @brief Build collnetDirect connectivity for every channel: collect the
/// distinct head ranks (position 0 of each collnet intra list), then for each
/// channel mark this rank either as a head (connected down to all local
/// peers and out to the network root id == nRanks) or as a leaf (connected up
/// to every head). Also sets collnetChain depth and the NVLS out link.
static scclResult_t connectCollNet(struct scclComm* comm, struct scclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nHeads = 0;
int* heads;
// NOTE(review): heads is allocated with scclCalloc but released with
// free() below — confirm scclCalloc is malloc-compatible.
SCCLCHECK(scclCalloc(&heads, localRanks));
// Find all head ranks
// Head index is always 0
for(int c = 0; c < collNetGraph->nChannels; c++) {
int* collNetIntra = collNetGraph->intra + c * localRanks;
int head = collNetIntra[0];
// Deduplicate: only record a head the first time it is seen.
for(int h = 0; h < nHeads; h++)
if(heads[h] == head)
head = -1;
if(head != -1)
heads[nHeads++] = collNetIntra[0];
}
// For all channels
for(int c = 0; c < comm->nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
// `line` accumulates a human-readable summary for the INFO log below.
char line[1024];
sprintf(line, "CollNet channel %d rank %d ", c, rank);
int nDown = 0;
for(int i = 0; i < nHeads; i++) {
if(rank == heads[i]) { // is head
channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
int* collNetIntra = collNetGraph->intra + i * localRanks;
sprintf(line + strlen(line), "down ");
for(int r = 0; r < localRanks; r++) {
if(collNetIntra[r] == rank)
continue;
channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
sprintf(line + strlen(line), " %d ", collNetIntra[r]);
}
sprintf(line + strlen(line), "nDown %d ", nDown);
break;
}
}
// Connect to all heads
int nUp = 0;
sprintf(line + strlen(line), "up ");
for(int h = 0; h < nHeads; h++) {
if(rank == heads[h])
continue;
channel->collnetDirect.up[nUp++] = heads[h];
sprintf(line + strlen(line), " %d ", heads[h]);
}
channel->collnetDirect.nHeads = nHeads;
channel->collnetDirect.shift = (rank % localRanks) % nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line + strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
sprintf(line + strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
INFO(SCCL_LOG_TOPO, "%s", line);
channel->collnetChain.depth = comm->nRanks / comm->nNodes;
}
// NVLS channels with a head also point their out link at the network root.
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
if(channel->nvls.headRank != -1)
channel->nvls.out = comm->nRanks;
}
free(heads);
return scclSuccess;
}
// Build NVLS connectivity for every NVLS channel: first the intra-node
// head/up/down bookkeeping, then (multi-node only) the two inter-node
// double-binary trees that link the head ranks of each node.
// nvlsHeads is laid out as [head h][node n] -> rank.
static scclResult_t connectNvls(struct scclComm* comm, int* nvlsHeads, struct scclTopoGraph* nvlsGraph) {
  int nHeads = nvlsGraph->nChannels;
  // Index of the NVLS head whose intra-node leader is this rank; -1 if none.
  int headRank = -1;
  for(int h = 0; h < nHeads; h++) {
    if(nvlsGraph->intra[h * comm->localRanks] == comm->rank)
      headRank = h;
  }
  // No NVLS heads at all: disable NVLS channels and bail out.
  if(nHeads == 0) {
    comm->nvlsChannels = 0;
    return scclSuccess;
  }
  // Intra-node setup, identical for every NVLS channel. Ranks
  // comm->nRanks + 1 + h act as virtual ids for the NVLS heads.
  for(int c = 0; c < comm->nvlsChannels; c++) {
    struct scclChannel* channel = comm->channels + c;
    channel->nvls.nHeads = nHeads;
    for(int h = 0; h < nHeads; h++)
      channel->nvls.up[h] = comm->nRanks + 1 + h; // virtual rank of head h
    for(int h = nHeads; h < SCCL_MAX_NVLS_ARITY; h++)
      channel->nvls.up[h] = -1; // unused slots
    channel->nvls.down = comm->nRanks + 1 + headRank;
    channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
    channel->nvls.headRank = headRank;
    channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
    channel->nvls.node = comm->node;
    channel->nvls.nNodes = comm->nNodes;
  }
  // Single node: no inter-node trees to build.
  if(comm->nNodes == 1)
    return scclSuccess;
  // Connect Trees: get this node's parent/children in both halves of the
  // double binary tree built over nodes.
  int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
  int pc0, pc1; // ignored
  SCCLCHECK(scclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0, &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
  int* heads = NULL;
  int treeUp[2] = {-1, -1};
  int treeDown0[2] = {-1, -1};
  int treeDown1[2] = {-1, -1};
  // Node 0 logs the head table (first 20 nodes per head) for debugging.
  if(comm->node == 0) {
    for(int h = 0; h < nHeads; h++) {
      char line[1024];
      sprintf(line, "NVLS Head %2d:", h);
      heads = nvlsHeads + h * comm->nNodes;
      for(int n = 0; n < comm->nNodes && n < 20; n++) {
        sprintf(line + strlen(line), " %2d", heads[n]);
      }
      INFO(SCCL_INIT, "%s", line);
    }
  }
  // Find the heads where I'm the head rank and retain tree up/down
  for(int h = 0; h < nHeads; h++) {
    heads = nvlsHeads + h * comm->nNodes;
    if(heads[comm->node] == comm->rank) {
      // Map node indices from scclGetDtree to the actual head ranks of
      // those nodes; -1 (no parent/child) propagates through.
      treeUp[0] = tree0Parent == -1 ? -1 : heads[tree0Parent];
      treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
      treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
      treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
      treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
      treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
      break;
    }
  }
  // Set prev/next in all channels (NVLS compute channels work
  // orthogonally to NVLS search channels). Even channels use tree 0,
  // odd channels use tree 1.
  for(int c = 0; c < comm->nvlsChannels; c++) {
    struct scclChannel* channel = comm->channels + c;
    channel->nvls.treeUp = treeUp[c % 2];
    channel->nvls.treeDown[0] = channel->nvls.down; // local NVLS id is always the first child
    int ix = 1;
    if(treeDown0[c % 2] != -1)
      channel->nvls.treeDown[ix++] = treeDown0[c % 2];
    if(treeDown1[c % 2] != -1)
      channel->nvls.treeDown[ix] = treeDown1[c % 2];
  }
  // NOTE(review): the log below reads channels[0] and channels[1]; it appears
  // to assume nvlsChannels >= 2 on multi-node runs — confirm.
  struct scclNvls* nvls0 = &comm->channels[0].nvls;
  struct scclNvls* nvls1 = &comm->channels[1].nvls;
  INFO(SCCL_LOG_TOPO,
       "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
       nvls0->treeDown[0],
       nvls0->treeDown[1],
       comm->rank,
       nvls0->treeUp,
       nvls1->treeDown[0],
       nvls1->treeDown[1],
       comm->rank,
       nvls1->treeUp);
  return scclSuccess;
}
// Channel-count environment knobs. -2 is the "unset" sentinel; the *_NRINGS
// pair is the legacy spelling kept for backward compatibility.
// Legacy naming
SCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
SCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
// New naming
// NOTE(review): MIN_NCHANNELS defaults to 4 rather than the -2 sentinel, so
// in scclMinNchannels() it always overrides the legacy MIN_NRINGS value when
// the environment is untouched — confirm this is intentional.
SCCL_PARAM(MinNchannels, "MIN_NCHANNELS", 4);
SCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
// Resolve the configured minimum channel count. The legacy MIN_NRINGS knob is
// consulted first, then the newer MIN_NCHANNELS overrides it (-2 means unset).
// The result is clamped to [0, MAXCHANNELS], warning if the user over-asked.
int scclMinNchannels() {
  int lo = 2;
  if(scclParamMinNrings() != -2)
    lo = scclParamMinNrings();
  if(scclParamMinNchannels() != -2)
    lo = scclParamMinNchannels();
  if(lo > MAXCHANNELS) {
    WARN("User asked for a minimum of %d channels, limiting to %d", lo, MAXCHANNELS);
    lo = MAXCHANNELS;
  }
  return lo < 0 ? 0 : lo;
}
// Resolve the configured maximum channel count. Legacy MAX_NRINGS is applied
// first, then MAX_NCHANNELS overrides it (-2 means unset). The result is
// clamped to [1, MAXCHANNELS], warning when the user asked for less than 1.
int scclMaxNchannels() {
  int hi = MAXCHANNELS;
  if(scclParamMaxNrings() != -2)
    hi = scclParamMaxNrings();
  if(scclParamMaxNchannels() != -2)
    hi = scclParamMaxNchannels();
  if(hi > MAXCHANNELS)
    hi = MAXCHANNELS;
  if(hi < 1) {
    WARN("User asked for a maximum of %d channels, setting it to 1", hi);
    hi = 1;
  }
  return hi;
}
// Duplicate channels [0, end-start) into slots [start, end): per-rank ring
// prev/next tables and the channel structs themselves are memcpy'd.
// Returns the new channel count (== end when start < end, else start).
static int copyChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
  const int nranks = comm->nRanks;
  int c = start;
  while(c < end) {
    const int src = c - start; // source channel being replicated
    memcpy(ringPrev + c * nranks, ringPrev + src * nranks, nranks * sizeof(int));
    memcpy(ringNext + c * nranks, ringNext + src * nranks, nranks * sizeof(int));
    memcpy(comm->channels + c, comm->channels + src, sizeof(struct scclChannel));
    ++c;
  }
  return c;
}
// Same duplication as copyChannels, but every duplicated channel is switched
// to the comm's mixed transport type (Hylink + PCIe double-channel path).
// Returns the new channel count.
static int copyMixedChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
  const int nranks = comm->nRanks;
  int c = start;
  while(c < end) {
    const int src = c - start; // source channel being replicated
    memcpy(ringPrev + c * nranks, ringPrev + src * nranks, nranks * sizeof(int));
    memcpy(ringNext + c * nranks, ringNext + src * nranks, nranks * sizeof(int));
    memcpy(comm->channels + c, comm->channels + src, sizeof(struct scclChannel));
    comm->channels[c].transportType = comm->mixedTransportType;
    ++c;
  }
  return c;
}
// Knobs for the mixed Hylink + PCIe double-channel path (0 disables it).
// NOTE(review): these use RCCL_PARAM / rcclParam* while the rest of this file
// uses SCCL_PARAM / scclParam* — presumably a leftover from the RCCL port;
// confirm whether they should be renamed for consistency.
RCCL_PARAM(MaxMixedHylinkNChannels, "MAX_MIXED_HYLINK_NCHANNELS", 0);
RCCL_PARAM(MixedTransportType, "MIXED_TRANSPORT_TYPE", TRANSPORT_SHM);
// Final topology pass run after every rank's preset data has been exchanged:
// gathers per-node/per-rank ring and tree endpoints, connects rings, trees,
// NVLS and (optionally) CollNet, then duplicates channels until the
// min/max-channel constraints are met and builds the rings array.
// BUGFIX: the SDMA channel-count branch referenced ncSdma,
// checkSdmaCopyEnable() and getSdmaChannelNum(), all of which only exist when
// HCU_SDMA_FEATURE is defined, but the branch itself was unguarded — non-SDMA
// builds could not compile. The branch is now wrapped in the same #ifdef with
// an else fall-through to the generic path.
scclResult_t scclTopoPostset(
    struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
  int nranks = comm->nRanks;
  int nNodes = comm->nNodes;
  int nChannels = comm->nChannels;
  int MinNChannels = scclMinNchannels();
  int MaxNChannels = scclMaxNchannels();
  SCCLCHECK(scclCalloc(&ringRecv, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&ringSend, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&ringPrev, nranks * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&ringNext, nranks * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&treeToParent, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&treeToChild0, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&treeToChild1, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&nvlsHeads, nNodes * MAXCHANNELS));
  // Flatten each rank's topo data into per-node (indexed via firstRanks) and
  // per-rank tables, one row per channel.
  for(int c = 0; c < nChannels; c++) {
    for(int n = 0; n < nNodes; n++) {
      int r = firstRanks[n];
      ringRecv[c * nNodes + n] = allTopoRanks[r]->ringRecv[c];
      ringSend[c * nNodes + n] = allTopoRanks[r]->ringSend[c];
      treeToParent[c * nNodes + n] = allTopoRanks[r]->treeToParent[c];
      treeToChild0[c * nNodes + n] = allTopoRanks[r]->treeToChild0[c];
      treeToChild1[c * nNodes + n] = allTopoRanks[r]->treeToChild1[c];
      nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
    }
    for(int r = 0; r < nranks; r++) {
      ringPrev[c * nranks + r] = allTopoRanks[r]->ringPrev[c];
      ringNext[c * nranks + r] = allTopoRanks[r]->ringNext[c];
    }
  }
  // Connect rings and trees. This should also duplicate the channels.
  SCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
  SCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
  SCCLCHECK(connectNvls(comm, nvlsHeads, graphs[SCCL_ALGO_NVLS]));
  // Duplicate ringPrev/ringNext for scclBuildRing
  if(nChannels <= MAXCHANNELS / 2)
    memcpy(ringPrev + nChannels * nranks, ringPrev, nChannels * nranks * sizeof(int));
  if(nChannels <= MAXCHANNELS / 2)
    memcpy(ringNext + nChannels * nranks, ringNext, nChannels * nranks * sizeof(int));
  // All-NVLink systems default both bounds to 32 unless explicitly set.
  if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_NCHANNELS") == NULL)
    MinNChannels = 32;
  if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_NCHANNELS") == NULL)
    MaxNChannels = 32;
#ifdef HCU_SDMA_FEATURE
  // Channel budget used by the SDMA path below (multiple of comm->nChannels).
  int ncSdma = nc;
  ncSdma = std::min((int)scclMaxNchannels() / comm->nChannels, nc);
  ncSdma *= comm->nChannels;
#endif
  // Get number of channels after duplication
  nc = std::min((int)MaxNChannels / comm->nChannels, nc);
  nc *= comm->nChannels;
  // Duplication should be complete now
  nChannels = comm->nChannels = std::min(MAXCHANNELS, (nChannels <= MAXCHANNELS / 2) ? nChannels * 2 : nChannels);
  // Setup CollNet
  if(comm->collNetSupport == 1) {
    struct scclTopoGraph* collNetGraph = graphs[SCCL_ALGO_COLLNET_DIRECT];
    // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
    if(collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
      int collNetNchannels = std::min(MAXCHANNELS, nChannels + nChannels / 2);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
    }
    SCCLCHECK(connectCollNet(comm, collNetGraph));
  }
  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
  if(comm->minCompCap == 90 && comm->nNodes > 1 && graphs[SCCL_ALGO_RING]->bwIntra > 45.0 && 2 * nChannels <= MAXCHANNELS) {
    nChannels = comm->nChannels = copyChannels(comm, nChannels, 2 * nChannels, ringPrev, ringNext);
  }
  // Add Hylink + PCIE double channel path
  if(graphs[SCCL_ALGO_RING]->typeIntra == PATH_NVL) {
    comm->nMixedHylinkChannels = std::min(MAXCHANNELS - comm->nChannels, (int)rcclParamMaxMixedHylinkNChannels());
    if(comm->nMixedHylinkChannels > 0) {
      INFO(SCCL_LOG_TOPO,
           "<%s:%d> -----> comm->nMixedHylinkShmChannels: %d, comm->nChannels: %d\n",
           __func__,
           __LINE__,
           comm->nMixedHylinkChannels,
           comm->nChannels);
      comm->mixedTransportType = std::max((int)rcclParamMixedTransportType(), TRANSPORT_SHM);
      nChannels = comm->nChannels = copyMixedChannels(comm, nChannels, nChannels + comm->nMixedHylinkChannels, ringPrev, ringNext);
    }
  }
  // Honor SCCL_MIN_NRINGS/SCCL_MAX_NRINGS.
  // We permit combining max, then min, to only use the first channels, then duplicate them.
#ifdef HCU_SDMA_FEATURE
  if(checkSdmaCopyEnable(comm)) {
    uint32_t sdmaChannelNum;
    uint32_t maxChannels;
    sdmaChannelNum = getSdmaChannelNum(comm);
    if(comm->sharedRes->owner != comm) {
      /* child comm #channels cannot exceed top parent #channels. */
      nChannels = comm->nChannels = std::min(std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
      maxChannels =
          sdmaChannelNum ? sdmaChannelNum : std::min(std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs)), comm->sharedRes->tpNChannels);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
    } else {
      nChannels = comm->nChannels = std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs);
      maxChannels = sdmaChannelNum ? sdmaChannelNum : std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs));
      nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
    }
    INFO(SCCL_INIT, "-hcugon- scclTopoPostset rank %d sdmaChannelNum %d nChannels %d", comm->rank, sdmaChannelNum, comm->nChannels);
  } else
#endif
  {
    // NOTE(review): this path uses the (possibly NVLink-adjusted) local
    // Min/MaxNChannels while the SDMA path re-reads scclMin/MaxNchannels()
    // directly — confirm the difference is intentional.
    if(comm->sharedRes->owner != comm) {
      /* child comm #channels cannot exceed top parent #channels. */
      nChannels = comm->nChannels = std::min(std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
      nChannels = comm->nChannels = copyChannels(
          comm, nChannels, std::min(std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
    } else {
      nChannels = comm->nChannels = std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
    }
  }
  // Create rings array and check all is fine
  SCCLCHECK(scclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
  free(ringRecv);
  free(ringSend);
  free(ringPrev);
  free(ringNext);
  free(treeToParent);
  free(treeToChild0);
  free(treeToChild1);
  free(nvlsHeads);
  return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_DEVICE_H_
#define SCCL_DEVICE_H_
#include "check.h"
#include "sccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined(ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HCU_SDMA_FEATURE
#include "hsa/hsa_ext_amd.h"
#include "hsa_extra.h"
// #define HCU_PRINT_DEBUG
#endif
namespace sccl {
// Debug print stubs: compiled to nothing in this build.
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
// NPKit GPU event collection helpers; no-ops unless both NPKit and the
// HCU SDMA feature are enabled.
#if defined(ENABLE_NPKIT) && defined(HCU_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
  NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) NpKit::CollectGpuEvent(event, size, cost, tm, scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HCU_SDMA_FEATURE
// Initialize the SDMA-related fields of a primitives object from kernel args;
// all SDMA options are gated on args->useSdma and a valid sdmaQueueCtx.
#define INIT_PRIMS_SDMA(prims, args) \
  { \
    prims.rank = scclShmem.comm.rank; \
    prims.useSdmaConfig = args->useSdma; \
    prims.useSdmaCopy = args->useSdma && prims.sdmaQueueCtx; \
    prims.preFnOps = args->preFnOps; \
    prims.sdmaMinCopySize = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->minCopySize : 0; \
    prims.sdmaCountEnable = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->copyCountEnable : 0; \
    prims.sdmaCopyCount = 0; \
    prims.allCopyCount = 0; \
  }
#endif
#define SCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
// Collective function ids. The first SCCL_NUM_FUNCTIONS entries are the
// "regular" collectives; SendRecv/Send/Recv/AllToAllPivot follow.
typedef enum {
  scclFuncBroadcast,
  scclFuncReduce,
  scclFuncAllGather,
  scclFuncReduceScatter,
  scclFuncAllReduce,
  scclFuncSendRecv,
  scclFuncSend,
  scclFuncRecv,
  scclFuncAllToAllPivot,
  scclNumFuncs
} scclFunc_t;
// Human-readable function names.
// NOTE(review): SCCL_NUM_FUNCTIONS + 2 == 7 slots, but scclFunc_t has 9
// enumerators before scclNumFuncs — confirm the intended table size/indexing.
extern const char* scclFuncStr[SCCL_NUM_FUNCTIONS + 2];
// Algorithm identifiers. These are object-like macros used as plain ints
// throughout the code base.
#define SCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define SCCL_ALGO_TREE 0 // tree algorithm
#define SCCL_ALGO_RING 1 // ring algorithm
#define SCCL_ALGO_COLLNET_DIRECT 2 // direct collnet algorithm
#define SCCL_ALGO_COLLNET_CHAIN 3 // chained collnet algorithm
#define SCCL_ALGO_NVLS 4 // NVLink SHARP algorithm
#define SCCL_ALGO_NVLS_TREE 5 // NVLink SHARP tree algorithm
// BUGFIX: a duplicate `enum scclAlgo` re-declared the exact identifiers
// defined as macros above. The preprocessor rewrites its enumerators to
// `0 = 0`, `1 = 1`, ... which is ill-formed and cannot compile, so the enum
// has been removed; all call sites use the macro constants as ints.
extern const char* scclAlgoStr[SCCL_NUM_ALGORITHMS];
// Transfer protocol ids: LL (low-latency 8B flag+data), LL128 (128B-line
// variant) and Simple.
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define SCCL_PROTO_LL 0
#define SCCL_PROTO_LL128 1
#define SCCL_PROTO_SIMPLE 2
extern const char* scclProtoStr[SCCL_NUM_PROTOCOLS];
// Maximum enqueued operations and pipeline steps per connection.
#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8
// One 16-byte LL (low-latency) protocol line: two 4B data words, each
// followed by its 4B validity flag.
union scclLLFifoLine {
  /* Flags have to be *after* data, because otherwise, an incomplete receive
     from the network may receive the flag but not the data.
     Note this is assuming that either we receive contiguous chunks of data
     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
  struct {
    uint32_t data1;
    uint32_t flag1;
    uint32_t data2;
    uint32_t flag2;
  };
  uint64_t v[2]; // the two 8B (data,flag) pairs
  int4 i4;       // vectorized 16B view (CUDA/HIP int4)
};
// NOTE(review): warpSize is the CUDA/HIP device-side built-in — presumably
// WARP_SIZE is only evaluated in device code; confirm for host users.
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_SIMPLE_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define SCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define SCCL_LL_FLAG_MAX 0x100
#define SCCL_LL_FLAG(a) ((uint32_t)((a) % SCCL_LL_FLAG_MAX))
#else
#define SCCL_LL_CLEAN_MASK 0x7ffffff8
#define SCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least SCCL_NSTEPS
static_assert(SCCL_LL_CLEAN_MASK % SCCL_STEPS == 0, "Invalid SCCL_LL_CLEAN_MASK value");
// LL128 line layout: 64B lines of eight u64 elements, one reserved for flags.
#define SCCL_LL128_LINESIZE 64
#define SCCL_LL128_LINEELEMS (SCCL_LL128_LINESIZE / sizeof(uint64_t))
#define SCCL_LL128_DATAELEMS (SCCL_LL128_LINEELEMS - 1)
#define SCCL_LL128_MAX_NTHREADS 256
#define SCCL_LL128_ELEMS_PER_THREAD 28
#define SCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define SCCL_LL128_SHMEM_SIZE (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * SCCL_LL128_MAX_NTHREADS)
// Connection capability flag bits (scclConnInfo::flags).
#define SCCL_DIRECT_WRITE 0x01
#define SCCL_DIRECT_READ 0x02
#define SCCL_DIRECT_NIC 0x04
#define SCCL_IPC_WRITE 0x08
#define SCCL_IPC_READ 0x10
#define SCCL_NVLS_MIN_POLL 0x20
#ifdef HCU_SDMA_FEATURE
// Magic value marking an initialized SDMA context (scclConnInfo::sdmaCtxValidMagic).
#define SDMA_CTX_VALID_MAGIC 0xD65A
#endif
// Shared connection state between a sender/receiver pair (buffers, fifo
// head/tail pointers, capability flags) as seen by the device and proxy.
struct scclConnInfo {
  // Regular comm mechanism
  char* buffs[SCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
  uint64_t* tail; // Local for recv, remote for send
  uint64_t* head; // Local for send, remote for recv
  int flags; // Direct communication / other flags
  int shared; // Buffers are shared
  void** ptrExchange; // Pointer exchange for direct communication
  uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
  int* sizesFifo; // Sizes fifo from GPU to proxy
  int* offsFifo; // Buffer fifo from proxy to GPU
  uint64_t step; // Keep where we are
  uint64_t llLastCleaning; // step at which the last LL flag cleanup was done
  // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
  // allows software to explicitly initiate a flush read to HDP memory. See more
  // descriptions in primitives.h.
  uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
  uint32_t* curr_hdp_reg; // Current GPU's HDP register
#ifdef HCU_SDMA_FEATURE
  struct sdmaQueueContext* sdmaQueueCtx; // SDMA queue used for this connection
  uint32_t sdmaCtxValidMagic; // SDMA_CTX_VALID_MAGIC when sdmaQueueCtx is initialized
#endif
};
// Identifies the proxy-side endpoint that services a connector.
struct scclProxyConnector {
  int tpRank;
  int tpLocalRank;
  int sameProcess; // nonzero when the proxy lives in the same process
  struct scclProxyConnection* connection;
};
// Host-side view of one directed connection: transport implementation,
// its private resources, proxy hookup, and the shared scclConnInfo.
struct scclConnector {
  int connected; // nonzero once the connection is established
  struct scclProxyConnector proxyConn;
  struct scclTransportComm* transportComm; // send or recv side of the transport
  void* transportResources; // transport-private state
  struct scclConnInfo conn; // state shared with the device
};
// One rank's position in a ring.
struct scclRing {
  // Shortcuts for userRanks[1] and userRanks[n-1]
  int prev;
  int next;
  // Maps an internal sccl index to user-specified rank order. This is necessary
  // since we need to know how the user expects data to be ordered across
  // devices. Ordered from current device.
  int* userRanks;
  int index; // This rank's index in the ring
};
// The root of each tree only has one node down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have to two nodes down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY 3
// One rank's position in a tree: its parent (up) and children (down).
struct scclTree {
  int depth;
  int up;
  int down[SCCL_MAX_TREE_ARITY];
};
#define SCCL_MAX_DIRECT_ARITY 7
// CollNet-direct topology for one channel (see connectCollNet for setup).
struct scclDirect {
  int depth;
  int out; // id of the collnet root (set to nRanks on head ranks)
  int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
  int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
  int up[SCCL_MAX_DIRECT_ARITY];
  int down[SCCL_MAX_DIRECT_ARITY];
};
#define SCCL_CONN_IDX_P2P_NET 2
#define SCCL_MAX_NVLS_ARITY 8
#define SCCL_MAX_NVLS_TREE_ARITY 3
// NVLS topology for one channel (see connectNvls for how it is populated;
// up/down use virtual ranks nRanks+1+h for head h).
struct scclNvls {
  int out; // -1: NVLS+SHARP not implemented
  int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
  int up[SCCL_MAX_NVLS_ARITY]; // virtual ranks of all heads, -1 padded
  int down; // virtual rank of this rank's own head
  int treeUp; // inter-node tree parent (head rank), -1 at the root
  int treeDown[SCCL_MAX_NVLS_TREE_ARITY]; // local NVLS id plus up to two child heads
  int node;
  int nNodes;
};
#define SCCL_MAX_CONNS 3
// Host-side per-peer connectors for one channel: one send/recv connector
// per connection index, plus a reference count for shared use.
struct scclChannelPeer {
  struct scclConnector send[SCCL_MAX_CONNS];
  struct scclConnector recv[SCCL_MAX_CONNS];
  int refCount;
};
struct scclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8) /* set alignment to 8 bytes boundary */
/* scclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of scclWorkElem. */
#define SCCL_WORK_SIZE 256
// Kind of payload carried by a scclWork slot.
enum scclWorkType : uint8_t {
  scclWorkTypeUnused = 0,
  scclWorkTypeColl = 1,
  scclWorkTypeP2p = 2,
  scclWorkTypeRegColl = 3
};
// Direction of a p2p work element.
enum scclWorkP2PType : uint8_t {
  scclWorkP2pTypeUnused = 0,
  scclWorkP2pTypeSend,
  scclWorkP2pTypeRecv
};
// Common header at the front of every scclWork slot.
struct scclWorkHeader {
  union {
    int32_t workNext; // when isLast=0: Offset from kernel argument workHead
    uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
  };
  uint16_t funcIndex;
  uint8_t isLast : 1; // last work for this kernel
  uint8_t inFifo : 1; // is this work in the fifo
  enum scclWorkType type;
};
// One collective work element, packed into a 256B scclWork slot.
struct scclWorkElem {
  union {
    uint8_t flagBits; // raw access to the bitfields below
    struct {
      uint8_t isUsed : 1, redOpArgIsPtr : 1, regUsed : 1, nWarps : 5;
    };
  };
  uint8_t direct;
  uint8_t bid;
  uint8_t nChannels;
  struct {
    uint32_t root : 28;
    uint32_t preFnOps : 1;
    uint32_t useSdma : 1;
    uint32_t connIndex : 2;
  };
  const void* sendbuff;
  void* recvbuff;
  size_t count;
  union {
    size_t lastChunkSize;
    // Pivot A2A kernel computes chunk size itself.
    // Instead, it needs the number of bidirectional rings.
    size_t pivotA2ANumBiRings;
  };
  uint64_t redOpArg;
  uint64_t opCount;
};
static_assert((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElem))) / sizeof(scclWorkElem) == 4,
              "Sanity check: SCCL_MAX_WORK_ELEMENTS == 4");
// NOTE(review): the assert above proves 4 elements fit in a slot, yet the
// limit is set to 1 — presumably a deliberate restriction in this port;
// confirm against the kernel side.
#define SCCL_MAX_WORK_ELEMENTS 1
// One point-to-point work element (28 bytes, see alignment note below).
struct scclWorkElemP2p {
  struct {
    int32_t peer : 26;
    uint32_t preFnOps : 1;
    uint32_t useSdma : 1;
    uint32_t connIndex : 2;
    int32_t proto : 2;
  };
  union {
    uint16_t flagBits; // raw access to the bitfields below
    struct {
      enum scclWorkP2PType p2pType : 4;
      uint16_t nWarps : 4;
      uint16_t warpStart : 4;
      uint16_t ngroups : 4;
    };
  };
  uint16_t opCount;
  // Important not to use any fields with greater than 4-byte alignment since
  // we need sizeof(scclWorkElemP2p)==28, but that would be padded up to 32 if
  // there were 8-byte fields.
  // void* buff;
  uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
  // size_t count;
  uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
  int chunkSize;
};
static_assert(((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemP2p))) / sizeof(scclWorkElemP2p)) == 8,
              "Sanity check: SCCL_MAX_WORK_ELEMENTS_P2P == 8");
// NOTE(review): the assert proves 8 elements fit, yet the limit is set to 2 —
// presumably deliberate in this port; confirm against the kernel side.
#define SCCL_MAX_WORK_ELEMENTS_P2P 2
// Collective work element extended with registered-buffer pointers for the
// collnet-direct up/down peers.
struct scclWorkElemReg {
  struct scclWorkElem elem;
  void* dnInputs[SCCL_MAX_DIRECT_ARITY + 1];
  void* dnOutputs[SCCL_MAX_DIRECT_ARITY + 1];
  void* upOutputs[SCCL_MAX_DIRECT_ARITY + 1];
};
// How many registered elements fit in one slot (computed, asserted == 1).
#define SCCL_MAX_WORK_ELEMENTS_REG ((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemReg))) / sizeof(scclWorkElemReg))
static_assert(SCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: SCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define SCCL_MAX_GROUPS (SCCL_MAX_NTHREADS / WARP_SIZE)
// One 256B work slot: a header plus a union of coll/p2p/registered payloads,
// padded to exactly SCCL_WORK_SIZE (asserted below).
struct scclWork {
  struct scclWorkHeader header;
  union {
    char pad[SCCL_WORK_SIZE - sizeof(struct scclWorkHeader)];
    struct scclWorkElem elems[SCCL_MAX_WORK_ELEMENTS];
    struct scclWorkElemP2p p2pElems[SCCL_MAX_WORK_ELEMENTS_P2P];
    struct scclWorkElemReg regElems[SCCL_MAX_WORK_ELEMENTS_REG];
  };
};
static_assert(sizeof(struct scclWork) == SCCL_WORK_SIZE, "Sanity check: sizeof(struct scclWork) == SCCL_WORK_SIZE");
static_assert(sizeof(struct scclWork) % 16 == 0, "Sanity check: sizeof(struct scclWork)%16 == 0");
struct scclDevChannelPeer {
  // Stripped version of scclChannelPeer where we only keep the scclConnInfo
  // instead of the full scclConnector.
  struct scclConnInfo send[SCCL_MAX_CONNS];
  struct scclConnInfo recv[SCCL_MAX_CONNS];
};
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
// Device-side profiling buffer: up to PROFILE_NUM_ITEMS (line, timestamp)
// probe records per launch; size is pinned to 256B by the assert below.
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct scclProf {
  uint32_t count; // number of valid entries in elem[]
  uint32_t seq; // only entry from first launch is used
  struct {
    uint64_t line : 16; // probe id / source line
    uint64_t timeStamp : 48;
  } elem[PROFILE_NUM_ITEMS];
};
static_assert(sizeof(struct scclProf) == 256, "scclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
// Record types for the device-side collective trace ring buffer. The two
// *ElemType values are flag bits combinable with the base types.
typedef enum {
  scclCollTraceNotReady = 0,
  scclCollTraceKernelLaunchType = 1,
  scclCollTraceKernelEndType = 2,
  scclCollTraceCollLaunchType = 3,
  scclCollTraceAbortType = 4,
  scclCollTraceDataType = 5,
  scclCollTraceCollElemType = (1 << 4),
  scclCollTraceP2pElemType = (1 << 5),
} scclCollTraceDataType_t;
// One 32-byte trace record (size asserted below to stay a power of two).
struct scclCollTrace {
  uint8_t type; // scclCollTraceDataType_t value (plus optional elem flags)
  uint8_t bid;
  int16_t funcIndex;
  uint32_t data_0;
  uint64_t timeStamp;
  union {
    uint64_t opCount;
    uint32_t p2pOpCount[2];
  };
  union {
    uint64_t data_1;
    struct {
      uint8_t nWarps;
      uint8_t bid;
      uint8_t nChannels;
    } coll; // payload when tracing a collective element
    struct {
      int16_t peer;
      uint8_t ngroups : 4;
      uint8_t connIndex : 4;
      uint8_t warpStart : 4;
      uint8_t nWarps : 4;
    } p2p[2]; // payload when tracing p2p elements
  };
};
static_assert(sizeof(struct scclCollTrace) == 8 * sizeof(int), "scclCollTrace must have a pow2 size");
// Trace consumer tail index, padded to its own page to avoid sharing.
union scclCollTraceTail {
  uint32_t tail;
  char padding[4096];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HCU_SDMA_FEATURE
// State of one SDMA copy queue, shared by the connections that use it
// (referenced from scclConnInfo::sdmaQueueCtx).
struct sdmaQueueContext {
  hsa_sdma_info_t* sdmaInfo; // underlying HSA SDMA queue descriptor
  uint64_t pkgIndex;
  uint32_t queueId;
  uint32_t sumSdmaCopyCount; // cumulative copies issued through SDMA
  uint32_t sumAllCopyCount; // cumulative copies overall
  uint32_t queueLock;
  uint32_t minCopySize; // copies smaller than this bypass SDMA (see INIT_PRIMS_SDMA)
  uint32_t copyCountEnable;
  uint32_t sdmaQueueDepth;
  uint32_t sdmaPkgLen;
  uint32_t sdmaQueueLen;
};
#endif
// Per-channel topology as seen from the device: peer connection table plus
// one entry per supported algorithm layout.
struct alignas(16) scclDevChannel {
  struct scclDevChannelPeer** peers;
  struct scclRing ring;
  struct scclTree tree;
  struct scclTree collnetChain;
  struct scclDirect collnetDirect;
  struct scclTree binTree;
  struct scclNvls nvls;
  uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};
// Device-resident communicator state passed to the kernels.
struct scclDevComm {
  int rank;
  int nRanks;
  int buffSizes[SCCL_NUM_PROTOCOLS]; // per-protocol buffer sizes
  // Operation list for aggregation
  int workFifoDepth;
  struct scclWork* workFifoHeap; // may be cudaHost or GDR memory
  // Flag to ask SCCL kernels to abort
  volatile uint32_t* abortFlag;
  // Channels, device side
  struct scclDevChannel* channels /*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
  NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
  struct scclCollTrace* collTrace; // device trace ring buffer
  union scclCollTraceTail* collTraceTail;
  pthread_t collTraceThread; // host thread draining the trace buffer
#endif
#ifdef ENABLE_PROFILING
  struct scclProf* devProf;
#endif
#if defined(ENABLE_TIMELINE)
  TimelineGpuEventContext* gpuEventContext;
#endif
#if defined(ENABLE_NPKIT) || defined(ENABLE_TIMELINE)
  uint64_t* cpuTimestamp;
#endif
};
// Device comm plus all channel structs packed in one 16B-aligned allocation.
struct alignas(16) scclDevCommAndChannels {
  struct scclDevComm comm;
  struct scclDevChannel channels[MAXCHANNELS];
};
// Arch value usable from host code too: 0 when not compiling device code.
#ifdef __CUDA_ARCH__
#define SCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define SCCL_CUDA_ARCH 0
#endif
// Variadic compile-time minimum: single-argument base case.
template <typename T>
__host__ __device__ constexpr T min_constexpr(T a) {
  return a;
}
// Fold the first two arguments, then recurse over the remainder.
template <typename T, typename... Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts... c) {
  return min_constexpr<T>((b < a ? b : a), c...);
}
// Variadic compile-time maximum: single-argument base case.
template <typename T>
__host__ __device__ constexpr T max_constexpr(T a) {
  return a;
}
// Fold the first two arguments, then recurse over the remainder.
template <typename T, typename... Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts... c) {
  return max_constexpr<T>((b > a ? b : a), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int scclCalcUnroll(int bytePerPack, int insns, int bytes) {
  return min_constexpr(insns, (bytes + bytePerPack - 1) / bytePerPack);
}
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to SCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int scclCollUnroll(int cudaArch = SCCL_CUDA_ARCH) {
  // Our collective unroll should move to the same bytes&insns model as NVLS.
  return cudaArch >= 800 ? 8 : 4;
}
// NVLS unroll targets: 64 in-flight bytes per iteration, at most 16 instructions.
__host__ __device__ constexpr int scclNvlsUnrollBytes(int cudaArch = SCCL_CUDA_ARCH) { return 4 * 16; }
__host__ __device__ constexpr int scclNvlsUnrollInsns(int cudaArch = SCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int scclNvlsUnroll(int bytePerPack, int cudaArch = SCCL_CUDA_ARCH) {
  return scclCalcUnroll(bytePerPack, scclNvlsUnrollInsns(cudaArch), scclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp: the worst case across all protocols,
// rounded up to a multiple of 16 bytes.
__host__ __device__ constexpr int scclShmemScratchWarpSize(int cudaArch = SCCL_CUDA_ARCH) {
  return (max_constexpr<int>(
              /*LL */ 0,
              /*LL128 */ (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * WARP_SIZE) * sizeof(uint64_t),
              /*SIMPLE*/ (scclCollUnroll(cudaArch) * WARP_SIZE + 1) * 16,
              // NVLS needs an extra 16B to read unaligned data.
              /*NVLS */ WARP_SIZE * (cudaArch >= 900 ? scclNvlsUnrollBytes(cudaArch) : 0) + 16) +
          15) &
         -16; // pad to 16 bytes
}
// The amount of dynamic shmem per block (zero below arch 700).
__host__ __device__ constexpr int scclShmemDynamicSize(int cudaArch = SCCL_CUDA_ARCH) {
  return cudaArch < 700 ? 0 : scclShmemScratchWarpSize(cudaArch) * (SCCL_MAX_NTHREADS / WARP_SIZE);
}
} // namespace sccl
#endif
#ifndef SCCL_GRAPH_H_
#define SCCL_GRAPH_H_
// #include "topo_utils.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
namespace sccl {
namespace hardware {
namespace topology {
#define MAX_XGMI_INTER_GPUS 4
// Input constraints and output of a topology search for one algorithm.
struct scclTopoGraph {
  // Input / output
  int id; // ring : 0, tree : 1, collnet : 2
  int pattern;
  int crossNic;
  int collNet;
  int minChannels;
  int maxChannels;
  // Output
  int nChannels; // number of channels the search produced
  float bwIntra;
  float bwInter;
  float latencyInter;
  int typeIntra; // intra-node path type (e.g. PATH_NVL)
  int typeInter;
  int sameChannels;
  int nHops;
  int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES]; // per-channel intra-node rank ordering
  int inter[MAXCHANNELS * 2];
  int nIntraChannels;
  int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
  char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
};
// Per-rank ring/tree endpoints for each channel, exchanged across ranks
// during setup and consumed by scclTopoPostset.
struct scclTopoRanks {
  int ringRecv[MAXCHANNELS];
  int ringSend[MAXCHANNELS];
  int ringPrev[MAXCHANNELS];
  int ringNext[MAXCHANNELS];
  int treeToParent[MAXCHANNELS];
  int treeToChild0[MAXCHANNELS];
  int treeToChild1[MAXCHANNELS];
  int nvlsHeads[MAXCHANNELS];
};
// struct sccl::hardware::topology::topo::scclTopoSystem;
// Sort the system topology structure
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
// Print the system topology structure
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Compute the paths within the system
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
// // Free the system topology structure
// void scclTopoFree(struct scclTopoSystem* system);
// // Trim the system topology structure
// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
// // Compute point-to-point channels
// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
// // Get the NVB-connected GPUs for the given rank
// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
// // Check whether all paths in the system go through NVLink
// int scclTopoPathAllNVLink(struct scclTopoSystem* system);
// // Get network device information
// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
// // Check whether a point-to-point connection exists between two devices
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
// // Check whether GDR is used
// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// // Get the intra-node network device information
// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
// // Get the link type between two CUDA devices
// scclResult_t scclTopoGetLinkType(
//     struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
// // Check whether a flush is needed
// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
// // Check whether two devices are on the same network
// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
// // Disable the PXN network
// int scclPxnDisable(struct scclComm* comm);
// // Get the intermediate ranks of the PXN network
// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
// // Get the local rank of the node
// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // Get the CPU affinity
// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// // Get CPU type information
// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// // Get the GPU count
// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// // Get the NVS count
// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// // Get local network device information
// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// // Get the local GPU index
// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// // Initialize the search; must run before scclTopoCompute
// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
// // Compute the topology graph
// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Print the topology graph
// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Dump the topology graphs
// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
// // Set the preset topology graphs
// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
// // Set the post-processed topology graphs
// scclResult_t scclTopoPostset(
//     struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
// // Tree-based post-processing of the topology graph
// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
// // Tune the model to the compute capability
// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);
// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
// #include "info.h"
// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
#include "channel.h"
#include "xml.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
// Fixed-capacity list of topology nodes; used as the BFS frontier by
// scclTopoSetPaths below.
struct scclTopoNodeList {
struct scclTopoNode* list[SCCL_TOPO_MAX_NODES];  // nodes in the current frontier
int count;                                       // number of valid entries in 'list'
};
// Locate, in 'node's path table, the entry describing the path towards the
// node of type 't' whose id is 'id'. Returns scclInternalError (with a
// warning) when no node of that type/id exists in the system.
static scclResult_t getPath(struct scclTopoSystem* system, struct scclTopoNode* node, int t, int64_t id, struct scclTopoLinkList** path) {
    const int count = system->nodes[t].count;
    for(int idx = 0; idx < count; idx++) {
        if(system->nodes[t].nodes[idx].id != id)
            continue;
        *path = node->paths[t] + idx;
        return scclSuccess;
    }
    WARN("Could not find node of type %d id %lx", t, id);
    return scclInternalError;
}
// Breadth-first search rooted at 'baseNode': for every node in the system,
// compute the best path (maximizing bandwidth, then minimizing hop count)
// leading back to 'baseNode'. Each node stores its result under
// paths[baseNode->type], indexed by baseNode's position in its type array.
static scclResult_t scclTopoSetPaths(struct scclTopoNode* baseNode, struct scclTopoSystem* system) {
if(baseNode->paths[baseNode->type] == NULL) {
// Lazily allocate baseNode's own path table for nodes of its type.
SCCLCHECK(scclCalloc(baseNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
// breadth-first search to set all paths to that node in the system
struct scclTopoNodeList nodeList;
struct scclTopoNodeList nextNodeList;
nodeList.count = 1;
nodeList.list[0] = baseNode;
nextNodeList.count = 0;
// The path from baseNode to itself: zero hops, local bandwidth/type.
struct scclTopoLinkList* basePath;
SCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->bw = LOC_BW;
basePath->type = PATH_LOC;
while(nodeList.count) {
nextNodeList.count = 0;
for(int n = 0; n < nodeList.count; n++) {
struct scclTopoNode* node = nodeList.list[n];
struct scclTopoLinkList* path;
// Path from the current frontier node back to baseNode.
SCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
for(int l = 0; l < node->nlinks; l++) {
struct scclTopoLink* link = node->links + l;
struct scclTopoNode* remNode = link->remNode;
if(remNode->paths[baseNode->type] == NULL) {
SCCLCHECK(scclCalloc(remNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
struct scclTopoLinkList* remPath;
SCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
// A path's bandwidth is limited by its narrowest hop.
float bw = std::min(path->bw, link->bw);
// allow routing through a GPU only as 1 hop
if(node != baseNode && node->type == GPU && (link->type != LINK_NVL || remNode->type != GPU || path->count > 1))
continue;
// Update remNode's path only when it strictly improves bandwidth and
// the existing path is either unset or longer.
if((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
// Find reverse link
for(int l = 0; l < remNode->nlinks; l++) {  // NOTE(review): shadows the outer 'l'; harmless but worth renaming.
if(remNode->links[l].remNode == node) {
remPath->list[0] = remNode->links + l;
break;
}
}
if(remPath->list[0] == NULL) {
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
remNode->type,
remNode->id,
remNode->nlinks,
node->type,
node->id);
return scclInternalError;
}
// Copy the rest of the path
for(int i = 0; i < path->count; i++)
remPath->list[i + 1] = path->list[i];
remPath->count = path->count + 1;
remPath->bw = bw;
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Differentiate between one and multiple PCI switches
if(node->type == PCI && remNode->type == PCI)
type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if(link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU))
type = PATH_PHB;
// Set 1 hop NVLink as NVB
// if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
// Overall path type is the worst (largest) type along the path.
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
// Disallow GPUs as intermediate steps for now
if(remNode->type != GPU) {
int i;
for(i = 0; i < nextNodeList.count; i++)
if(nextNodeList.list[i] == remNode)
break;
if(i == nextNodeList.count)
nextNodeList.list[nextNodeList.count++] = remNode;
}
}
}
}
// Advance the BFS frontier.
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
}
return scclSuccess;
}
/**
* 打印节点路径信息
*
* @param system 拓扑系统指针
* @param node 待打印路径的节点指针
*
* 该函数用于输出指定节点的路径信息,包括路径类型、目标节点ID、
* 路径跳数、带宽和路径类型字符串。输出格式为一行字符串。
*/
static void printNodePaths(struct scclTopoSystem* system, struct scclTopoNode* node) {
char line[1024];
sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
int offset = strlen(line);
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
if(node->paths[t] == NULL)
continue;
for(int n = 0; n < system->nodes[t].count; n++) {
sprintf(line + offset,
"%s/%lX (%d/%f/%s) ",
topoNodeTypeStr[t],
system->nodes[t].nodes[n].id,
node->paths[t][n].count,
node->paths[t][n].bw,
topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
}
}
}
// Find the CPU (NUMA node) with the fewest hops from GPU index 'gpu' and
// return its index through 'retCpu'.
static scclResult_t getLocalCpu(struct scclTopoSystem* system, int gpu, int* retCpu) {
    struct scclTopoLinkList* cpuPaths = system->nodes[GPU].nodes[gpu].paths[CPU];
    int bestCpu = -1;
    int bestHops = 0;  // 0 acts as "nothing selected yet"
    for(int c = 0; c < system->nodes[CPU].count; c++) {
        int hops = cpuPaths[c].count;
        if(bestHops == 0 || hops < bestHops) {
            bestCpu = c;
            bestHops = hops;
        }
    }
    if(bestCpu == -1) {
        WARN("Error : could not find CPU close to GPU %d", gpu);
        return scclInternalError;
    }
    *retCpu = bestCpu;
    return scclSuccess;
}
// Replace the direct path t1/i1 -> t2/i2 by a path routed through the
// intermediate node tx/ix: first the links of t1/i1 -> tx/ix, then the links
// of tx/ix -> t2/i2. The intermediate is typically a CPU (GDR/P2P fallback)
// or a GPU (PXN relay).
static scclResult_t addInterStep(struct scclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
// Named 'cpuNode' for the common case, but tx may also be GPU (PXN).
struct scclTopoNode* cpuNode = system->nodes[tx].nodes + ix;
struct scclTopoNode* srcNode = system->nodes[t1].nodes + i1;
int l = 0;
// Node 1 -> CPU
for(int i = 0; i < srcNode->paths[tx][ix].count; i++)
srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
// CPU -> Node 2
for(int i = 0; i < cpuNode->paths[t2][i2].count; i++)
srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
// Update path characteristics
srcNode->paths[t2][i2].count = l;
// Combined type is the worst of both halves; a GPU relay is marked PXN.
srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
if(tx == GPU)
srcNode->paths[t2][i2].type = PATH_PXN;
// Combined bandwidth is the narrower of the two halves.
srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
return scclSuccess;
}
// Remove/free paths for a given type
static void scclTopoRemovePathType(struct scclTopoSystem* system, int nodeType) {
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
// Remove links _to_ the given type
for(int n = 0; n < system->nodes[t].count; n++) {
struct scclTopoNode* node = system->nodes[t].nodes + n;
free(node->paths[nodeType]);
node->paths[nodeType] = NULL;
}
// Remove links _from_ the given type
for(int n = 0; n < system->nodes[nodeType].count; n++) {
struct scclTopoNode* node = system->nodes[nodeType].nodes + n;
free(node->paths[t]);
node->paths[t] = NULL;
}
}
}
// Mapping from the legacy numeric level setting (0..5) to PATH_* constants.
static const int levelsOldToNew[] = {PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS};
// Resolve a path-level setting from the environment, caching it in *level.
// On entry, *level == -1 means "not resolved yet" (anything else: no-op).
// On exit, *level is a PATH_* value (0 meaning disabled via 'disableEnv'),
// or -2 when neither environment variable is set.
// 'levelEnv' accepts either a PATH_* name or a legacy numeric level.
scclResult_t scclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
if(*level == -1) {
int l = -1;
// A "disable" variable set to 1 maps to level 0 (feature fully off).
if(disableEnv) {
char* str = getenv(disableEnv);
if(str) {
int disable = strtol(str, NULL, 0);
if(disable == 1)
l = 0;
}
}
if(l == -1) {
char* str = getenv(levelEnv);
if(str) {
// First try to match a PATH_* name (e.g. "PXB", "SYS").
for(int i = 0; i <= PATH_SYS; i++) {
if(strcmp(str, topoPathTypeStr[i]) == 0) {
l = i;
break;
}
}
// Old style numbering
// levelsOldToNew to is an array with each index corresponding to the
// "old level" int, and each value mapping to the correct value defined in topo.h
// maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
if(l == -1 && str[0] >= '0' && str[0] <= '9') {
int oldLevel = strtol(str, NULL, 0);
const int maxOldLevel = sizeof(levelsOldToNew) / sizeof(int) - 1;
if(oldLevel > maxOldLevel)
oldLevel = maxOldLevel;
l = levelsOldToNew[oldLevel];
}
}
}
if(l >= 0)
INFO(SCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
// -2 marks "resolved, but nothing set in the environment".
*level = l >= 0 ? l : -2;
}
return scclSuccess;
}
SCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
// Cached SCCL_NET_GDR_LEVEL setting; -1 = not resolved yet (see scclGetLevel).
int scclTopoUserGdrLevel = -1;
// Decide whether GPU Direct RDMA can be used between GPU 'busId' and NIC
// 'netDev'. 'read' distinguishes the send (read) direction, which has extra
// restrictions. *useGdr is set to 1 only when both devices support GDR and
// the GPU<->NIC path is close enough (<= the configured netGdrLevel).
scclResult_t scclTopoCheckGdr(struct scclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
*useGdr = 0;
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
struct scclTopoNode* net = system->nodes[NET].nodes + n;
SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
// Check that both the NIC and GPUs support it
if(net->net.gdrSupport == 0)
return scclSuccess;
if(gpu->gpu.gdrSupport == 0)
return scclSuccess;
if(read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = scclParamNetGdrRead();
if(gdrReadParam == 0)
return scclSuccess;
// Default (< 0): only enable GDR reads when the GPU has an NVLink peer.
if(gdrReadParam < 0) {
int nvlink = 0;
// Since we don't know whether there are other communicators,
// it's better to keep things local if we have a single GPU.
if(system->nodes[GPU].count == 1)
nvlink = 1;
for(int i = 0; i < system->nodes[GPU].count; i++) {
if(i == g)
continue;
if(gpu->paths[GPU][i].type == PATH_NVL) {
nvlink = 1;
break;
}
}
if(!nvlink)
return scclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
SCCLCHECK(scclGetLevel(&scclTopoUserGdrLevel, NULL, "SCCL_NET_GDR_LEVEL"));
if(scclTopoUserGdrLevel != -2)
netGdrLevel = scclTopoUserGdrLevel;
else {
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
// On AMD Rome, relax the level to PATH_PHB when the GPU and NIC sit under
// the same NUMA node (both 2 hops from the same CPU, matching bus segment).
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
int i, d1 = -1, d2 = -1;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[GPU].nodes[g].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d1 = system->nodes[CPU].nodes[i].id;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[NET].nodes[n].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d2 = system->nodes[CPU].nodes[i].id;
if(d1 != -1 && d2 != -1 && d1 == d2 && (system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000)) {
netGdrLevel = PATH_PHB;
}
}
}
int distance = gpu->paths[NET][n].type;
if(distance == PATH_PXN) {
// In case of PXN, use the intermediate GPU distance instead
int proxyRank, g;  // NOTE(review): shadows the outer 'g' on purpose.
SCCLCHECK(scclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
SCCLCHECK(scclTopoRankToIndex(system, proxyRank, &g));
struct scclTopoNode* proxyGpu = system->nodes[GPU].nodes + g;
distance = proxyGpu->paths[NET][n].type;
}
if(distance > netGdrLevel) {
INFO(SCCL_NET, "GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
return scclSuccess;
}
*useGdr = 1;
INFO(SCCL_NET, "GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
return scclSuccess;
}
// Set to 0 to disable the flush on Hopper when using GDR
SCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
// Determine whether we need to flush the GDR recv buffers
scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush) {
    int gpuIndex;
    SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &gpuIndex));
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + gpuIndex;
    // Pre-Hopper parts (compute capability < 90) always require the flush;
    // newer parts follow the SCCL_NET_FORCE_FLUSH parameter (default: on).
    if(gpu->gpu.cudaCompCap < 90)
        *flush = 1;
    else
        *flush = scclParamNetForceFlush();
    return scclSuccess;
}
SCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
// Check whether going through the network would be faster than going through P2P/SHM.
scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
    // Intra-node NET can be globally disabled (default).
    if(scclParamNetDisableIntra() == 1) {
        *net = 0;
        return scclSuccess;
    }
    *net = 1;
    // First check the current GPU-to-GPU speed; if either GPU is unknown,
    // keep *net = 1.
    int g1, g2;
    if(scclTopoIdToIndex(system, GPU, id1, &g1) != scclSuccess || scclTopoIdToIndex(system, GPU, id2, &g2) != scclSuccess) {
        return scclSuccess;
    }
    struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
    struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
    float p2pBw = gpu1->paths[GPU][g2].bw;
    // Best NIC bandwidth each GPU can reach at PATH_PXB or better.
    float bestNet1 = 0;
    float bestNet2 = 0;
    for(int n = 0; n < system->nodes[NET].count; n++) {
        struct scclTopoLinkList* p1 = gpu1->paths[NET] + n;
        struct scclTopoLinkList* p2 = gpu2->paths[NET] + n;
        if(p1->type <= PATH_PXB && p1->bw > bestNet1)
            bestNet1 = p1->bw;
        if(p2->type <= PATH_PXB && p2->bw > bestNet2)
            bestNet2 = p2->bw;
    }
    // Use the network only when both GPUs reach it faster than direct P2P.
    if(!(bestNet1 > p2pBw && bestNet2 > p2pBw))
        *net = 0;
    return scclSuccess;
}
// For a PXN path from GPU 'rank' to NIC 'netDev', return the rank of the
// intermediate (relay) GPU through '*intermediateRank'; for non-PXN paths,
// return 'rank' itself.
scclResult_t scclTopoGetIntermediateRank(struct scclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
SCCLCHECK(scclTopoRankToIndex(system, rank, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
struct scclTopoLinkList* path = gpu->paths[NET] + n;
if(path->type == PATH_PXN) {
// Walk the path, skipping NVSwitch hops; the first non-NVS node must be
// the relay GPU.
struct scclTopoNode* node;
int type = NVS;
for(int i = 0; i < path->count && type == NVS; i++) {
node = path->list[i]->remNode;
type = node->type;
}
if(type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return scclInternalError;
}
*intermediateRank = node->gpu.rank;
} else {
*intermediateRank = rank;
}
return scclSuccess;
}
SCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
// remote proxies without risking deadlocks
int scclPxnDisable(struct scclComm* comm) {
static int pxnDisable = -1;
if(pxnDisable == -1) {
if(comm && scclNetVersion(comm) == 4) {
INFO(SCCL_INIT, "PXN Disabled as plugin is v4");
pxnDisable = 1;
} else {
pxnDisable = scclParamPxnDisable();
}
}
return pxnDisable;
}
// Collect the set of ranks that act as PXN relays (proxies) for our sends.
// '*intermediateRanks' is allocated here (caller frees); '*nranks' receives
// the number of distinct relay ranks (0 with a NULL array when there is no
// network or no relays).
scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks) {
struct scclTopoSystem* system = comm->topo;
*nranks = 0;
*intermediateRanks = NULL;
if(system->nodes[NET].count == 0)
return scclSuccess;
int nr = 0;
int* ranks = NULL;
for(int rank = 0; rank < comm->nRanks; rank++) {
int netDev, proxyRank;
SCCLCHECK(scclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
// Not a relay situation: we proxy for ourselves.
if(proxyRank == comm->rank)
continue;
// PXN is only useful when GDR works towards that NIC.
int useGdr;
SCCLCHECK(scclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
if(useGdr == 0)
continue;
// Deduplicate proxy ranks before appending.
int found = 0;
for(int r = 0; r < nr; r++) {
if(ranks[r] == proxyRank)
found = 1;
}
if(!found) {
SCCLCHECK(scclRealloc(&ranks, nr, nr + 1));
ranks[nr++] = proxyRank;
}
}
*nranks = nr;
*intermediateRanks = ranks;
return scclSuccess;
}
// When every GPU has a NIC whose bus id differs from the GPU's id by exactly
// 'distance', force all such GPU->NIC paths to PATH_PXB and return true;
// otherwise leave the paths untouched and return false.
static bool rcclPathOverride(struct scclTopoSystem* system, uint64_t distance) {
    auto matches = [&](int g, int j) -> bool {
        int64_t gpuId = system->nodes[GPU].nodes[g].id;
        int64_t netId = system->nodes[NET].nodes[j].net.busId;
        return (uint64_t)(netId - gpuId) == distance || (uint64_t)(gpuId - netId) == distance;
    };
    // Pass 1: verify every GPU has at least one NIC at the requested offset.
    bool allMatch = true;
    for(int g = 0; g < system->nodes[GPU].count && allMatch; g++) {
        bool found = false;
        for(int j = 0; j < system->nodes[NET].count && !found; j++)
            found = matches(g, j);
        allMatch = found;
    }
    if(!allMatch)
        return false;
    // Pass 2: apply the PXB override on every matching GPU/NIC pair.
    for(int g = 0; g < system->nodes[GPU].count; g++)
        for(int j = 0; j < system->nodes[NET].count; j++)
            if(matches(g, j))
                system->nodes[GPU].nodes[g].paths[NET][j].type = PATH_PXB;
    return true;
}
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);
// Trim the topology down to what this communicator actually uses:
//  - remove GPUs outside our P2P/SHM-reachable connectivity domain,
//  - zero out and remove slower duplicate ports of multi-port NICs,
//  - remove all NICs when the job is single-node and NET is not needed.
// Also sets system->type flags (XGMI_ALL / GDR_ALL / FORCE_INTRA) and
// comm->localRanks.
scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm) {
int* domains;
int64_t* ids;
SCCLCHECK(scclCalloc(&domains, system->nodes[GPU].count));
SCCLCHECK(scclCalloc(&ids, system->nodes[GPU].count));
// Union GPUs into connectivity domains: two GPUs share a domain when their
// path is better than PATH_NET; domains[g] is the smallest index in the domain.
int myDomain = 0;
for(int g = 0; g < system->nodes[GPU].count; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
domains[g] = g;
ids[g] = gpu->id;
for(int p = 0; p < g; p++) {
if(gpu->paths[GPU][p].type < PATH_NET) {
domains[g] = std::min(domains[g], domains[p]);
}
}
if(gpu->gpu.rank == comm->rank)
myDomain = domains[g];
}
// Remove every GPU that is not in our own domain.
int ngpus = system->nodes[GPU].count;
for(int i = 0; i < ngpus; i++) {
if(domains[i] == myDomain)
continue;
struct scclTopoNode* gpu = NULL;
int g;
for(g = 0; g < system->nodes[GPU].count /* This one varies over the loops */; g++) {
gpu = system->nodes[GPU].nodes + g;
if(gpu->id == ids[i])
break;
else
gpu = NULL;
}
if(gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return scclInternalError;
}
SCCLCHECK(scclTopoRemoveNode(system, GPU, g));
}
// trim low speed port on same NIC
for(int i = 0; i < system->nodes[NET].count; i++) {
for(int j = 0; j < system->nodes[NET].count; j++) {
if(i == j)
continue;
// Same ASIC means same physical NIC: keep only the fastest port,
// mark the slower one with bw == 0 so the loop below removes it.
if(system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
if(system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw)
system->nodes[NET].nodes[j].net.bw = 0;
}
}
}
// Remove every NET node previously marked with bw == 0.
do {
int n;
for(n = 0; n < system->nodes[NET].count; n++) {
if(system->nodes[NET].nodes[n].net.bw == 0)
break;
}
if(n < system->nodes[NET].count) {
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
} else
break;
} while(system->nodes[NET].count);
int remove = 1;
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for(int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for(int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if(i == j)
continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
SCCLCHECK(scclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if(allXgmi)
system->type |= RCCL_TOPO_XGMI_ALL;
// Check that GDR works from every GPU towards its closest NIC.
for(int g = 0; g < system->nodes[GPU].count; g++) {
int net;
SCCLCHECK(scclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if(!gdr)
break;
}
// Keep the NICs when GDR works everywhere and the node is not fully XGMI.
if(gdr && !allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(SCCL_LOG_TOPO, "GDR is available on all GPUs");
}
// Special handling of gfx94x
if(rcclParamEnableIntranet() == 1 || (rcclParamEnableIntranet() == -2 && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
system->nodes[GPU].count == 8 && system->nodes[NET].count == 8)) {
remove = 0;
system->type |= RCCL_TOPO_FORCE_INTRA;
}
comm->localRanks = system->nodes[GPU].count;
// Single-node job and NICs not needed: drop them all.
if(system->nodes[GPU].count == comm->nRanks && remove) {
for(int n = system->nodes[NET].count - 1; n >= 0; n--)
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return scclSuccess;
}
// Release every path table attached to the system, then the system itself.
void scclTopoFree(struct scclTopoSystem* system) {
    for(int type = 0; type < SCCL_TOPO_NODE_TYPES; type++) {
        scclTopoRemovePathType(system, type);
    }
    free(system);
}
SCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
SCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", 4);
// Compute how many P2P channels to use between local GPU 'g' and 'peerRank'.
// *nChannels is -1 for self, an XGMI/PCIe-derived count for peers on this
// node, and SCCL_NCHANNELS_PER_NET_PEER for remote (network) peers.
static scclResult_t scclTopoGetNchannels(struct scclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
int peer;
struct scclTopoLinkList* path = NULL;
if(scclTopoRankToIndex(system, peerRank, &peer) == scclSuccess) {
// Same rank
if(g == peer) {
*nChannels = -1;
return scclSuccess;
}
// Local rank
path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
if(path->type == PATH_NVL) {
// Scale with the path bandwidth relative to a single XGMI link;
// gfx94x uses a higher base multiplier (4 instead of 2).
float nvlBw = scclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
*nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2) * std::max(1, (int)(path->bw / nvlBw));
} else {
*nChannels = 2;
}
} else {
// Remote rank, use network
*nChannels = scclParamNChannelsPerNetPeer();
}
return scclSuccess;
}
SCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 4);
SCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
// Smallest power of two >= v (returns 1 for v <= 1).
static int nextPow2(int v) {
    int p = 1;
    for(; p < v; p <<= 1) {
    }
    return p;
}
// Compute comm->p2pnChannels / p2pnChannelsPerPeer and fill comm->p2pChannels
// with a bit-mirrored channel permutation that spreads P2P traffic across the
// channel space. Honors SCCL_MIN/MAX_P2P_NCHANNELS and SCCL_NCHANNELS_PER_PEER,
// with higher defaults (32) when all GPU pairs are NVLink/XGMI connected.
scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
int MinP2pNchannels = (int)scclParamMinP2pNChannels();
int MaxP2pNchannels = (int)scclParamMaxP2pNChannels();
int NchannelsPerPeer = (int)scclParamNChannelsPerPeer();
// All-NVLink systems default to 32 channels unless explicitly overridden.
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_P2P_NCHANNELS") == NULL)
MinP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_P2P_NCHANNELS") == NULL)
MaxP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_NCHANNELS_PER_PEER") == NULL)
NchannelsPerPeer = 32;
int scclMinP2pNchannels = MinP2pNchannels;
// Non-owners of the shared resources are additionally capped by the owner's
// P2P channel count.
if(comm->sharedRes->owner != comm) {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, scclMinP2pNchannels), comm->sharedRes->tpP2pNChannels);
} else {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::max(comm->p2pnChannels, scclMinP2pNchannels);
}
int minChannels = comm->p2pnChannels;
// We need to loop through all local GPUs to have a global picture
for(int g = 0; g < comm->topo->nodes[GPU].count; g++) {
for(int r = 0; r < comm->nRanks; r++) {
int nChannels;
SCCLCHECK(scclTopoGetNchannels(comm->topo, g, r, &nChannels));
if(nChannels >= 0)
minChannels = std::min(minChannels, nChannels);
}
}
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));
// Round to next pow2 nChannelsPerPeer and nChannels
if(getNumaMaxGpus() == 1 && !scclTopoPathAllNVLink(comm->topo)) {
comm->p2pnChannelsPerPeer = nextPow2(comm->p2pnChannels);
} else {
// NchannelsPerPeer == -2 means "auto": derive from the per-peer minimum.
comm->p2pnChannelsPerPeer = (NchannelsPerPeer == -2 ? nextPow2(minChannels) : NchannelsPerPeer);
}
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
// Init channels that weren't used so far
for(int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++)
SCCLCHECK(initChannel(comm, c));
// We want to spread channels used when there aren't many and progressively
// fill the whole space of nChannels. To do so we mirror the bits in the
// nChannels space.
for(int c = 0; c < comm->p2pnChannels; c++) {
int mirror = 0;
for(int b = 1, mb = (comm->p2pnChannels >> 1); b < comm->p2pnChannels; b <<= 1, mb >>= 1)
if(c & b)
mirror |= mb;
comm->p2pChannels[c] = mirror;
}
return scclSuccess;
}
// Collect the ranks of all GPUs reachable from 'rank' over a PATH_NVB path.
// '*ranks' is allocated here (sized for all GPUs) and owned by the caller;
// '*nranks' receives the number of valid entries.
scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks) {
    int ngpus = system->nodes[GPU].count;
    SCCLCHECK(scclCalloc(ranks, ngpus));
    int count = 0;
    for(int g = 0; g < ngpus; g++) {
        struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
        // Only scan the path table of our own GPU.
        if(gpu->gpu.rank != rank)
            continue;
        for(int p = 0; p < ngpus; p++)
            if(gpu->paths[GPU][p].type == PATH_NVB)
                (*ranks)[count++] = system->nodes[GPU].nodes[p].gpu.rank;
    }
    *nranks = count;
    return scclSuccess;
}
int scclTopoPathAllNVLink(struct scclTopoSystem* system) {
int minPath = PATH_DIS;
for(int i = 0; i < system->nodes[GPU].count; i++) {
struct scclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
for(int j = 0; j < system->nodes[GPU].count; j++) {
if(i == j)
continue;
minPath = std::min(minPath, paths[j].type);
}
}
return minPath >= PATH_PIX ? 0 : 1;
}
} // namespace graph
// Dump the computed path tables of every GPU node, then every NIC node.
scclResult_t scclTopoPrintPaths(struct scclTopoSystem* system) {
    for(int g = 0; g < system->nodes[GPU].count; g++)
        graph::printNodePaths(system, system->nodes[GPU].nodes + g);
    for(int n = 0; n < system->nodes[NET].count; n++)
        graph::printNodePaths(system, system->nodes[NET].nodes + n);
    return scclSuccess;
}
// Cached SCCL_P2P_LEVEL setting; -1 = not resolved yet (see scclGetLevel).
int scclTopoUserP2pLevel = -1;
// Decide whether P2P can be used between GPUs 'id1' and 'id2'.
// *p2p is set to 1 when the path type is within the allowed p2pLevel;
// *read (optional) is set when P2P read is beneficial; *intermediateRank
// (optional) receives the relay GPU rank for 2-hop routed paths, else -1.
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank) {
*p2p = 0;
if(read)
*read = 0;
if(intermediateRank)
*intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
SCCLCHECK(scclTopoIdToIndex(system, GPU, id1, &g1));
struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
if(scclTopoIdToIndex(system, GPU, id2, &g2) == scclInternalError) {
// GPU not found, we can't use p2p.
return scclSuccess;
}
int intermediateIndex = -1;
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct scclTopoLinkList* path = gpu1->paths[GPU] + g2;
if(path->count == 2) {
struct scclTopoNode* intermediateNode = path->list[0]->remNode;
if(intermediateNode->type == GPU) {
intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
if(intermediateRank)
*intermediateRank = intermediateNode->gpu.rank;
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
// User override
if(scclTopoUserP2pLevel == -1)
SCCLCHECK(scclGetLevel(&scclTopoUserP2pLevel, "SCCL_P2P_DISABLE", "SCCL_P2P_LEVEL"));
// -2 means "no environment override": fall through to CPU-arch heuristics.
if(scclTopoUserP2pLevel != -2) {
p2pLevel = scclTopoUserP2pLevel;
goto compare;
}
// Don't use P2P through ARM CPUs
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_ARM)
p2pLevel = PATH_PXB;
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
p2pLevel = PATH_PXB;
}
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
if(path->type <= p2pLevel)
*p2p = 1;
if(path->type == PATH_NVL) {
struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
// Enable P2P Read for Ampere/NVLink only
if(read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80))
*read = 1;
}
return scclSuccess;
}
// (Re)compute all path tables of the system: BFS from every CPU, GPU, NIC
// and NVSwitch, then post-process for P2P/GDR restrictions, PXN relays and
// platform-specific overrides. 'comm' may be NULL (topology-only mode), in
// which case peer reachability is not checked.
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm) {
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
graph::scclTopoRemovePathType(system, t);
// Set direct paths to CPUs. We need them in many cases.
for(int c = 0; c < system->nodes[CPU].count; c++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[CPU].nodes + c, system));
}
// Set direct paths to GPUs.
for(int g = 0; g < system->nodes[GPU].count; g++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[GPU].nodes + g, system));
}
// Set direct paths to NICs.
for(int n = 0; n < system->nodes[NET].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NET].nodes + n, system));
}
// Set direct paths to NVSwitches.
for(int n = 0; n < system->nodes[NVS].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NVS].nodes + n, system));
}
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
for(int g = 0; g < system->nodes[GPU].count; g++) {
for(int p = 0; p < system->nodes[GPU].count; p++) {
int p2p;
SCCLCHECK(scclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
if(p2p == 0) {
// Divert all traffic through the CPU
int cpu;
SCCLCHECK(getLocalCpu(system, g, &cpu));
SCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
if(comm == NULL)
continue;
// Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
struct scclPeerInfo* dstInfo = comm->peerInfo + system->nodes[GPU].nodes[g].gpu.rank;
for(int p = 0; p < system->nodes[GPU].count; p++) {
if(p == g)
continue;
struct scclPeerInfo* srcInfo = comm->peerInfo + system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
SCCLCHECK(scclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if(p2p == 0) {
int shm;
SCCLCHECK(scclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
if(shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
}
}
}
}
// Special handling of gfx94x
// (when TOPO_EXPL is not defined, only applied under a Hyper-V UEFI BIOS)
#if !defined(TOPO_EXPL)
char strValue[1024];
SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
if(strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
// Force PXB GPU->NIC paths on matching Intel + gfx94x platforms, trying
// the two known GPU/NIC bus-id offsets in turn.
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
(system->nodes[GPU].count != system->nRanks))) {
if(!rcclPathOverride(system, 0x100000))
rcclPathOverride(system, 0x1000);
}
#if !defined(TOPO_EXPL)
}
#endif
// Update paths for NICs (no GPU Direct, PXN, ...)
for(int n = 0; n < system->nodes[NET].count; n++) {
struct scclTopoNode* netNode = system->nodes[NET].nodes + n;
for(int g = 0; g < system->nodes[GPU].count; g++) {
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
if(scclPxnDisable(comm) != 1) {
int localGpuIndex;
SCCLCHECK(scclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
if(localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct scclTopoNode* peerNode = system->nodes[GPU].nodes + localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if(peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
SCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
// Update path when we dont want to / can't use GPU Direct RDMA.
int gdr;
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
if(gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
SCCLCHECK(getLocalCpu(system, g, &localCpu));
SCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
SCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
return scclSuccess;
}
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN + 5 * MAXWIDTH)
// Log one line containing up to MAXWIDTH integer values preceded by 'prefix'.
// NOTE(review): assumes strlen(prefix) <= PREFIXLEN; a longer prefix would
// make the sprintf below start past the strncpy'd region — confirm callers.
void dumpLine(int* values, int nranks, const char* prefix) {
int prefixlen = strlen(prefix);
char line[STRLENGTH + 1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
// Each value takes 4 characters (" %3d"); sprintf re-terminates the line.
for(int i = 0; i < nranks && i < MAXWIDTH; i++)
sprintf(line + prefixlen + 4 * i, " %3d", values[i]);
INFO(SCCL_INIT, "%s", line);
}
// Build 'nrings' rings of 'nranks' ranks each by following the 'next'
// successor table starting at our own 'rank', then validate that every ring
// is a proper cycle containing all ranks. Rank 0 logs each ring. 'prev' is
// only used by the (commented-out) debug dump and kept for the caller ABI.
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
    for(int r = 0; r < nrings; r++) {
        int* ring = rings + r * nranks;
        /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
        dumpLine(prev+r*nranks, nranks, prefix);
        sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
        dumpLine(next+r*nranks, nranks, prefix);*/
        // Walk the successor chain starting at our rank.
        int cur = rank;
        for(int i = 0; i < nranks; i++) {
            ring[i] = cur;
            cur = next[r * nranks + cur];
        }
        char prefix[40];
        sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
        if(rank == 0)
            dumpLine(ring, nranks, prefix);
        // After nranks steps we must be back where we started.
        if(cur != rank) {
            WARN("Error : ring %d does not loop back to start (%d != %d)", r, cur, rank);
            return scclInternalError;
        }
        // Check that all ranks are there
        for(int target = 0; target < nranks; target++) {
            bool present = false;
            for(int j = 0; j < nranks; j++) {
                if(ring[j] == target) {
                    present = true;
                    break;
                }
            }
            if(!present) {
                WARN("Error : ring %d does not contain rank %d", r, target);
                return scclInternalError;
            }
        }
    }
    return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include <algorithm>
#include <string.h>
#include "rome_models.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
// Static description of one known "Rome" machine topology. Detection code
// matches the discovered system against these tables (see the rome_model_*
// initializers below for the encoding conventions).
struct scclRomeModel {
int nGpus; // number of GPUs in this model
int nCpus; // number of CPU (NUMA) nodes
int nNics; // number of NICs (0 when the model has none)
int nLinks; // inter-GPU links per GPU
int64_t gpuIds[SCCL_TOPO_MAX_NODES]; // GPU PCI bus IDs, in model order
int64_t nicIds[SCCL_TOPO_MAX_NODES]; // NIC PCI bus IDs, in model order
int64_t gpuNuma[SCCL_TOPO_MAX_NODES]; // NUMA node index for each GPU
int64_t nicNuma[SCCL_TOPO_MAX_NODES]; // NUMA node index for each NIC
uint8_t connMatrix[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES]; // nGpus x nGpus GPU link matrix, row-major; nonzero values look like link counts (0/1/2/4 in the data) — TODO confirm
uint8_t gdrLevel[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES]; // PATH_* level for each GPU/NIC pair; initializers contain nGpus x nNics entries
const char* pattern; // one <gpuCount><nicCount> digit pair per NUMA node (tallies match gpuNuma/nicNuma in the initializers below)
const char* ringBase; // '|'-separated ring orderings of GPU indices; "N<i>" tokens name NIC endpoints
const char* options; // comma-separated key=value tuning flags (e.g. "tuning=1,treeDefined=1"); empty string if none
const char* treeBase; // '|'-separated tree layouts; set only by models whose options include treeDefined=1
};
// 8 GPUs over 4 NUMA nodes (pattern 1/3/2/2), one NIC on NUMA 2 (GDR at
// PATH_PHB from GPUs 3 and 6, PATH_SYS elsewhere), 2 links per GPU, two rings.
static struct scclRomeModel rome_model_22 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 2,
.gpuIds =
{
0x3000,
0x43000,
0x26000,
0xc3000,
0x83000,
0x23000,
0xc6000,
0xa3000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
0,
1,
2,
3,
1,
2,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
.options = "",
};
static struct scclRomeModel rome_model_25 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
.options = "",
};
static struct scclRomeModel rome_model_27 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_29 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
.options = "",
};
static struct scclRomeModel rome_model_31 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
.options = "",
};
static struct scclRomeModel rome_model_33 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
.options = "",
};
// 8 GPUs spread over 8 NUMA nodes (nodes 1,2,3,4,5,7 populated), no NICs,
// 2 links per GPU, two rings.
static struct scclRomeModel rome_model_30 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5",
.options = "",
};
static struct scclRomeModel rome_model_32 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_24 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6",
.options = "",
};
static struct scclRomeModel rome_model_26 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_23 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1",
.options = "",
};
static struct scclRomeModel rome_model_38 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6",
.options = "",
};
static struct scclRomeModel rome_model_28 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
.options = "",
};
static struct scclRomeModel rome_model_40 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
.options = "",
};
static struct scclRomeModel rome_model_42 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
.options = "",
};
static struct scclRomeModel rome_model_44 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds =
{
0xc4000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "20202120",
.ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
.options = "",
};
static struct scclRomeModel rome_model_45 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "",
};
static struct scclRomeModel rome_model_46 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_48 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
.options = "",
};
static struct scclRomeModel rome_model_49 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
0,
1,
2,
3,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
// Single-NUMA box: all 8 GPUs on node 0, no NICs, 3 links per GPU, four rings.
static struct scclRomeModel rome_model_52 = {
.nGpus = 8,
.nCpus = 1,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc5000,
0xc9000,
0xcd000,
0xd1000,
0xd5000,
0xd9000,
0xdd000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "80",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_53 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
1,
1,
3,
3,
5,
5,
7,
7,
},
.nicNuma =
{
1,
3,
5,
7,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
static struct scclRomeModel rome_model_43 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 "
"7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "treeDefined=1",
.treeBase =
"(2(5(6(7(4))))(3(0(1))))|(2(5(7(6(4))))(0(1(3))))|(2(5(7(4(6))))(1(3(0))))|(6(1(0(2(3))))(7(4(5))))|(6(1(2(0(3))))(4(5(7))))|(6(1(0(3(2))))(5(7(4))))|"
"(1(6(7(5(4))))(2(3(0))))|(1(6(4(7(5))))(3(2(0))))|(1(6(5(4(7))))(3(0(2))))|(5(2(3(1(0))))(4(6(7))))|(5(2(0(3(1))))(6(4(7))))|(5(2(1(0(3))))(4(7(6))))",
};
static struct scclRomeModel rome_model_55 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x100000,
0x200000,
0x300000,
0x400000,
0x500000,
0x600000,
0x700000,
0x800000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|2 3 0 1 6 7 4 5|5 4 7 6 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_56 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 0,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel = {},
.pattern = "40404040",
.ringBase = "0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4|0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4|0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1|4 5 13 12 8 9 11 10 14 15 7 "
"6 2 3 1 0|4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0|1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0",
.options = "pivotA2AEnabled=1,pivotA2ANumBiRings=3,tuning=1,mscclEnabled=1,treeDefined=1",
.treeBase = "(0(1(3(2(6(7(15(14(10))))))))(4(5(13(12(8(9(11))))))))|(2(3(7(6(13(12(8(9(10))))))))(1(0(4(5(14(15(11))))))))|(14(15(11(10(8(9(13(12(4))))))))"
"(6(7(3(2(0(1(5))))))))|(10(11(9(8(12(13(5(4(0))))))))(14(15(7(6(2(3(1))))))))|(10(11(15(14(5(4(0(1(2))))))))(9(8(12(13(6(7(3))))))))|(4(5(1(0("
"2(3(7(6(14))))))))(12(13(9(8(10(11(15))))))))|(6(7(15(14(10(11(9(8(12))))))))(2(3(1(0(4(5(13))))))))|(13(12(8(9(10(11(15(14(5))))))))(6(7(3(2("
"1(0(4))))))))|(8(9(13(12(4(5(1(0(2))))))))(10(11(15(14(6(7(3))))))))|(12(13(5(4(0(1(3(2(6))))))))(8(9(11(10(14(15(7))))))))|(5(4(0(1(2(3(7(6("
"13))))))))(14(15(11(10(9(8(12))))))))|(2(3(7(6(14(15(11(10(8))))))))(0(1(5(4(12(13(9))))))))",
};
// 8 GPUs over 3 NUMA nodes (4 on node 0, 2 on node 1, 2 on node 3), no NICs,
// 3 links per GPU, four rings.
static struct scclRomeModel rome_model_58 = {
.nGpus = 8,
.nCpus = 3,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "402020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_59 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
static struct scclRomeModel rome_model_62 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_63 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_65 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
// 8 GPUs on 2 NUMA nodes (4 on node 1, 4 on node 3), no NICs, 3 links per
// GPU; connMatrix carries mixed link counts (1/2/4). Options disable NUMA
// matching and select tuning profile 2.
static struct scclRomeModel rome_model_66 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds = {},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 6 7 5 4 2 3 1|1 3 2 4 5 7 6 0|0 1 7 6 2 3 5 4|4 5 3 2 6 7 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_67 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa1000,
0xa2000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
3,
3,
},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "4242",
.ringBase = "N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N1 2 3 5 4 0 1 7 6 N3|N2 4 5 3 2 6 7 1 0 N0|N1 3 2 4 5 7 6 0 1 N0|N0 1 0 6 7 5 4 2 3 N1|N0 0 1 7 "
"6 2 3 5 4 N2|N3 6 7 1 0 4 5 3 2 N1",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_68 = {
.nGpus = 16,
.nCpus = 1,
.nNics = 16,
.nLinks = 3,
.gpuIds =
{
0xcf000,
0xd4000,
0xd5000,
0xd6000,
0xd0000,
0xd1000,
0xd2000,
0xd3000,
0xf0000,
0xf1000,
0xf2000,
0xf3000,
0xf4000,
0xf5000,
0xf6000,
0xf7000,
},
.nicIds =
{
0xcd000,
0xc8000,
0xc9000,
0xcb000,
0xcc000,
0xce000,
0xc7000,
0xca000,
0xe8000,
0xe9000,
0xea000,
0xeb000,
0xec000,
0xed000,
0xee000,
0xef000,
},
.gpuNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.nicNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX,
},
.pattern = "@@",
.ringBase = "N0 0 1 2 3 N3 N4 4 5 6 7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 "
"N2 N5 5 7 4 6 N6 N9 9 11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1|N0 0 1 2 3 N3 N4 4 5 6 "
"7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 N2 N5 5 7 4 6 N6 N9 9 "
"11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_71 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 2 3 7 6|6 7 3 2 4 5 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_72 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa0000,
0xa1000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
1,
1,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "4242",
.ringBase = "N0 0 1 3 2 4 5 7 6 N3|N1 2 3 1 0 6 7 5 4 N2|N3 7 6 0 1 5 4 2 3 N1|N0 1 0 6 7 3 2 4 5 N2|N2 4 5 7 6 0 1 3 2 N1|N3 6 7 5 4 2 3 1 0 N0|N2 5 4 2 "
"3 7 6 0 1 N0|N1 3 2 4 5 1 0 6 7 N3",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_73 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_74 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_76 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x26000,
0x2d000,
0x5000,
0xc000,
0xab000,
0xb4000,
0x8b000,
0x94000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 3 2 4 5 7 6 N6|N2 2 3 1 0 6 7 5 4 N4|N5 5 4 2 3 7 6 0 1 N1|N1 1 0 6 7 3 2 4 5 N5|N4 4 5 7 6 0 1 3 2 N2|N2 2 3 1 0 6 7 5 4 N4|N0 0 1 5 "
"4 2 3 7 6 N6|N3 3 2 4 5 1 0 6 7 N7|N4 4 5 7 6 0 1 3 2 N2|N6 6 7 5 4 2 3 1 0 N0|N7 7 6 0 1 5 4 2 3 N3|N6 6 7 3 2 4 5 1 0 N0|N3 3 2 0 1 5 4 6 7 "
"N7|N1 1 0 2 3 7 6 4 5 N5|N5 5 4 6 7 3 2 0 1 N1|N7 7 6 4 5 1 0 2 3 N3",
.options = "disableNumaMatching=1,tuning=3",
};
static struct scclRomeModel rome_model_79 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 7,
.gpuIds =
{
0x1d000,
0x2e000,
0x3f000,
0x61000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 "
"0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
.options = "noCpuCheck=1,mscclEnabled=1",
};
static struct scclRomeModel rome_model_80 = {
.nGpus = 4,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x82000,
0xc2000,
0x2000,
0x42000,
},
.nicIds =
{
0x81000,
0xc1000,
0x1000,
0x41000,
},
.gpuNuma =
{
2,
3,
0,
1,
},
.nicNuma =
{
2,
3,
0,
1,
},
.connMatrix =
{
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11111111",
.ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 "
"N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_81 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 7,
.gpuIds =
{
0xc000,
0x22000,
0x38000,
0x5c000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds =
{
0x7000,
0x1d000,
0x33000,
0x57000,
0x9a000,
0xaa000,
0xba000,
0xda000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 "
"4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5|N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 "
"N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5",
.options = "noCpuCheck=1,mscclEnabled=1",
};
// All known reference topologies. parseRome4P2H() scans this table in order and
// uses the first model whose GPU/NIC layout can be permuted onto the live system.
static struct scclRomeModel romeTopoModels[] = {
    rome_model_22, rome_model_25, rome_model_27, rome_model_29, rome_model_31, rome_model_33, rome_model_30, rome_model_32, rome_model_24,
    rome_model_26, rome_model_23, rome_model_38, rome_model_28, rome_model_40, rome_model_42, rome_model_44, rome_model_45, rome_model_46,
    rome_model_48, rome_model_49, rome_model_52, rome_model_53, rome_model_43, rome_model_55, rome_model_56, rome_model_58, rome_model_59,
    rome_model_62, rome_model_63, rome_model_65, rome_model_66, rome_model_67, rome_model_68, rome_model_71, rome_model_72, rome_model_73,
    rome_model_74, rome_model_76, rome_model_79, rome_model_80, rome_model_81,
};
/* Parse user defined rings. Format is like :
* "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
* Network interfaces can be optionally specified by N prefix.
* Rings with a non-matching number of gpus are ignored so we can provide
* rings for multiple cases.
*/
// Parse a user-defined ring string (see format comment above) into channels on
// 'graph'.
// - str:     '|'-separated channels; each lists GPU device numbers, optionally
//            bracketed by 'N<idx>' network interfaces.
// - gpu_map: optional remapping applied to parsed GPU numbers before rank lookup.
// - net_map: optional remapping applied to parsed NET indices.
// Channels whose GPU count does not match the system, that contain out-of-range
// or duplicate GPUs, or that have an odd NET count are silently skipped so a
// single string can cover several system shapes.
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map) {
    int gpus[SCCL_TOPO_MAX_NODES];
    int nChannels = 0;
    int gpu = 0;
    int offset = 0;
    int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET, 3: inside NET
    int nets[SCCL_TOPO_MAX_NODES * 2];
    int net_offset = 0, net_count = 0;
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    do {
        if(str[offset] == 'N') {
            if(status == 0) {
                status = 2;
            }
        } else {
            int digit = str[offset] - '0';
            if(digit >= 0 && digit <= 9) {
                switch(status) {
                    case 0:
                        // Starting a new GPU number. Guard the write: 'gpu' may already
                        // equal SCCL_TOPO_MAX_NODES after the previous number was closed
                        // (the post-increment check below uses '>', so without this guard
                        // gpus[SCCL_TOPO_MAX_NODES] could be written out of bounds).
                        if(gpu >= SCCL_TOPO_MAX_NODES)
                            goto end;
                        gpus[gpu] = digit;
                        status = 1;
                        break;
                    case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
                    case 2:
                        // NET indices are stored biased by 'N' so they remain
                        // distinguishable from GPU numbers. Same bounds guard as above,
                        // against the capacity of nets[].
                        if(net_offset >= SCCL_TOPO_MAX_NODES * 2)
                            goto end;
                        nets[net_offset] = digit + 'N';
                        status = 3;
                        break;
                    case 3: nets[net_offset] = (nets[net_offset] - 'N') * 10 + digit + 'N'; break;
                }
            } else {
                if(status == 1) {
                    gpu++;
                    net_offset = 2 * gpu - 1; // a trailing NET for gpu k-1 lands at slot 2k-1
                    if(gpu > SCCL_TOPO_MAX_NODES)
                        goto end;
                } else if(status == 2 || status == 3) {
                    net_offset++;
                    net_count++;
                    if(net_offset > ngpus * 2)
                        goto end;
                }
                status = 0;
                if(str[offset] == '|' || str[offset] == '\0') {
                    // Ignore if ngpus doesn't match
                    if(gpu != ngpus)
                        goto newchannel;
                    // Ignore if net_count is not 0 or odd number
                    if(net_count && net_count % 2)
                        goto newchannel;
                    for(int r = 0; r < ngpus; r++) {
                        int g = gpus[r];
                        // Ignore if gpus are out of bounds
                        if(g < 0 || g >= ngpus)
                            goto newchannel;
                        // Ignore if gpus are duplicate
                        for(int i = 0; i < r; i++)
                            if(gpus[i] == g)
                                goto newchannel;
                        // remap if needed
                        if(gpu_map)
                            g = gpu_map[g];
                        // Translate gpu numbers into ranks
                        int j = 0;
                        for(j = 0; j < ngpus; j++)
                            if(g == system->nodes[GPU].nodes[j].gpu.dev)
                                break;
                        if(j < ngpus)
                            graph->intra[nChannels * ngpus + r] = system->nodes[GPU].nodes[j].gpu.rank;
                        else
                            return scclInternalError;
                    }
                    if(net_count) {
                        // Remap explicit NET indices; out-of-range entries are left as-is.
                        for(int i = 0; net_map && i < ngpus * 2; i++) {
                            if(nets[i] - 'N' < 0 || nets[i] - 'N' >= nnets)
                                continue;
                            nets[i] = net_map[nets[i] - 'N'] + 'N';
                        }
                        memcpy(&graph->intraNets[ngpus * nChannels * 2], nets, ngpus * 2 * sizeof(int));
                        graph->nIntraChannels++;
                        if(nets[0] - 'N' >= nnets || nets[ngpus * 2 - 1] - 'N' >= nnets)
                            goto newchannel;
                        graph->inter[nChannels * 2] = nets[0] - 'N';
                        graph->inter[nChannels * 2 + 1] = nets[ngpus * 2 - 1] - 'N';
                    } else if(nnets) {
                        // No explicit NETs in the string: round-robin over the system's NICs.
                        graph->inter[nChannels * 2] = system->nodes[NET].nodes[nChannels % nnets].id;
                        graph->inter[nChannels * 2 + 1] = system->nodes[NET].nodes[(nChannels + 1) % nnets].id;
                    }
                    nChannels++;
                newchannel:
                    gpu = 0;
                    net_offset = 0;
                    net_count = 0;
                }
            }
        }
    } while(str[offset++] != 0);
end:
    graph->nChannels = nChannels;
    // Guard against division by zero when no channel in the string matched this system.
    graph->bwIntra = graph->bwInter = nChannels ? system->totalBw / nChannels : system->totalBw;
    if(graph->id == 1) {
        for(int i = 0; i < graph->nChannels; i++) {
            int net;
            // NOTE(review): return value ignored here in the original; 'net' is assumed
            // to be set on success — confirm scclTopoGetLocalNet cannot fail silently.
            scclTopoGetLocalNet(system, graph->intra[i * ngpus + 1], i, &net);
            graph->inter[i * 2 + 1] = net;
        }
    }
#if 0
  for (int i=0; i<graph->nChannels; i++) {
    printf("%d: ", i);
    printf ("NET/%d ", graph->inter[i*2]);
    for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
    printf ("NET/%d ", graph->inter[i*2+1]);
    printf("\n");
  }
#endif
    return scclSuccess;
}
/* Parse user defined treeBase for complicated trees. Format is like :
* "(4(2(3)(1))(6(5)))"
*
 * Trees with a non-matching number of gpus are ignored so we can provide
 * tree definitions for multiple cases.
*/
// Parse a parenthesized tree description like "(4(2(3)(1))(6(5)))" into
// graph->treeBase[channel][]. Parentheses are copied through verbatim; each GPU
// number is remapped via gpu_map (when given), validated against the system's
// GPU device list, and re-emitted as text. Channels are separated by '|'.
// An empty input string just produces an empty first tree.
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map) {
    int gpus[SCCL_TOPO_MAX_NODES]; // transcribe/change according to gpu_map
    int nChannels = 0;             // NOTE(review): unused — 'x' below tracks the channel count
    int gpu = 0;
    int offset = 0;
    int start_offset = offset; // start of the current '|'-separated channel
    if(str[0] == 0) {
        graph->treeBase[0][0] = 0;
        return scclSuccess;
    }
    int status = 0; // 0 : between numbers, 1 : inside number
    int ngpus = system->nodes[GPU].count;
    int x = 0, y = 0; // x: output channel index (this outer 'y' is shadowed below)
    do {
        int digit = str[offset] - '0';
        if(digit >= 0 && digit <= 9) {
            switch(status) {
                case 0:
                    gpus[gpu] = digit;
                    status = 1;
                    break;
                case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
            }
        } else {
            if(status == 1) {
                gpu++;
            }
            status = 0;
            if(str[offset] == '|' || str[offset] == 0) {
                // End of one channel: re-scan it from start_offset and emit it into
                // graph->treeBase[x], substituting the remapped GPU numbers.
                int r = 0, y = 0;
                while(start_offset < offset) {
                    // for (int r=0; r<gpu; r++) {
                    if(str[start_offset] == '(' || str[start_offset] == ')') {
                        graph->treeBase[x][y] = str[start_offset];
                        y++;
                        start_offset++;
                    } else {
                        int g = gpus[r];
                        // remap if needed
                        if(gpu_map)
                            g = gpu_map[g];
                        r++;
                        int j = 0;
                        // Translate gpu numbers into ranks
                        for(j = 0; j < ngpus; j++)
                            if(g == system->nodes[GPU].nodes[j].gpu.dev)
                                break;
                        if(j < ngpus) {
                            // NOTE(review): 'j' (the rank index) is only used to validate that
                            // device 'g' exists; the remapped device number 'g' — not the rank —
                            // is what gets written into treeBase. Confirm that is intended.
                            while(str[start_offset] != '(' && str[start_offset] != ')')
                                start_offset++;
                            char number_str[10];
                            sprintf(number_str, "%d", g);
                            int k = 0;
                            while(number_str[k] != 0) {
                                graph->treeBase[x][y] = number_str[k];
                                y++;
                                k++;
                            }
                        } else
                            return scclInternalError;
                    }
                }
                graph->treeBase[x][y] = 0; // terminate this channel's string
                x++;
                gpu = 0;
                start_offset = offset + 1; // next channel starts after the '|'
            }
        }
    } while(str[offset++] != 0);
    graph->treeBase[x][0] = 0; // empty string marks the end of the channel list
    return scclSuccess;
}
#define MAX_OPT_TOKENS 10
extern const char* topoPathTypeStr[];
// Apply the "key=value[,key=value...]" pairs of a model's options string to the
// corresponding tunable fields on 'system'. Unknown keys are ignored; an invalid
// netGdrLevel value falls back to -2 with a warning. An empty string is a no-op.
static void parseOptions(struct scclTopoSystem* system, const char* options) {
    if(strcmp(options, "")) {
        char* str_temp = (char*)malloc(strlen(options) + 1);
        if(str_temp == NULL)
            return; // allocation failure: leave the system's defaults untouched
        strcpy(str_temp, options);
        char* tokens[MAX_OPT_TOKENS];
        int numTokens = 0;
        char* state;
        tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
        numTokens++;
        while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
            tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
        for(int i = 0; i < numTokens / 2; i++) {
            // A trailing key with no value would otherwise feed NULL to strcmp/atol below.
            if(tokens[i * 2] == NULL || tokens[i * 2 + 1] == NULL)
                break;
            if(strcmp(tokens[i * 2], "netGdrLevel") == 0) {
                // Translate the symbolic path type (e.g. "PXB") into its index.
                int j;
                for(j = 0; j <= PATH_SYS; j++) {
                    if(strcmp(tokens[i * 2 + 1], topoPathTypeStr[j]) == 0)
                        break;
                }
                if(j <= PATH_SYS)
                    system->netGdrLevel = j;
                else {
                    system->netGdrLevel = -2;
                    WARN("invalid netGdrLevel: %s", tokens[i * 2 + 1]);
                }
            } else if(strcmp(tokens[i * 2], "pivotA2AEnabled") == 0) {
                system->pivotA2AEnabled = (bool)atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "pivotA2ANumBiRings") == 0) {
                system->pivotA2ANumBiRings = atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "tuning") == 0) {
                system->tuning = atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "ll128Enabled") == 0) {
                system->ll128Enabled = (bool)atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "baseBw") == 0) {
                system->baseBw = std::stof(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "mscclEnabled") == 0) {
                system->mscclEnabled = (bool)atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "treeDefined") == 0) {
                system->treeDefined = (bool)atol(tokens[i * 2 + 1]);
            }
        }
        free(str_temp);
    }
}
#ifndef MAX_OPT_TOKENS
#define MAX_OPT_TOKENS 10
#endif
// Return the boolean value of option 'name' inside the "key=value,..." string
// 'options', or false when the option (or its value) is absent.
// Fix: the original returned from inside the loop without freeing str_temp,
// leaking the duplicated string on every hit; it could also pass a NULL value
// token to atol for a malformed trailing "key" with no "=value".
static bool checkOption(const char* options, const char* name) {
    bool result = false;
    if(strcmp(options, "")) {
        char* str_temp = (char*)malloc(strlen(options) + 1);
        if(str_temp == NULL)
            return false;
        strcpy(str_temp, options);
        char* tokens[MAX_OPT_TOKENS];
        int numTokens = 0;
        char* state;
        tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
        numTokens++;
        while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
            tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
        for(int i = 0; i < numTokens / 2; i++) {
            if(tokens[i * 2] == NULL || tokens[i * 2 + 1] == NULL)
                break;
            if(strcmp(tokens[i * 2], name) == 0) {
                result = (bool)atol(tokens[i * 2 + 1]);
                break; // was an early return that leaked str_temp
            }
        }
        free(str_temp);
    }
    return result;
}
// Detect an 8-GPU chordal-ring XGMI topology and, if found, program 'graph' with
// the canonical chordal-ring channels from 'ringBase', remapping the reference
// GPU numbers onto the actual device ordering. Returns scclSuccess without
// touching 'graph' whenever the system does not match. For multi-node runs with
// NICs, each channel is additionally rotated so the ring starts at the GPU
// closest to its NIC.
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
    static const char* ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
    int id[8], dist[8];
    int i;
    int ngpus = system->nodes[GPU].count;
    if(ngpus != 8)
        return scclSuccess;
    // validate chordal ring and calculate distance
    // NOTE(review): GPUs whose paths[GPU] is NULL are skipped here, leaving their
    // dist[i] entry uninitialized, yet the remap loop below reads every dist[i] —
    // confirm paths are always populated by the time this runs.
    for(i = 0; i < ngpus; i++) {
        struct scclTopoNode* node = system->nodes[GPU].nodes + i;
        if(node->paths[GPU] == NULL)
            continue;
        // In a chordal ring each GPU links to all peers except exactly one; after
        // subtracting its own dev and every linked peer's dev from the total,
        // 'sum' is the device id of the single GPU it is NOT directly linked to.
        int sum = ngpus * (ngpus - 1) / 2 - node->gpu.dev;
        int count = 0;
        for(int n = 0; n < ngpus; n++) {
            struct scclTopoLink* link;
            for(link = node->links; link->remNode; link++) {
                if(link->remNode->gpu.dev == n)
                    break;
            }
            if(!link->remNode)
                continue;
            if(link->type != LINK_NVL)
                continue;
            sum -= system->nodes[GPU].nodes[n].gpu.dev;
            count++;
        }
        if(count != ngpus - 2 || sum < 0 || sum > ngpus - 1) {
            // Not a chordal ring — bail out quietly.
            return scclSuccess;
        }
        dist[i] = sum;
    }
    // remap GPU ids
    for(i = 0; i < ngpus; i++)
        id[i] = i;
    for(i = 0; i < ngpus; i++) {
        if(dist[i] == ngpus - 1 - i)
            continue;
        int j, m, n, temp;
        for(j = i + 1; j < ngpus; j++)
            if(dist[j] == ngpus - 1 - i)
                break;
        m = dist[i];
        n = dist[j];
        dist[i] = n;
        dist[j] = m;
        temp = id[m];
        id[m] = id[n];
        id[n] = temp;
        temp = dist[m];
        dist[m] = dist[n];
        dist[n] = temp;
    }
    // create chordal ring based on reference and remapped ids
    system->type |= RCCL_TOPO_CR8G;
    SCCLCHECK(parseGraph(ringBase, system, graph, id, NULL));
    // Multi-node case: one channel per NIC, each rotated to start at the closest
    // GPU that no earlier NIC has claimed yet.
    if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
        int *intra, *used;
        graph->nChannels = system->nodes[NET].count;
        SCCLCHECK(scclCalloc(&intra, ngpus));
        SCCLCHECK(scclCalloc(&used, system->nodes[NET].count));
        for(int n = 0; n < system->nodes[NET].count; n++) {
            graph->inter[n * 2] = graph->inter[n * 2 + 1] = n;
            struct scclTopoNode* net = system->nodes[NET].nodes + n;
            struct scclTopoLinkList* paths = net->paths[GPU];
            // find the first unused GPU that is closest to NIC
            int f, m;
            for(f = 0; f < ngpus; f++) {
                int j = 0;
                for(j = 0; j < n; j++)
                    if(used[j] == system->nodes[GPU].nodes[f].gpu.rank)
                        break;
                if(j >= n)
                    break;
            }
            for(int i = 0; i < ngpus; i++) {
                int j = 0;
                for(j = 0; j < n; j++)
                    if(used[j] == system->nodes[GPU].nodes[i].gpu.rank)
                        break;
                if(j < n)
                    continue;
                if(paths[i].count < paths[f].count)
                    f = i;
            }
            // Rotate channel n so it starts at GPU f's rank.
            for(m = 0; m < ngpus; m++)
                if(graph->intra[n * ngpus + m] == system->nodes[GPU].nodes[f].gpu.rank)
                    break;
            used[n] = graph->intra[n * ngpus + m];
            for(int i = 0; i < ngpus; i++)
                intra[i] = graph->intra[n * ngpus + ((i + m) % ngpus)];
            for(int i = 0; i < ngpus; i++)
                graph->intra[n * ngpus + i] = intra[i];
        }
        free(used);
        free(intra);
    }
    return scclSuccess;
}
// Build a scclRomeModel snapshot of the live system so it can be matched against
// the reference models above: GPU bus ids sorted by HIP device id, NIC bus ids
// sorted by id, the nearest NUMA node for every device, the GPU-to-GPU XGMI
// connection matrix, and the NIC-to-GPU path-type ("GDR level") matrix.
// 'pattern' receives one "<#GPUs><#NICs>" digit pair per NUMA node (sorted by
// numa id) as a cheap first-pass filter; it stays "" for invalid topologies.
// When RCCL_DUMP_ROME_MODEL_FILE is set, the snapshot is also dumped as C source
// in the same shape as the rome_model_* tables in this file.
static scclResult_t parseRomeSystem(struct scclTopoSystem* system, struct scclRomeModel* romeTopo, char* pattern) {
    pattern[0] = 0; // pattern will be NULL for invalid topology
    romeTopo->nGpus = system->nodes[GPU].count;
    romeTopo->nCpus = system->nodes[CPU].count;
    romeTopo->nNics = system->nodes[NET].count;
    romeTopo->nLinks = 0;
    // Small index/key records plus captureless-lambda comparators for qsort below.
    struct scclGpuIdHIP {
        int g;   // index into system->nodes[GPU]
        int dev; // HIP device id
    };
    auto cmpIds = [](const void* g1, const void* g2) {
        struct scclGpuIdHIP* s1 = (struct scclGpuIdHIP*)g1;
        struct scclGpuIdHIP* s2 = (struct scclGpuIdHIP*)g2;
        return s1->dev - s2->dev;
    };
    struct scclCpuNuma {
        int c;         // index into system->nodes[CPU]
        uint64_t numa; // NUMA node id
    };
    auto cmpNuma = [](const void* g1, const void* g2) {
        struct scclCpuNuma* s1 = (struct scclCpuNuma*)g1;
        struct scclCpuNuma* s2 = (struct scclCpuNuma*)g2;
        return (int)(s1->numa - s2->numa);
    };
    struct scclNetId {
        int n;       // index into system->nodes[NET]
        uint64_t id; // NET node id
    };
    auto cmpNets = [](const void* g1, const void* g2) {
        struct scclNetId* s1 = (struct scclNetId*)g1;
        struct scclNetId* s2 = (struct scclNetId*)g2;
        return (int)(s1->id - s2->id);
    };
    // sort GPU devices by HIP device ID
    struct scclGpuIdHIP gpu_scores[SCCL_TOPO_MAX_NODES];
    for(int i = 0; i < romeTopo->nGpus; i++) {
        gpu_scores[i].g = i;
        gpu_scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev;
    }
    qsort(gpu_scores, romeTopo->nGpus, sizeof(struct scclGpuIdHIP), cmpIds);
    // sort CPU devices by NUMA id
    struct scclCpuNuma cpu_scores[SCCL_TOPO_MAX_NODES];
    for(int i = 0; i < romeTopo->nCpus; i++) {
        cpu_scores[i].c = i;
        cpu_scores[i].numa = system->nodes[CPU].nodes[i].id;
    }
    qsort(cpu_scores, romeTopo->nCpus, sizeof(struct scclCpuNuma), cmpNuma);
    // sort NET devices by id
    struct scclNetId net_scores[SCCL_TOPO_MAX_NODES];
    for(int i = 0; i < romeTopo->nNics; i++) {
        net_scores[i].n = i;
        net_scores[i].id = system->nodes[NET].nodes[i].id;
    }
    qsort(net_scores, romeTopo->nNics, sizeof(struct scclNetId), cmpNets);
    // Record each GPU's bus id, its nearest NUMA node (shortest path to a CPU
    // node) and its XGMI connectivity row.
    for(int i = 0; i < romeTopo->nGpus; i++) {
        int gpu, n, m, distance;
        gpu = gpu_scores[i].g;
        romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
        m = 0;
        distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
        for(n = 1; n < romeTopo->nCpus; n++) {
            if(system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
                distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
                m = n;
            }
        }
        if(m < romeTopo->nCpus)
            romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
        struct scclTopoNode* node = system->nodes[GPU].nodes + gpu;
        if(node->paths[GPU] == NULL)
            continue;
        int count = 0;
        for(n = 0; n < romeTopo->nGpus; n++) {
            romeTopo->connMatrix[i * romeTopo->nGpus + n] = 0;
            struct scclTopoLink* link;
            for(link = node->links; link->remNode; link++) {
                if(link->remNode->gpu.dev == n)
                    break;
            }
            if(!link->remNode)
                continue;
            if(link->type != LINK_NVL)
                continue;
            // Aggregate XGMI bandwidth divided by per-link speed — presumably the
            // number of parallel links between the two GPUs.
            romeTopo->connMatrix[i * romeTopo->nGpus + n] = link->bw / scclTopoXGMISpeed(node->gpu.gcn);
            count++;
        }
        if(romeTopo->nLinks < count)
            romeTopo->nLinks = count;
    }
    // Record each NIC's bus id and nearest NUMA node the same way.
    for(int i = 0; i < romeTopo->nNics; i++) {
        int n, m, distance;
        m = 0;
        int net = net_scores[i].n;
        romeTopo->nicIds[i] = system->nodes[NET].nodes[net].net.busId;
        distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
        for(n = 0; n < romeTopo->nCpus; n++)
            if(system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
                distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
                m = n;
            }
        if(m < romeTopo->nCpus)
            romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
        else
            return scclSuccess;
    }
    // number of GPUs and NICs on each numa node is used as first screening pattern
    for(int i = 0; i < romeTopo->nCpus; i++) {
        uint64_t id = system->nodes[CPU].nodes[cpu_scores[i].c].id;
        int g = 0, n = 0;
        for(int j = 0; j < romeTopo->nGpus; j++)
            if(romeTopo->gpuNuma[j] == id)
                g++;
        for(int j = 0; j < romeTopo->nNics; j++)
            if(romeTopo->nicNuma[j] == id)
                n++;
        pattern[i * 2] = '0' + g;
        pattern[i * 2 + 1] = '0' + n;
    }
    pattern[romeTopo->nCpus * 2] = 0;
    // compute gdr level matrix
    for(int i = 0; i < romeTopo->nNics; i++) {
        int n = net_scores[i].n;
        for(int j = 0; j < romeTopo->nGpus; j++) {
            int g = gpu_scores[j].g;
            romeTopo->gdrLevel[i * romeTopo->nGpus + j] = system->nodes[GPU].nodes[g].paths[NET][n].type;
        }
    }
    // Optional debug dump of the snapshot in rome_model_* source form.
    const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
    if(romeModelFile) {
        INFO(SCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
        FILE* file = fopen(romeModelFile, "w");
        if(file == NULL) {
            WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
            return scclSuccess;
        }
        fprintf(file, "static struct scclRomeModel rome_model_ = {\n");
        fprintf(file, "  .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
        fprintf(file, "  .gpuIds = { ");
        for(int i = 0; i < romeTopo->nGpus; i++)
            fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .nicIds = { ");
        for(int i = 0; i < romeTopo->nNics; i++)
            fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .gpuNuma = { ");
        for(int i = 0; i < romeTopo->nGpus; i++)
            fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .nicNuma = { ");
        for(int i = 0; i < romeTopo->nNics; i++)
            fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .connMatrix = { ");
        for(int i = 0; i < romeTopo->nGpus; i++)
            for(int n = 0; n < romeTopo->nGpus; n++)
                fprintf(file, "%d, ", romeTopo->connMatrix[i * romeTopo->nGpus + n]);
        fprintf(file, "},\n");
        fprintf(file, "  .gdrLevel = { ");
        for(int i = 0; i < romeTopo->nNics; i++)
            for(int n = 0; n < romeTopo->nGpus; n++)
                fprintf(file, "PATH_%s, ", topoPathTypeStr[romeTopo->gdrLevel[i * romeTopo->nGpus + n]]);
        fprintf(file, "},\n");
        fprintf(file, "  .pattern = \"%s\",\n", pattern);
        fprintf(file, "  .ringBase = \"\",\n");
        fprintf(file, "  .options = \"\",\n");
        fprintf(file, "};\n");
        fclose(file);
    }
    return scclSuccess;
}
// Depth-first search for a permutation g[] of the live system's GPUs (topo) that
// matches the reference model layout (ref). Slots [0, n) are already fixed and
// [n, last] are still free. When every slot is assigned the candidate is
// validated against NUMA placement (unless ignore_numa), the XGMI connection
// matrix (including relative bus-id ordering) and, when 'nbio' is set, NBIO
// pairing. On success, g[] is left holding the matching permutation.
// 'time' counts recursive calls, for diagnostics.
static bool permuteGpuIds(int* g, int n, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool nbio, bool ignore_numa) {
    (*time)++;
    if(n != last) {
        // Place each remaining candidate at slot n and recurse. The swap is undone
        // only on failure so a successful mapping survives in g[].
        for(int pick = n; pick <= last; pick++) {
            std::swap(g[n], g[pick]);
            if(permuteGpuIds(g, n + 1, last, ref, topo, time, nbio, ignore_numa))
                return true;
            std::swap(g[n], g[pick]);
        }
        return false;
    }
    // All slots assigned: validate this permutation.
    if(!ignore_numa) {
        for(int i = 0; i < ref->nGpus; i++)
            if(ref->gpuNuma[i] != topo->gpuNuma[g[i]])
                return false;
    }
    for(int i = 0; i < ref->nGpus; i++) {
        for(int j = 0; j < ref->nGpus; j++) {
            // XGMI link count between every pair must match the reference...
            if(ref->connMatrix[i * ref->nGpus + j] != topo->connMatrix[g[i] * ref->nGpus + g[j]])
                return false;
            // ...and the pair's relative bus-id ordering must agree as well.
            if((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0)
                return false;
        }
    }
    if(nbio) {
        for(int i = 0; i < ref->nGpus; i++) {
            for(int j = 0; j < ref->nGpus; j++) {
                if(i == j)
                    continue;
                // Two GPUs are treated as sharing an NBIO when their bus ids agree
                // in the 0xf0000 bits; the pairing pattern must match the reference.
                bool sameNbioRef = (ref->gpuIds[i] & 0xf0000) == (ref->gpuIds[j] & 0xf0000);
                bool sameNbioSys = (topo->gpuIds[g[i]] & 0xf0000) == (topo->gpuIds[g[j]] & 0xf0000);
                if(sameNbioRef != sameNbioSys)
                    return false;
                if(sameNbioRef && ((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0))
                    return false;
            }
        }
    }
    return true;
}
// Counterpart of permuteGpuIds for NICs: depth-first search for a permutation
// n[] of the live system's NICs that matches the reference model, given an
// already-fixed GPU permutation g[]. A complete candidate is validated against
// NIC NUMA placement (unless ignore_numa) and the NIC-to-GPU GDR-level matrix.
// On success, n[] is left holding the matching permutation; 'time' counts calls.
static bool permuteNetIds(int* n, int* g, int s, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool ignore_numa) {
    (*time)++;
    if(s != last) {
        // Try every remaining NIC at slot s; undo the swap only on failure so a
        // successful mapping survives in n[].
        for(int pick = s; pick <= last; pick++) {
            std::swap(n[s], n[pick]);
            if(permuteNetIds(n, g, s + 1, last, ref, topo, time, ignore_numa))
                return true;
            std::swap(n[s], n[pick]);
        }
        return false;
    }
    // All slots assigned: validate this permutation.
    if(!ignore_numa) {
        for(int i = 0; i < ref->nNics; i++)
            if(ref->nicNuma[i] != topo->nicNuma[n[i]])
                return false;
    }
    for(int i = 0; i < ref->nNics; i++)
        for(int j = 0; j < ref->nGpus; j++)
            if(ref->gdrLevel[i * ref->nGpus + j] != topo->gdrLevel[n[i] * ref->nGpus + g[j]])
                return false;
    return true;
}
// Try to recognize the system as one of the known Rome 4P2H topology models
// and, on a match, materialize the model's precomputed ring (and optional
// tree) graph with the detected GPU/NIC index remapping applied.
// Returns scclSuccess whether or not a model matched; a non-match simply
// leaves `graph` untouched.
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
    int i;
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    const int numModels = (int)(sizeof(romeTopoModels) / sizeof(romeTopoModels[0]));
    if(ngpus > 8)
        return scclSuccess;
    // only valid on Rome
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    // number of GPUs and NICs on each numa node is used as first screening pattern
    struct scclRomeModel romeTopo;
    char pattern[256];
    SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
    // recognize system as Rome 4P2H even if no matching model
    if(ngpus > 4 && romeTopo.nLinks)
        system->type |= RCCL_TOPO_4P2H_ROME;
    int g[SCCL_TOPO_MAX_NODES], n[SCCL_TOPO_MAX_NODES];
    int time = 0;
    struct timeval tvs, tve;
    gettimeofday(&tvs, NULL);
    // check if GPUs are directly connected to CPU (GPU->CPU path of at most 2 hops)
    bool match_nbio = true;
    for(i = 0; i < romeTopo.nGpus; i++) {
        int cpu, gpu;
        SCCLCHECK(scclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu));
        SCCLCHECK(scclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu));
        if(system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2)
            break;
    }
    if(i < romeTopo.nGpus)
        match_nbio = false;
    for(i = 0; i < numModels; i++) {
        bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
        if(!ignore_cpu && (arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME))
            continue;
        bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
        if(!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus)
            continue;
        // cheap count-based screening before any permutation search
        if(romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks)
            continue;
        if(!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern))
            continue;
        // permute GPU IDs, starting from a rotated identity mapping
        for(int j = 0; j < ngpus; j++)
            g[j] = (j + 2) % ngpus;
        if(!permuteGpuIds(g, 0, ngpus - 1, romeTopoModels + i, &romeTopo, &time, ignore_cpu ? false : match_nbio, ignore_numa))
            continue;
        if(nnets > 1) {
            // permute NET IDs
            for(int j = 0; j < nnets; j++)
                n[j] = (j + 2) % nnets;
            if(permuteNetIds(n, g, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, ignore_numa))
                break;
        } else
            break;
    }
    gettimeofday(&tve, NULL);
    float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
    (void)t; // elapsed ms, used only by the commented-out diagnostics below
    if(i >= numModels) {
        // printf("No solution in %.2fms (%d iter)\n", t, time);
        return scclSuccess;
    }
    char line[1024];
    // sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time);
    sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
    int offset = strlen(line);
    for(int k = 0; k < ngpus; k++) {
        sprintf(line + offset, "%d ", g[k]);
        offset = strlen(line);
    }
    if(nnets > 1) {
        sprintf(line + offset, "NET mapping: ");
        offset = strlen(line);
        for(int k = 0; k < nnets; k++) {
            sprintf(line + offset, "%d ", n[k]);
            offset = strlen(line);
        }
    }
    INFO(SCCL_GRAPH, "%s", line);
    parseOptions(system, romeTopoModels[i].options);
    // create 4P2H based on reference and remapped ids
    SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL));
    if(romeTopoModels[i].treeBase != nullptr)
        SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
    return scclSuccess;
}
// Try to recognize a 16-GPU / 4-NUMA-node ("1H16P") Rome system by
// brute-forcing the per-NUMA GPU permutations (24 per node, 24^4 total)
// against the known models' XGMI connectivity matrices, then remapping NET
// ids via permuteNetIds. On a match the model's ring/tree graphs are
// instantiated with the resulting mapping. Returns scclSuccess whether or
// not a model matched.
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUMA_CPUS 4
#define NUMA_GPUS 4
#define NUMA_PERMUTE_COUNT 24
#define TOTAL_PERMUTE_COUNT (NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT)
    int i;
    int ngpus = system->nodes[GPU].count;
    int ncpus = system->nodes[CPU].count;
    int nnets = system->nodes[NET].count;
    const int numModels = (int)(sizeof(romeTopoModels) / sizeof(romeTopoModels[0]));
    // only valid on Rome
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
        return scclSuccess;
    // number of GPUs and NICs on each numa node is used as first screening pattern
    struct scclRomeModel romeTopo;
    char pattern[256];
    SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
    // only match for system with 16 GPUs on 4 NUMA nodes
    if(ngpus != 16 || ncpus != NUMA_CPUS)
        return scclSuccess;
    int* g16 = NULL;
    int n[SCCL_TOPO_MAX_NODES];
    // scratch space for all 24^4 candidate full-system GPU mappings
    int* all_gpu_permutations = (int*)malloc(TOTAL_PERMUTE_COUNT * NUMA_CPUS * NUMA_GPUS * sizeof(int));
    if(all_gpu_permutations == NULL)
        return scclSuccess; // cannot search without scratch space; treat as no match
    struct timeval tvs, tve;
    gettimeofday(&tvs, NULL);
    for(i = 0; i < numModels; i++) {
        // cheap count-based screening before any permutation search
        if(romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics ||
           romeTopo.nLinks != romeTopoModels[i].nLinks)
            continue;
        if(strcmp(romeTopoModels[i].pattern, pattern))
            continue;
        int j, g[16];
        int numa_gpu_permutations[NUMA_CPUS][NUMA_PERMUTE_COUNT][NUMA_GPUS];
        // enumerate the 24 GPU permutations within each CPU NUMA node
        for(j = 0; j < ncpus; j++) {
            int ngpusPerNuma = 0, cnt = 0, npermute = 0;
            for(int k = 0; k < ngpus; k++) {
                if(romeTopoModels[i].gpuNuma[k] == j)
                    ngpusPerNuma++;
            }
            // every NUMA node must hold exactly NUMA_GPUS GPUs; anything else
            // (including zero) would leave numa_gpu_permutations[j] partially
            // uninitialized for the memcpy expansion below
            if(ngpusPerNuma != NUMA_GPUS)
                break;
            // init GPU mapping from the detected topology's NUMA placement
            for(int k = 0; k < ngpus; k++) {
                if(romeTopo.gpuNuma[k] != j)
                    continue;
                g[(2 + cnt++) % ngpusPerNuma] = k;
            }
            std::sort(g, g + ngpusPerNuma);
            do {
                for(int m = 0; m < ngpusPerNuma; m++)
                    numa_gpu_permutations[j][npermute][m] = g[m];
                npermute++;
            } while(std::next_permutation(g, g + ngpusPerNuma));
            if(npermute != NUMA_PERMUTE_COUNT)
                break;
        }
        if(j < ncpus)
            continue;
        // expand the per-NUMA permutations into all full-system candidates
        for(int a = 0; a < NUMA_PERMUTE_COUNT; a++) {
            for(int b = 0; b < NUMA_PERMUTE_COUNT; b++) {
                for(int c = 0; c < NUMA_PERMUTE_COUNT; c++) {
                    for(int d = 0; d < NUMA_PERMUTE_COUNT; d++) {
                        uint64_t offset = ((a * NUMA_PERMUTE_COUNT + b) * NUMA_PERMUTE_COUNT + c) * NUMA_PERMUTE_COUNT + d;
                        offset *= (NUMA_CPUS * NUMA_GPUS);
                        memcpy(all_gpu_permutations + offset, &numa_gpu_permutations[0][a][0], NUMA_GPUS * sizeof(int));
                        memcpy(all_gpu_permutations + offset + NUMA_GPUS, &numa_gpu_permutations[1][b][0], NUMA_GPUS * sizeof(int));
                        memcpy(all_gpu_permutations + offset + NUMA_GPUS * 2, &numa_gpu_permutations[2][c][0], NUMA_GPUS * sizeof(int));
                        memcpy(all_gpu_permutations + offset + NUMA_GPUS * 3, &numa_gpu_permutations[3][d][0], NUMA_GPUS * sizeof(int));
                    }
                }
            }
        }
        // match all GPUs' XGMI connection matrix against the model
        int p;
        for(p = 0; p < TOTAL_PERMUTE_COUNT; p++) {
            g16 = all_gpu_permutations + p * NUMA_CPUS * NUMA_GPUS;
            int k;
            for(k = 0; k < romeTopoModels[i].nGpus; k++) {
                int m;
                for(m = 0; m < romeTopoModels[i].nGpus; m++) {
                    if(romeTopoModels[i].connMatrix[k * romeTopoModels[i].nGpus + m] != romeTopo.connMatrix[g16[k] * romeTopoModels[i].nGpus + g16[m]])
                        break;
                }
                if(m < romeTopoModels[i].nGpus)
                    break;
            }
            if(k < romeTopoModels[i].nGpus)
                continue;
            if(nnets > 1) {
                // permute NET IDs
                int time = 0;
                for(int m = 0; m < nnets; m++)
                    n[m] = (m + 2) % nnets;
                if(permuteNetIds(n, g16, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, false))
                    break;
            } else
                break;
        }
        if(p < TOTAL_PERMUTE_COUNT)
            break;
    }
    gettimeofday(&tve, NULL);
    float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
    (void)t; // elapsed ms, used only by the commented-out diagnostics below
    if(i >= numModels) {
        // printf("No solution in %.2fms\n", t);
        free(all_gpu_permutations); // was leaked on this path before
        return scclSuccess;
    }
    char line[1024];
    // sprintf(line, "Found matching Rome model index %d in %.2fms with GPU mapping: ", i, t);
    sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
    int offset = strlen(line);
    for(int k = 0; k < ngpus; k++) {
        sprintf(line + offset, "%d ", g16[k]);
        offset = strlen(line);
    }
    if(nnets > 1) {
        sprintf(line + offset, "NET mapping: ");
        offset = strlen(line);
        for(int k = 0; k < nnets; k++) {
            sprintf(line + offset, "%d ", n[k]);
            offset = strlen(line);
        }
    }
    INFO(SCCL_GRAPH, "%s", line);
    system->type |= RCCL_TOPO_16P1H;
    parseOptions(system, romeTopoModels[i].options);
    // create 16P1H based on reference and remapped ids (g16 points into
    // all_gpu_permutations, so the buffer must stay alive until here)
    SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g16, nnets > 1 ? n : NULL));
    if(romeTopoModels[i].treeBase != nullptr)
        SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g16));
    // clean up
    free(all_gpu_permutations);
    return scclSuccess;
}
// Try to recognize a "4H4P" Rome system: 16 GPUs grouped into 4 hives of 4
// (clustered via the XGMI connectivity matrix) with one NIC per GPU (paired
// by GDR level). On a match the fixed rome_model_68 ring layout is
// instantiated with the resulting GPU/NIC remapping. Returns scclSuccess
// whether or not the system matched.
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUM_HIVES 4
#define HIVE_GPUS 4
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    // only valid on Rome
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
        return scclSuccess;
    // number of GPUs and NICs on each numa node is used as first screening pattern
    struct scclRomeModel romeTopo;
    char pattern[256];
    SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
    // only match for systems with 16 GPUs and 16 NICs
    if(ngpus != NUM_HIVES * HIVE_GPUS || nnets != NUM_HIVES * HIVE_GPUS)
        return scclSuccess;
    int g_hives[ngpus], n_hives[nnets];
    int ng_hives[NUM_HIVES];
    // try to sort GPUs into hives
    for(int i = 0; i < NUM_HIVES; i++)
        ng_hives[i] = 0;
    for(int i = 0; i < nnets; i++)
        n_hives[i] = -1;
    for(int i = 0; i < ngpus; i++)
        g_hives[i] = -1;
    for(int i = 0; i < ngpus; i++) {
        int j, h;
        for(j = 0; j < NUM_HIVES; j++) {
            if(ng_hives[j]) {
                // GPU i joins hive j if it is connected to the hive's first
                // member (assumes fully-connected hives — TODO confirm)
                if(romeTopo.connMatrix[i * ngpus + g_hives[j * HIVE_GPUS]]) {
                    g_hives[j * HIVE_GPUS + ng_hives[j]] = i;
                    ng_hives[j]++;
                    break;
                }
            }
        }
        if(j >= NUM_HIVES) {
            // no existing hive accepted GPU i: start a new one
            for(h = 0; h < NUM_HIVES; h++) {
                if(ng_hives[h] == 0) {
                    g_hives[h * HIVE_GPUS] = i;
                    ng_hives[h]++;
                    break;
                }
            }
            if(h >= NUM_HIVES)
                return scclSuccess; // more than NUM_HIVES clusters: no match
        }
    }
    // every hive must have been filled with exactly 4 GPUs
    for(int i = 0; i < NUM_HIVES; i++)
        if(ng_hives[i] != 4)
            return scclSuccess;
    // remap NET ids: NIC i is paired with the GPU it reaches at GDR level 3
    for(int i = 0; i < nnets; i++) {
        int j;
        for(j = 0; j < ngpus; j++) {
            // gdrLevel is indexed [nic * stride + gpu]; nnets == ngpus here
            if(romeTopo.gdrLevel[i * nnets + g_hives[j]] == 3) {
                n_hives[j] = i;
                break;
            }
        }
        if(j >= ngpus)
            return scclSuccess; // NIC with no GDR-capable GPU: no match
    }
    // validation: every slot must have been assigned
    for(int i = 0; i < nnets; i++)
        if(n_hives[i] == -1)
            return scclSuccess;
    for(int i = 0; i < ngpus; i++)
        if(g_hives[i] == -1)
            return scclSuccess;
    char line[1024];
    sprintf(line, "Found matching Rome model 4P4H with GPU mapping: ");
    int offset = strlen(line);
    for(int k = 0; k < ngpus; k++) {
        sprintf(line + offset, "%d ", g_hives[k]);
        offset = strlen(line);
    }
    if(nnets > 1) {
        sprintf(line + offset, "NET mapping: ");
        offset = strlen(line);
        for(int k = 0; k < nnets; k++) {
            sprintf(line + offset, "%d ", n_hives[k]);
            offset = strlen(line);
        }
    }
    INFO(SCCL_GRAPH, "%s", line);
    // the Rome-only early return above guarantees this CPU check; set the flag
    system->type |= RCCL_TOPO_4P2H_ROME;
    parseOptions(system, rome_model_68.options);
    // create 4P4H based on reference and remapped ids
    SCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
    return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef SCCL_ROME_MODELS_H_
#define SCCL_ROME_MODELS_H_
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
// Parse a model's ring-base description string into `graph`, remapping GPU
// indices through gpu_map and (when non-NULL) NET indices through net_map.
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map);
// Like parseGraph but for a model's tree-base string; GPU remapping only.
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map);
// Detect a Rome 4P2H topology and, on a match, populate `graph` from the
// matching model. All detectors return scclSuccess on a clean non-match.
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// Detect a chordal-ring topology and populate `graph` accordingly.
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// Detect a 16-GPU / 4-NUMA (1H16P) topology and populate `graph`.
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// Detect a 4-hive x 4-GPU (4H4P) topology and populate `graph`.
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment