Commit d9d23f34 authored by lishen's avatar lishen
Browse files

Initial Code for SCCL_v1

parent 57df3737
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "base.h"
#include "alloc.h"
#include "topo.h"
#include "xml.h"
#include "mpi.h"
#include "net.h"
#include "comm.h"
#include "graph.h"
using namespace sccl;
int main(int argc, char** argv) {
// struct sccl::hardware::topology::topo::scclXml* xml;
// SCCLCHECK(sccl::scclCalloc(&xml, 1));
// std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// struct sccl::hardware::topology::topo::scclTopoSystem* topoSystem;
// SCCLCHECK(sccl::hardware::topology::topo::scclTopoGetSystemFromXml(xml, &topoSystem));
// printf("topoSystem net.gdrSupport:%d\n", topoSystem->nodes[0].nodes[0].net.gdrSupport);
int rank, nranks;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nranks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
struct sccl::hardware::scclComm* comm;
struct sccl::hardware::topology::topo::scclTopoSystem* topoSystem;
SCCLCHECK(sccl::hardware::topology::topo::scclTopoGetSystem(&topoSystem));
printf("topoSystem net.gdrSupport:%d\n", topoSystem->nodes[0].nodes[0].net.gdrSupport);
topoSystem->nRanks = nranks;
topoSystem->netGdrLevel = -2;
topoSystem->pivotA2AEnabled = false;
topoSystem->pivotA2ANumBiRings = 0;
topoSystem->ll128Enabled = false;
topoSystem->mscclEnabled = false;
topoSystem->treeDefined = false;
SCCLCHECK(sccl::hardware::topology::scclTopoComputePaths(topoSystem, comm));
return 0;
}
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include "base.h"
#include "alloc.h"
#include "xml.h"
using namespace sccl;
int main(int argc, char** argv) {
struct sccl::hardware::topology::topo::scclXml* xml;
SCCLCHECK(sccl::scclCalloc(&xml, 1));
std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
SCCLCHECK(scclTopoDumpXmlToFile("test_xml.xml", xml));
return 0;
} // main pass
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
# Build command for the rocm_smi topology demo: compiles the demo source plus
# the rocm_smi wrapper and topo utilities with hipcc (C++17, -O3, OpenMP,
# HIP platform define) and links against amdhip64 and rocm_smi64.
# NOTE(review): paths are hard-coded to one user's checkout — adjust per machine.
hipcc /public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/test_rocm_smi.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/rocm_smi_wrap.cc \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp \
-o test_topo \
-std=c++17 -g -O3 -fopenmp -D__HIP_PLATFORM_HCC__ \
-I ./ -I /usr/include -I /opt/dtk/include \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/ \
-L /usr/lib/x86_64-linux-gnu \
-L /usr/lib/ \
-lamdhip64 -lrocm_smi64
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include <string.h>
#include "base.h"
#include "rocm_smi_wrap.h"
#include "topo_utils.h"
using namespace std;
using namespace sccl;
int main(int argc, char** argv) {
    // Smoke test for the rocm_smi wrapper: enumerate devices, query a PCI bus
    // id string, and probe the link between two device indices.
    printf("hello world\n");
    (void)rocm_smi_init();
    // BUG FIX: initialize out-params so a failed query doesn't print garbage.
    uint32_t num_devs = 0;
    (void)rocm_smi_getNumDevice(&num_devs);
    // BUG FIX: num_devs is uint32_t; printing it with %d is undefined — use %u.
    printf("num_devs=%u\n", num_devs);
    uint32_t deviceIndex = 0;
    char bus0[100] = "bus0";
    (void)rocm_smi_getDevicePciBusIdString(deviceIndex, bus0, 100);
    printf("bus0=%s\n", bus0);
    RSMI_IO_LINK_TYPE rsmi_type{};  // value-initialized in case the query fails
    int hops = 0, count = 0;
    (void)rocm_smi_getLinkInfo(0, 8, &rsmi_type, &hops, &count);
    printf("rsmi_type=%d, hops=%d, count=%d\n", rsmi_type, hops, count);
    return 0;
}
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "mpi.h"
#include "net.h"
#include "bootstrap_net.h"
using namespace sccl;
int main(int argc, char* argv[]) {
    // Minimal bootstrap smoke test: init MPI, report this rank, then run the
    // bootstrap network initialization.
    // FIX: removed unused locals (tag1, src, dst, cnt, MPI_Status status).
    int rank, nranks;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("rank=%d, nranks=%d\n", rank, nranks);
    // ----------------------------------------------------------------------- //
    INFO(SCCL_LOG_TOPO, "Bootstrap ...\n");
    (void)sccl::hardware::topology::bootstrap::bootstrap_net::bootstrapNetInit();
    MPI_Finalize();
    return 0;  // was implicit; make the success status explicit
}
/*
单机执行
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root -np 2 1_mpi_init
SCCL_DEBUG_LEVEL=SCCL_LOG_INFO SCCL_DEBUG_POS=SCCL_LOG_CODEALL mpirun --allow-run-as-root -np 2 1_mpi_init
跨机执行
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root --hostfile hostfile -np 16 ./1_mpi_init
*/
# Build command for the MPI bootstrap demo: compiles 1_mpi_init.cpp together
# with the IB-verbs/socket network backends, bootstrap helpers, and utility
# sources using hipcc (C++17, -O3, OpenMP), linking against ibverbs, rdmacm
# and Open MPI.
# NOTE(review): paths are hard-coded to one user's checkout — adjust per machine.
hipcc ./1_mpi_init.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvsymbols.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvwrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/net_ib.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/socket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/net_socket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/ipcsocket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/proxy.cpp \
-o 1_mpi_init \
-std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ \
-I ./ -I /usr/include -I /opt/dtk/include \
-I /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/ \
-L /public/home/lishen/Code/rocSHMEM/SCCL_v1 \
-L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
-L /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib -lmpi
# gather功能
包括 `gather` 和 `all-gather`
# reduce功能
包括 `reduce` 和 `all-reduce`
# scatter功能
包括 `scatter` 和 `reduce-scatter`
#pragma once
#include <stdint.h>
#include "base.h"
#include "topo.h"
namespace sccl {
namespace hardware {
// Per-rank identity/capability record (scclUniqueInfo) describing one
// communication peer.
struct scclUniqueInfo {
int rank; // global rank of this peer
int nRanks; // total number of ranks
int localRank; // rank within the local compute node
int localRanks; // number of ranks on the local compute node
int cudaDev; // GPU device id (field keeps CUDA naming; project targets ROCm)
int gdrSupport; // whether GPU Direct RDMA (GDR) is supported
uint64_t hostHash; // hash identifying the host
uint64_t pidHash; // hash identifying the process
int64_t busId; // PCI bus id
};
// // 定义结构体 scclCommBase,用于存储通信基础信息
// struct scclCommBase {
// struct scclUniqueInfo* peerInfo; // 指向 peerInfo 结构体的指针,存储所有节点的信息
// sccl::hardware::net::scclNet_t* scclNet; // 指向网络结构体的指针,用于网络通信
// };
} // namespace hardware
} // namespace sccl
#include "comm.h"
#include "graph.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
/// @brief Per-channel preset pass: reset every channel's ring/tree/collnet
/// connectivity fields to "unset" (-1), then fill in this rank's intra-node
/// neighbors from the RING/TREE/COLLNET_CHAIN/NVLS graphs and record the
/// per-channel boundary ranks in topoRanks for the later inter-node
/// connection pass. Finally duplicates the channel block when room remains.
/// @param comm      communicator whose channels are initialized
/// @param graphs    per-algorithm topology graphs (indexed by SCCL_ALGO_*)
/// @param topoRanks out: per-channel ring endpoints, tree parent/children,
///                  and NVLS head ranks
scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
// Reset all connectivity fields of this channel to "not connected".
channel->ring.prev = channel->ring.next = -1;
channel->tree.up = -1;
channel->collnetChain.up = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->tree.down[i] = -1;
for(int i = 0; i < SCCL_MAX_TREE_ARITY; i++)
channel->collnetChain.down[i] = -1;
channel->collnetDirect.out = -1;
channel->collnetDirect.headRank = -1;
channel->collnetDirect.nHeads = 0;
channel->collnetDirect.shift = 0;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.up[i] = -1;
for(int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++)
channel->collnetDirect.down[i] = -1;
// Row c of each graph's intra array lists this channel's intra-node rank order.
int* ringIntra = graphs[SCCL_ALGO_RING]->intra + c * localRanks;
int* treeIntra = graphs[SCCL_ALGO_TREE]->intra + c * localRanks;
int* collNetIntra = graphs[SCCL_ALGO_COLLNET_CHAIN]->intra + c * localRanks;
int* nvlsIntra = graphs[SCCL_ALGO_NVLS]->intra + c * localRanks;
for(int i = 0; i < localRanks; i++) {
if(ringIntra[i] == rank) {
// First/last rank of the intra ring are the node's recv/send endpoints.
topoRanks->ringRecv[c] = ringIntra[0];
topoRanks->ringSend[c] = ringIntra[localRanks - 1];
channel->ring.prev = (i == 0) ? -1 : ringIntra[i - 1];
channel->ring.next = (i == localRanks - 1) ? -1 : ringIntra[i + 1];
}
if(treeIntra[i] == rank) {
// Which intra positions talk to the parent / child subtrees depends
// on the tree pattern (TREE vs SPLIT_TREE).
int parentIndex = 0;
int child0Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
topoRanks->treeToChild1[c] = treeIntra[child1Index];
channel->tree.up = i == 0 ? -1 : treeIntra[i - 1];
channel->tree.down[0] = i == localRanks - 1 ? -1 : treeIntra[i + 1];
}
if(collNetIntra[i] == rank) {
// Chain head points "up" to the network root id (== nRanks).
channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i - 1];
channel->collnetChain.down[0] = i == localRanks - 1 ? -1 : collNetIntra[i + 1];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
topoRanks->nvlsHeads[c] = nvlsIntra[0];
}
// Duplicate channels rings/trees
// (only when the doubled channel count still fits within MAXCHANNELS).
struct scclChannel* channel0 = comm->channels;
struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
if(channel1)
memcpy(channel1, channel0, nChannels * sizeof(struct scclChannel));
return scclSuccess;
}
// Scan s[start, end) and report whether the decimal number `rank` occurs in
// it. Consecutive digits are accumulated into a single value; '(' and ')'
// terminate and reset the accumulator. Returns false for an empty or
// invalid range.
bool isRankHere(const char* s, int start, int end, int rank) {
    if(start < 0 || end < 0 || end <= start)
        return false;
    int value = 0;
    int pos = start;
    while(pos < end) {
        const char ch = s[pos];
        if(ch == '(' || ch == ')') {
            // A parenthesis ends the current number; nothing to compare yet.
            value = 0;
            pos++;
            continue;
        }
        if(isdigit(ch)) {
            value = value * 10 + (ch - '0');
            if(isdigit(s[pos + 1])) {
                pos++;
                continue; // the number continues on the next character
            }
        }
        if(value == rank)
            return true;
        pos++;
    }
    return false;
}
/// @brief Override each channel's tree links from the textual descriptions in
/// treeGraph->treeBase. Each base string encodes a tree as nested
/// parenthesized rank numbers (e.g. "(p(c0)(c1))"); the function locates the
/// current rank in the string and rewrites channel->tree.up and
/// channel->tree.down[] from its parent and children. No-op when no base
/// tree string is defined.
scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph) {
// x = number of non-empty base tree strings. NOTE(review): y is unused.
int x = 0, y = 0;
for(int i = 0; treeGraph->treeBase[i][0] != 0; i++) {
x = i + 1;
}
if(treeGraph->treeBase[0][0] == 0)
return scclSuccess;
int nChannels = comm->nChannels;
int localRanks = comm->topo->nodes[GPU].count;
// new tree
for(int c = 0; c < nChannels; c++) { // in here
// Channels cycle round-robin over the available base strings.
int buff = c % x;
char tempString[SCCL_TOPO_MAX_NODES * 4];
int ko = 0;
// Copy the NUL-terminated base string into a local working buffer.
while(treeGraph->treeBase[buff][ko] != 0) {
tempString[ko] = treeGraph->treeBase[buff][ko];
ko++;
}
tempString[ko] = 0;
int start = 0;
int curRank = comm->rank;
struct scclChannel* channel = comm->channels + c;
int end = 0;
while(tempString[end] != 0)
end++;
int parent = -1;
// constructing a number from the continuous digits
while(start < end) {
int num = 0, num_found = 0;
start++;
// Accumulate one rank number until a parenthesis or the end of range.
while(start < end && tempString[start] != '(' && tempString[start] != ')') {
int num_here = (int)(tempString[start] - '0');
num = num * 10 + num_here;
start = start + 1;
if(tempString[start] == '(' || tempString[start] == ')' || start == end)
num_found = 1;
}
if(num_found != 0 && num == curRank) {
// Found ourselves: the previously seen number is our parent.
channel->tree.up = parent;
int depth = 0;
// Each following parenthesized group at this level is one child subtree.
for(int childId = 0; childId < SCCL_MAX_TREE_ARITY; childId++) {
int or_start = start;
int child = -1;
channel->tree.down[childId] = -1;
if(or_start >= end - 1)
continue;
num = 0;
or_start++;
// The child's rank is the first number inside its group.
while(tempString[or_start] != 0 && tempString[or_start] != '(' && tempString[or_start] != ')') {
int num_here = (int)(tempString[or_start] - '0');
num = num * 10 + num_here;
or_start++;
}
child = num;
// find next child start
while(start < end) {
if(tempString[start] == '(')
depth++;
else if(tempString[start] == ')')
depth--;
if(depth == 0)
break; // next child
start++;
}
start++;
channel->tree.down[childId] = child;
// get kids, update numbers, get out of this string
}
break;
} else { // go to the next one
// Not us: descend into whichever child subtree contains curRank.
parent = num;
int start_c = start;
int end_c = start_c;
while(end_c < end) {
int depth = 0;
// Find the extent of the next balanced parenthesized group.
while(end_c < end) {
if(tempString[end_c] == '(')
depth++;
else if(tempString[end_c] == ')')
depth--;
if(depth == 0)
break; // next child
end_c++;
}
if(isRankHere(tempString, start_c, end_c, curRank)) {
// Narrow the search window to this subtree.
start = start_c;
end = end_c;
break;
} else {
end_c++;
start_c = end_c;
}
}
}
}
}
return scclSuccess;
}
// Stitch the per-node intra rings into global rings. For each channel, every
// node's recv rank is linked back to the previous node's send rank and every
// node's send rank forward to the next node's recv rank (wrapping around);
// when this rank is one of those endpoints its channel (and the duplicate
// channel, if present) is updated in place.
static scclResult_t connectRings(struct scclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
    const int nChannels = comm->nChannels;
    const int nNodes = comm->nNodes;
    for(int c = 0; c < nChannels; c++) {
        int* nodeRecv = ringRecv + c * comm->nNodes;
        int* nodeSend = ringSend + c * comm->nNodes;
        int* rankPrev = ringPrev + c * comm->nRanks;
        int* rankNext = ringNext + c * comm->nRanks;
        struct scclChannel* primary = comm->channels + c;
        // The duplicate channel only exists while the doubled count fits in MAXCHANNELS.
        struct scclChannel* duplicate = (nChannels > MAXCHANNELS / 2) ? 0 : primary + nChannels;
        for(int n = 0; n < nNodes; n++) {
            const int recvRank = nodeRecv[n];
            const int prevSendRank = nodeSend[(n - 1 + nNodes) % nNodes];
            rankPrev[recvRank] = prevSendRank;
            if(comm->rank == recvRank) {
                primary->ring.prev = prevSendRank;
                if(duplicate)
                    duplicate->ring.prev = prevSendRank;
            }
            const int sendRank = nodeSend[n];
            const int nextRecvRank = nodeRecv[(n + 1) % nNodes];
            rankNext[sendRank] = nextRecvRank;
            if(comm->rank == sendRank) {
                primary->ring.next = nextRecvRank;
                if(duplicate)
                    duplicate->ring.next = nextRecvRank;
            }
        }
    }
    return scclSuccess;
}
// Copy the first nNodes rank entries from `ranks` into `indexes`.
static scclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
    for(int idx = 0; idx < nNodes; ++idx) {
        indexes[idx] = ranks[idx];
    }
    return scclSuccess;
}
// Set the tree's parent to indexes[u]; u == -1 means "no parent" and leaves
// the tree untouched.
static scclResult_t setTreeUp(struct scclTree* tree, int* indexes, int u) {
    if(u != -1) {
        tree->up = indexes[u];
    }
    return scclSuccess;
}
// Append indexes[d] into the first free (< 0) slot of tree->down. A d of -1
// means "no child" and is a no-op. Errors out when all SCCL_MAX_TREE_ARITY
// child slots are already occupied.
static scclResult_t setTreeDown(struct scclTree* tree, int* indexes, int d) {
    if(d == -1)
        return scclSuccess;
    int slot = 0;
    while(slot < SCCL_MAX_TREE_ARITY && tree->down[slot] >= 0)
        slot++;
    if(slot == SCCL_MAX_TREE_ARITY) {
        WARN("Internal error : tree already has %d children (%d %d %d)", slot, tree->down[0], tree->down[1], tree->down[2]);
        return scclInternalError;
    }
    tree->down[slot] = indexes[d];
    return scclSuccess;
}
/// @brief Connect the intra-node trees across nodes using the double binary
/// tree layout from scclGetDtree. Channels are handled in pairs: when
/// comm->nChannels <= MAXCHANNELS/2 each pair lives at (c, c + nChannels);
/// otherwise tree 0 is applied to the first half of the channel range and
/// tree 1 to the second half.
/// NOTE(review): the treePatterns parameter is not read here — confirm
/// whether it is still needed by callers.
static scclResult_t connectTrees(struct scclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int nChannels = (comm->nChannels > MAXCHANNELS / 2) ? comm->nChannels / 2 : comm->nChannels, nNodes = comm->nNodes, node = comm->node;
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks / nNodes - 1 + log2i(nNodes);
// t0*/t1* describe this node's parent/children in the two dual trees.
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
int *ttp, *ttc0, *ttc1;
SCCLCHECK(scclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
if(comm->nChannels <= MAXCHANNELS / 2) {
// Paired layout: channel c carries tree 0 and channel c + nChannels tree 1.
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
struct scclChannel* channel1 = channel0 + nChannels;
// Per-channel arrays of which rank on each node talks to parent/children.
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
} else {
// Split layout: tree 0 on channels [0, nChannels) ...
for(int c = 0; c < nChannels; c++) {
struct scclChannel* channel0 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c,
channel0->tree.up,
comm->rank,
channel0->tree.down[0],
channel0->tree.down[1],
channel0->tree.down[2]);
}
channel0->tree.depth = depth;
}
// ... and tree 1 on channels [nChannels, 2*nChannels).
for(int c = nChannels; c < nChannels * 2; c++) {
struct scclChannel* channel1 = comm->channels + c;
ttp = treeToParent + c * comm->nNodes;
ttc0 = treeToChild0 + c * comm->nNodes;
ttc1 = treeToChild1 + c * comm->nNodes;
if(comm->rank == ttp[node]) {
SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if(comm->rank == ttc0[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if(comm->rank == ttc1[node]) {
SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if(comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
INFO(SCCL_LOG_TOPO,
"Tree %d : %d -> %d -> %d/%d/%d",
c + nChannels,
channel1->tree.up,
comm->rank,
channel1->tree.down[0],
channel1->tree.down[1],
channel1->tree.down[2]);
}
channel1->tree.depth = depth;
}
}
return scclSuccess;
}
/// @brief Build collnetDirect connectivity for every channel: collect the
/// distinct head ranks (position 0 of each collnet intra list), then for each
/// channel mark this rank either as a head (connected down to all local
/// peers and out to the network root id == nRanks) or as a leaf (connected up
/// to every head). Also sets collnetChain depth and the NVLS out link.
static scclResult_t connectCollNet(struct scclComm* comm, struct scclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nHeads = 0;
int* heads;
// NOTE(review): heads is allocated with scclCalloc but released with
// free() below — confirm scclCalloc is malloc-compatible.
SCCLCHECK(scclCalloc(&heads, localRanks));
// Find all head ranks
// Head index is always 0
for(int c = 0; c < collNetGraph->nChannels; c++) {
int* collNetIntra = collNetGraph->intra + c * localRanks;
int head = collNetIntra[0];
// Deduplicate: only record a head the first time it is seen.
for(int h = 0; h < nHeads; h++)
if(heads[h] == head)
head = -1;
if(head != -1)
heads[nHeads++] = collNetIntra[0];
}
// For all channels
for(int c = 0; c < comm->nChannels; c++) {
struct scclChannel* channel = comm->channels + c;
// `line` accumulates a human-readable summary for the INFO log below.
char line[1024];
sprintf(line, "CollNet channel %d rank %d ", c, rank);
int nDown = 0;
for(int i = 0; i < nHeads; i++) {
if(rank == heads[i]) { // is head
channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
int* collNetIntra = collNetGraph->intra + i * localRanks;
sprintf(line + strlen(line), "down ");
for(int r = 0; r < localRanks; r++) {
if(collNetIntra[r] == rank)
continue;
channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
sprintf(line + strlen(line), " %d ", collNetIntra[r]);
}
sprintf(line + strlen(line), "nDown %d ", nDown);
break;
}
}
// Connect to all heads
int nUp = 0;
sprintf(line + strlen(line), "up ");
for(int h = 0; h < nHeads; h++) {
if(rank == heads[h])
continue;
channel->collnetDirect.up[nUp++] = heads[h];
sprintf(line + strlen(line), " %d ", heads[h]);
}
channel->collnetDirect.nHeads = nHeads;
channel->collnetDirect.shift = (rank % localRanks) % nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line + strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
sprintf(line + strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
INFO(SCCL_LOG_TOPO, "%s", line);
channel->collnetChain.depth = comm->nRanks / comm->nNodes;
}
// NVLS channels with a head also point their out link at the network root.
for(int c = 0; c < comm->nvlsChannels; c++) {
struct scclChannel* channel = comm->channels + c;
if(channel->nvls.headRank != -1)
channel->nvls.out = comm->nRanks;
}
free(heads);
return scclSuccess;
}
// Build NVLS connectivity for every NVLS channel: first the intra-node
// head/up/down bookkeeping, then (multi-node only) the two inter-node
// double-binary trees that link the head ranks of each node.
// nvlsHeads is laid out as [head h][node n] -> rank.
static scclResult_t connectNvls(struct scclComm* comm, int* nvlsHeads, struct scclTopoGraph* nvlsGraph) {
  int nHeads = nvlsGraph->nChannels;
  // Index of the NVLS head whose intra-node leader is this rank; -1 if none.
  int headRank = -1;
  for(int h = 0; h < nHeads; h++) {
    if(nvlsGraph->intra[h * comm->localRanks] == comm->rank)
      headRank = h;
  }
  // No NVLS heads at all: disable NVLS channels and bail out.
  if(nHeads == 0) {
    comm->nvlsChannels = 0;
    return scclSuccess;
  }
  // Intra-node setup, identical for every NVLS channel. Ranks
  // comm->nRanks + 1 + h act as virtual ids for the NVLS heads.
  for(int c = 0; c < comm->nvlsChannels; c++) {
    struct scclChannel* channel = comm->channels + c;
    channel->nvls.nHeads = nHeads;
    for(int h = 0; h < nHeads; h++)
      channel->nvls.up[h] = comm->nRanks + 1 + h; // virtual rank of head h
    for(int h = nHeads; h < SCCL_MAX_NVLS_ARITY; h++)
      channel->nvls.up[h] = -1; // unused slots
    channel->nvls.down = comm->nRanks + 1 + headRank;
    channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
    channel->nvls.headRank = headRank;
    channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
    channel->nvls.node = comm->node;
    channel->nvls.nNodes = comm->nNodes;
  }
  // Single node: no inter-node trees to build.
  if(comm->nNodes == 1)
    return scclSuccess;
  // Connect Trees: get this node's parent/children in both halves of the
  // double binary tree built over nodes.
  int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
  int pc0, pc1; // ignored
  SCCLCHECK(scclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0, &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
  int* heads = NULL;
  int treeUp[2] = {-1, -1};
  int treeDown0[2] = {-1, -1};
  int treeDown1[2] = {-1, -1};
  // Node 0 logs the head table (first 20 nodes per head) for debugging.
  if(comm->node == 0) {
    for(int h = 0; h < nHeads; h++) {
      char line[1024];
      sprintf(line, "NVLS Head %2d:", h);
      heads = nvlsHeads + h * comm->nNodes;
      for(int n = 0; n < comm->nNodes && n < 20; n++) {
        sprintf(line + strlen(line), " %2d", heads[n]);
      }
      INFO(SCCL_INIT, "%s", line);
    }
  }
  // Find the heads where I'm the head rank and retain tree up/down
  for(int h = 0; h < nHeads; h++) {
    heads = nvlsHeads + h * comm->nNodes;
    if(heads[comm->node] == comm->rank) {
      // Map node indices from scclGetDtree to the actual head ranks of
      // those nodes; -1 (no parent/child) propagates through.
      treeUp[0] = tree0Parent == -1 ? -1 : heads[tree0Parent];
      treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
      treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
      treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
      treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
      treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
      break;
    }
  }
  // Set prev/next in all channels (NVLS compute channels work
  // orthogonally to NVLS search channels). Even channels use tree 0,
  // odd channels use tree 1.
  for(int c = 0; c < comm->nvlsChannels; c++) {
    struct scclChannel* channel = comm->channels + c;
    channel->nvls.treeUp = treeUp[c % 2];
    channel->nvls.treeDown[0] = channel->nvls.down; // local NVLS id is always the first child
    int ix = 1;
    if(treeDown0[c % 2] != -1)
      channel->nvls.treeDown[ix++] = treeDown0[c % 2];
    if(treeDown1[c % 2] != -1)
      channel->nvls.treeDown[ix] = treeDown1[c % 2];
  }
  // NOTE(review): the log below reads channels[0] and channels[1]; it appears
  // to assume nvlsChannels >= 2 on multi-node runs — confirm.
  struct scclNvls* nvls0 = &comm->channels[0].nvls;
  struct scclNvls* nvls1 = &comm->channels[1].nvls;
  INFO(SCCL_LOG_TOPO,
       "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
       nvls0->treeDown[0],
       nvls0->treeDown[1],
       comm->rank,
       nvls0->treeUp,
       nvls1->treeDown[0],
       nvls1->treeDown[1],
       comm->rank,
       nvls1->treeUp);
  return scclSuccess;
}
// Channel-count environment knobs. -2 is the "unset" sentinel; the *_NRINGS
// pair is the legacy spelling kept for backward compatibility.
// Legacy naming
SCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
SCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
// New naming
// NOTE(review): MIN_NCHANNELS defaults to 4 rather than the -2 sentinel, so
// in scclMinNchannels() it always overrides the legacy MIN_NRINGS value when
// the environment is untouched — confirm this is intentional.
SCCL_PARAM(MinNchannels, "MIN_NCHANNELS", 4);
SCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
// Resolve the configured minimum channel count. The legacy MIN_NRINGS knob is
// consulted first, then the newer MIN_NCHANNELS overrides it (-2 means unset).
// The result is clamped to [0, MAXCHANNELS], warning if the user over-asked.
int scclMinNchannels() {
  int lo = 2;
  if(scclParamMinNrings() != -2)
    lo = scclParamMinNrings();
  if(scclParamMinNchannels() != -2)
    lo = scclParamMinNchannels();
  if(lo > MAXCHANNELS) {
    WARN("User asked for a minimum of %d channels, limiting to %d", lo, MAXCHANNELS);
    lo = MAXCHANNELS;
  }
  return lo < 0 ? 0 : lo;
}
// Resolve the configured maximum channel count. Legacy MAX_NRINGS is applied
// first, then MAX_NCHANNELS overrides it (-2 means unset). The result is
// clamped to [1, MAXCHANNELS], warning when the user asked for less than 1.
int scclMaxNchannels() {
  int hi = MAXCHANNELS;
  if(scclParamMaxNrings() != -2)
    hi = scclParamMaxNrings();
  if(scclParamMaxNchannels() != -2)
    hi = scclParamMaxNchannels();
  if(hi > MAXCHANNELS)
    hi = MAXCHANNELS;
  if(hi < 1) {
    WARN("User asked for a maximum of %d channels, setting it to 1", hi);
    hi = 1;
  }
  return hi;
}
// Duplicate channels [0, end-start) into slots [start, end): per-rank ring
// prev/next tables and the channel structs themselves are memcpy'd.
// Returns the new channel count (== end when start < end, else start).
static int copyChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
  const int nranks = comm->nRanks;
  int c = start;
  while(c < end) {
    const int src = c - start; // source channel being replicated
    memcpy(ringPrev + c * nranks, ringPrev + src * nranks, nranks * sizeof(int));
    memcpy(ringNext + c * nranks, ringNext + src * nranks, nranks * sizeof(int));
    memcpy(comm->channels + c, comm->channels + src, sizeof(struct scclChannel));
    ++c;
  }
  return c;
}
// Same duplication as copyChannels, but every duplicated channel is switched
// to the comm's mixed transport type (Hylink + PCIe double-channel path).
// Returns the new channel count.
static int copyMixedChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
  const int nranks = comm->nRanks;
  int c = start;
  while(c < end) {
    const int src = c - start; // source channel being replicated
    memcpy(ringPrev + c * nranks, ringPrev + src * nranks, nranks * sizeof(int));
    memcpy(ringNext + c * nranks, ringNext + src * nranks, nranks * sizeof(int));
    memcpy(comm->channels + c, comm->channels + src, sizeof(struct scclChannel));
    comm->channels[c].transportType = comm->mixedTransportType;
    ++c;
  }
  return c;
}
// Knobs for the mixed Hylink + PCIe double-channel path (0 disables it).
// NOTE(review): these use RCCL_PARAM / rcclParam* while the rest of this file
// uses SCCL_PARAM / scclParam* — presumably a leftover from the RCCL port;
// confirm whether they should be renamed for consistency.
RCCL_PARAM(MaxMixedHylinkNChannels, "MAX_MIXED_HYLINK_NCHANNELS", 0);
RCCL_PARAM(MixedTransportType, "MIXED_TRANSPORT_TYPE", TRANSPORT_SHM);
// Final topology pass run after every rank's preset data has been exchanged:
// gathers per-node/per-rank ring and tree endpoints, connects rings, trees,
// NVLS and (optionally) CollNet, then duplicates channels until the
// min/max-channel constraints are met and builds the rings array.
// BUGFIX: the SDMA channel-count branch referenced ncSdma,
// checkSdmaCopyEnable() and getSdmaChannelNum(), all of which only exist when
// HCU_SDMA_FEATURE is defined, but the branch itself was unguarded — non-SDMA
// builds could not compile. The branch is now wrapped in the same #ifdef with
// an else fall-through to the generic path.
scclResult_t scclTopoPostset(
    struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
  int nranks = comm->nRanks;
  int nNodes = comm->nNodes;
  int nChannels = comm->nChannels;
  int MinNChannels = scclMinNchannels();
  int MaxNChannels = scclMaxNchannels();
  SCCLCHECK(scclCalloc(&ringRecv, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&ringSend, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&ringPrev, nranks * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&ringNext, nranks * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&treeToParent, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&treeToChild0, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&treeToChild1, nNodes * MAXCHANNELS));
  SCCLCHECK(scclCalloc(&nvlsHeads, nNodes * MAXCHANNELS));
  // Flatten each rank's topo data into per-node (indexed via firstRanks) and
  // per-rank tables, one row per channel.
  for(int c = 0; c < nChannels; c++) {
    for(int n = 0; n < nNodes; n++) {
      int r = firstRanks[n];
      ringRecv[c * nNodes + n] = allTopoRanks[r]->ringRecv[c];
      ringSend[c * nNodes + n] = allTopoRanks[r]->ringSend[c];
      treeToParent[c * nNodes + n] = allTopoRanks[r]->treeToParent[c];
      treeToChild0[c * nNodes + n] = allTopoRanks[r]->treeToChild0[c];
      treeToChild1[c * nNodes + n] = allTopoRanks[r]->treeToChild1[c];
      nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
    }
    for(int r = 0; r < nranks; r++) {
      ringPrev[c * nranks + r] = allTopoRanks[r]->ringPrev[c];
      ringNext[c * nranks + r] = allTopoRanks[r]->ringNext[c];
    }
  }
  // Connect rings and trees. This should also duplicate the channels.
  SCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
  SCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
  SCCLCHECK(connectNvls(comm, nvlsHeads, graphs[SCCL_ALGO_NVLS]));
  // Duplicate ringPrev/ringNext for scclBuildRing
  if(nChannels <= MAXCHANNELS / 2)
    memcpy(ringPrev + nChannels * nranks, ringPrev, nChannels * nranks * sizeof(int));
  if(nChannels <= MAXCHANNELS / 2)
    memcpy(ringNext + nChannels * nranks, ringNext, nChannels * nranks * sizeof(int));
  // All-NVLink systems default both bounds to 32 unless explicitly set.
  if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_NCHANNELS") == NULL)
    MinNChannels = 32;
  if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_NCHANNELS") == NULL)
    MaxNChannels = 32;
#ifdef HCU_SDMA_FEATURE
  // Channel budget used by the SDMA path below (multiple of comm->nChannels).
  int ncSdma = nc;
  ncSdma = std::min((int)scclMaxNchannels() / comm->nChannels, nc);
  ncSdma *= comm->nChannels;
#endif
  // Get number of channels after duplication
  nc = std::min((int)MaxNChannels / comm->nChannels, nc);
  nc *= comm->nChannels;
  // Duplication should be complete now
  nChannels = comm->nChannels = std::min(MAXCHANNELS, (nChannels <= MAXCHANNELS / 2) ? nChannels * 2 : nChannels);
  // Setup CollNet
  if(comm->collNetSupport == 1) {
    struct scclTopoGraph* collNetGraph = graphs[SCCL_ALGO_COLLNET_DIRECT];
    // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
    if(collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
      int collNetNchannels = std::min(MAXCHANNELS, nChannels + nChannels / 2);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
    }
    SCCLCHECK(connectCollNet(comm, collNetGraph));
  }
  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
  if(comm->minCompCap == 90 && comm->nNodes > 1 && graphs[SCCL_ALGO_RING]->bwIntra > 45.0 && 2 * nChannels <= MAXCHANNELS) {
    nChannels = comm->nChannels = copyChannels(comm, nChannels, 2 * nChannels, ringPrev, ringNext);
  }
  // Add Hylink + PCIE double channel path
  if(graphs[SCCL_ALGO_RING]->typeIntra == PATH_NVL) {
    comm->nMixedHylinkChannels = std::min(MAXCHANNELS - comm->nChannels, (int)rcclParamMaxMixedHylinkNChannels());
    if(comm->nMixedHylinkChannels > 0) {
      INFO(SCCL_LOG_TOPO,
           "<%s:%d> -----> comm->nMixedHylinkShmChannels: %d, comm->nChannels: %d\n",
           __func__,
           __LINE__,
           comm->nMixedHylinkChannels,
           comm->nChannels);
      comm->mixedTransportType = std::max((int)rcclParamMixedTransportType(), TRANSPORT_SHM);
      nChannels = comm->nChannels = copyMixedChannels(comm, nChannels, nChannels + comm->nMixedHylinkChannels, ringPrev, ringNext);
    }
  }
  // Honor SCCL_MIN_NRINGS/SCCL_MAX_NRINGS.
  // We permit combining max, then min, to only use the first channels, then duplicate them.
#ifdef HCU_SDMA_FEATURE
  if(checkSdmaCopyEnable(comm)) {
    uint32_t sdmaChannelNum;
    uint32_t maxChannels;
    sdmaChannelNum = getSdmaChannelNum(comm);
    if(comm->sharedRes->owner != comm) {
      /* child comm #channels cannot exceed top parent #channels. */
      nChannels = comm->nChannels = std::min(std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
      maxChannels =
          sdmaChannelNum ? sdmaChannelNum : std::min(std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs)), comm->sharedRes->tpNChannels);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
    } else {
      nChannels = comm->nChannels = std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs);
      maxChannels = sdmaChannelNum ? sdmaChannelNum : std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs));
      nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
    }
    INFO(SCCL_INIT, "-hcugon- scclTopoPostset rank %d sdmaChannelNum %d nChannels %d", comm->rank, sdmaChannelNum, comm->nChannels);
  } else
#endif
  {
    // NOTE(review): this path uses the (possibly NVLink-adjusted) local
    // Min/MaxNChannels while the SDMA path re-reads scclMin/MaxNchannels()
    // directly — confirm the difference is intentional.
    if(comm->sharedRes->owner != comm) {
      /* child comm #channels cannot exceed top parent #channels. */
      nChannels = comm->nChannels = std::min(std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
      nChannels = comm->nChannels = copyChannels(
          comm, nChannels, std::min(std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
    } else {
      nChannels = comm->nChannels = std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(MinNChannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
    }
  }
  // Create rings array and check all is fine
  SCCLCHECK(scclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
  free(ringRecv);
  free(ringSend);
  free(ringPrev);
  free(ringNext);
  free(treeToParent);
  free(treeToChild0);
  free(treeToChild1);
  free(nvlsHeads);
  return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_DEVICE_H_
#define SCCL_DEVICE_H_
#include "check.h"
#include "sccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined(ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HCU_SDMA_FEATURE
#include "hsa/hsa_ext_amd.h"
#include "hsa_extra.h"
// #define HCU_PRINT_DEBUG
#endif
namespace sccl {
// Debug print stubs: compiled to nothing in this build.
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
// NPKit GPU event collection helpers; no-ops unless both NPKit and the
// HCU SDMA feature are enabled.
#if defined(ENABLE_NPKIT) && defined(HCU_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
  NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) NpKit::CollectGpuEvent(event, size, cost, tm, scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HCU_SDMA_FEATURE
// Initialize the SDMA-related fields of a primitives object from kernel args;
// all SDMA options are gated on args->useSdma and a valid sdmaQueueCtx.
#define INIT_PRIMS_SDMA(prims, args) \
  { \
    prims.rank = scclShmem.comm.rank; \
    prims.useSdmaConfig = args->useSdma; \
    prims.useSdmaCopy = args->useSdma && prims.sdmaQueueCtx; \
    prims.preFnOps = args->preFnOps; \
    prims.sdmaMinCopySize = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->minCopySize : 0; \
    prims.sdmaCountEnable = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->copyCountEnable : 0; \
    prims.sdmaCopyCount = 0; \
    prims.allCopyCount = 0; \
  }
#endif
#define SCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
// Collective function ids. The first SCCL_NUM_FUNCTIONS entries are the
// "regular" collectives; SendRecv/Send/Recv/AllToAllPivot follow.
typedef enum {
  scclFuncBroadcast,
  scclFuncReduce,
  scclFuncAllGather,
  scclFuncReduceScatter,
  scclFuncAllReduce,
  scclFuncSendRecv,
  scclFuncSend,
  scclFuncRecv,
  scclFuncAllToAllPivot,
  scclNumFuncs
} scclFunc_t;
// Human-readable function names.
// NOTE(review): SCCL_NUM_FUNCTIONS + 2 == 7 slots, but scclFunc_t has 9
// enumerators before scclNumFuncs — confirm the intended table size/indexing.
extern const char* scclFuncStr[SCCL_NUM_FUNCTIONS + 2];
// Algorithm identifiers. These are object-like macros used as plain ints
// throughout the code base.
#define SCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define SCCL_ALGO_TREE 0 // tree algorithm
#define SCCL_ALGO_RING 1 // ring algorithm
#define SCCL_ALGO_COLLNET_DIRECT 2 // direct collnet algorithm
#define SCCL_ALGO_COLLNET_CHAIN 3 // chained collnet algorithm
#define SCCL_ALGO_NVLS 4 // NVLink SHARP algorithm
#define SCCL_ALGO_NVLS_TREE 5 // NVLink SHARP tree algorithm
// BUGFIX: a duplicate `enum scclAlgo` re-declared the exact identifiers
// defined as macros above. The preprocessor rewrites its enumerators to
// `0 = 0`, `1 = 1`, ... which is ill-formed and cannot compile, so the enum
// has been removed; all call sites use the macro constants as ints.
extern const char* scclAlgoStr[SCCL_NUM_ALGORITHMS];
// Transfer protocol ids: LL (low-latency 8B flag+data), LL128 (128B-line
// variant) and Simple.
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define SCCL_PROTO_LL 0
#define SCCL_PROTO_LL128 1
#define SCCL_PROTO_SIMPLE 2
extern const char* scclProtoStr[SCCL_NUM_PROTOCOLS];
// Maximum enqueued operations and pipeline steps per connection.
#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8
// One 16-byte LL (low-latency) protocol line: two 4B data words, each
// followed by its 4B validity flag.
union scclLLFifoLine {
  /* Flags have to be *after* data, because otherwise, an incomplete receive
     from the network may receive the flag but not the data.
     Note this is assuming that either we receive contiguous chunks of data
     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
  struct {
    uint32_t data1;
    uint32_t flag1;
    uint32_t data2;
    uint32_t flag2;
  };
  uint64_t v[2]; // the two 8B (data,flag) pairs
  int4 i4;       // vectorized 16B view (CUDA/HIP int4)
};
// NOTE(review): warpSize is the CUDA/HIP device-side built-in — presumably
// WARP_SIZE is only evaluated in device code; confirm for host users.
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_SIMPLE_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define SCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define SCCL_LL_FLAG_MAX 0x100
#define SCCL_LL_FLAG(a) ((uint32_t)((a) % SCCL_LL_FLAG_MAX))
#else
#define SCCL_LL_CLEAN_MASK 0x7ffffff8
#define SCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least SCCL_NSTEPS
static_assert(SCCL_LL_CLEAN_MASK % SCCL_STEPS == 0, "Invalid SCCL_LL_CLEAN_MASK value");
// LL128 line layout: 64B lines of eight u64 elements, one reserved for flags.
#define SCCL_LL128_LINESIZE 64
#define SCCL_LL128_LINEELEMS (SCCL_LL128_LINESIZE / sizeof(uint64_t))
#define SCCL_LL128_DATAELEMS (SCCL_LL128_LINEELEMS - 1)
#define SCCL_LL128_MAX_NTHREADS 256
#define SCCL_LL128_ELEMS_PER_THREAD 28
#define SCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define SCCL_LL128_SHMEM_SIZE (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * SCCL_LL128_MAX_NTHREADS)
// Connection capability flag bits (scclConnInfo::flags).
#define SCCL_DIRECT_WRITE 0x01
#define SCCL_DIRECT_READ 0x02
#define SCCL_DIRECT_NIC 0x04
#define SCCL_IPC_WRITE 0x08
#define SCCL_IPC_READ 0x10
#define SCCL_NVLS_MIN_POLL 0x20
#ifdef HCU_SDMA_FEATURE
// Magic value marking an initialized SDMA context (scclConnInfo::sdmaCtxValidMagic).
#define SDMA_CTX_VALID_MAGIC 0xD65A
#endif
// Shared connection state between a sender/receiver pair (buffers, fifo
// head/tail pointers, capability flags) as seen by the device and proxy.
struct scclConnInfo {
  // Regular comm mechanism
  char* buffs[SCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
  uint64_t* tail; // Local for recv, remote for send
  uint64_t* head; // Local for send, remote for recv
  int flags; // Direct communication / other flags
  int shared; // Buffers are shared
  void** ptrExchange; // Pointer exchange for direct communication
  uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
  int* sizesFifo; // Sizes fifo from GPU to proxy
  int* offsFifo; // Buffer fifo from proxy to GPU
  uint64_t step; // Keep where we are
  uint64_t llLastCleaning; // step at which the last LL flag cleanup was done
  // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
  // allows software to explicitly initiate a flush read to HDP memory. See more
  // descriptions in primitives.h.
  uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
  uint32_t* curr_hdp_reg; // Current GPU's HDP register
#ifdef HCU_SDMA_FEATURE
  struct sdmaQueueContext* sdmaQueueCtx; // SDMA queue used for this connection
  uint32_t sdmaCtxValidMagic; // SDMA_CTX_VALID_MAGIC when sdmaQueueCtx is initialized
#endif
};
// Identifies the proxy-side endpoint that services a connector.
struct scclProxyConnector {
  int tpRank;
  int tpLocalRank;
  int sameProcess; // nonzero when the proxy lives in the same process
  struct scclProxyConnection* connection;
};
// Host-side view of one directed connection: transport implementation,
// its private resources, proxy hookup, and the shared scclConnInfo.
struct scclConnector {
  int connected; // nonzero once the connection is established
  struct scclProxyConnector proxyConn;
  struct scclTransportComm* transportComm; // send or recv side of the transport
  void* transportResources; // transport-private state
  struct scclConnInfo conn; // state shared with the device
};
// One rank's position in a ring.
struct scclRing {
  // Shortcuts for userRanks[1] and userRanks[n-1]
  int prev;
  int next;
  // Maps an internal sccl index to user-specified rank order. This is necessary
  // since we need to know how the user expects data to be ordered across
  // devices. Ordered from current device.
  int* userRanks;
  int index; // This rank's index in the ring
};
// The root of each tree only has one node down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have to two nodes down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY 3
// One rank's position in a tree: its parent (up) and children (down).
struct scclTree {
  int depth;
  int up;
  int down[SCCL_MAX_TREE_ARITY];
};
#define SCCL_MAX_DIRECT_ARITY 7
// CollNet-direct topology for one channel (see connectCollNet for setup).
struct scclDirect {
  int depth;
  int out; // id of the collnet root (set to nRanks on head ranks)
  int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
  int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
  int up[SCCL_MAX_DIRECT_ARITY];
  int down[SCCL_MAX_DIRECT_ARITY];
};
#define SCCL_CONN_IDX_P2P_NET 2
#define SCCL_MAX_NVLS_ARITY 8
#define SCCL_MAX_NVLS_TREE_ARITY 3
// NVLS topology for one channel (see connectNvls for how it is populated;
// up/down use virtual ranks nRanks+1+h for head h).
struct scclNvls {
  int out; // -1: NVLS+SHARP not implemented
  int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
  int up[SCCL_MAX_NVLS_ARITY]; // virtual ranks of all heads, -1 padded
  int down; // virtual rank of this rank's own head
  int treeUp; // inter-node tree parent (head rank), -1 at the root
  int treeDown[SCCL_MAX_NVLS_TREE_ARITY]; // local NVLS id plus up to two child heads
  int node;
  int nNodes;
};
#define SCCL_MAX_CONNS 3
// Host-side per-peer connectors for one channel: one send/recv connector
// per connection index, plus a reference count for shared use.
struct scclChannelPeer {
  struct scclConnector send[SCCL_MAX_CONNS];
  struct scclConnector recv[SCCL_MAX_CONNS];
  int refCount;
};
struct scclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8) /* set alignment to 8 bytes boundary */
/* scclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of scclWorkElem. */
#define SCCL_WORK_SIZE 256
// Kind of payload carried by a scclWork slot.
enum scclWorkType : uint8_t {
  scclWorkTypeUnused = 0,
  scclWorkTypeColl = 1,
  scclWorkTypeP2p = 2,
  scclWorkTypeRegColl = 3
};
// Direction of a p2p work element.
enum scclWorkP2PType : uint8_t {
  scclWorkP2pTypeUnused = 0,
  scclWorkP2pTypeSend,
  scclWorkP2pTypeRecv
};
// Common header at the front of every scclWork slot.
struct scclWorkHeader {
  union {
    int32_t workNext; // when isLast=0: Offset from kernel argument workHead
    uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
  };
  uint16_t funcIndex;
  uint8_t isLast : 1; // last work for this kernel
  uint8_t inFifo : 1; // is this work in the fifo
  enum scclWorkType type;
};
// One collective work element, packed into a 256B scclWork slot.
struct scclWorkElem {
  union {
    uint8_t flagBits; // raw access to the bitfields below
    struct {
      uint8_t isUsed : 1, redOpArgIsPtr : 1, regUsed : 1, nWarps : 5;
    };
  };
  uint8_t direct;
  uint8_t bid;
  uint8_t nChannels;
  struct {
    uint32_t root : 28;
    uint32_t preFnOps : 1;
    uint32_t useSdma : 1;
    uint32_t connIndex : 2;
  };
  const void* sendbuff;
  void* recvbuff;
  size_t count;
  union {
    size_t lastChunkSize;
    // Pivot A2A kernel computes chunk size itself.
    // Instead, it needs the number of bidirectional rings.
    size_t pivotA2ANumBiRings;
  };
  uint64_t redOpArg;
  uint64_t opCount;
};
static_assert((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElem))) / sizeof(scclWorkElem) == 4,
              "Sanity check: SCCL_MAX_WORK_ELEMENTS == 4");
// NOTE(review): the assert above proves 4 elements fit in a slot, yet the
// limit is set to 1 — presumably a deliberate restriction in this port;
// confirm against the kernel side.
#define SCCL_MAX_WORK_ELEMENTS 1
// One point-to-point work element (28 bytes, see alignment note below).
struct scclWorkElemP2p {
  struct {
    int32_t peer : 26;
    uint32_t preFnOps : 1;
    uint32_t useSdma : 1;
    uint32_t connIndex : 2;
    int32_t proto : 2;
  };
  union {
    uint16_t flagBits; // raw access to the bitfields below
    struct {
      enum scclWorkP2PType p2pType : 4;
      uint16_t nWarps : 4;
      uint16_t warpStart : 4;
      uint16_t ngroups : 4;
    };
  };
  uint16_t opCount;
  // Important not to use any fields with greater than 4-byte alignment since
  // we need sizeof(scclWorkElemP2p)==28, but that would be padded up to 32 if
  // there were 8-byte fields.
  // void* buff;
  uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
  // size_t count;
  uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
  int chunkSize;
};
static_assert(((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemP2p))) / sizeof(scclWorkElemP2p)) == 8,
              "Sanity check: SCCL_MAX_WORK_ELEMENTS_P2P == 8");
// NOTE(review): the assert proves 8 elements fit, yet the limit is set to 2 —
// presumably deliberate in this port; confirm against the kernel side.
#define SCCL_MAX_WORK_ELEMENTS_P2P 2
// Collective work element extended with registered-buffer pointers for the
// collnet-direct up/down peers.
struct scclWorkElemReg {
  struct scclWorkElem elem;
  void* dnInputs[SCCL_MAX_DIRECT_ARITY + 1];
  void* dnOutputs[SCCL_MAX_DIRECT_ARITY + 1];
  void* upOutputs[SCCL_MAX_DIRECT_ARITY + 1];
};
// How many registered elements fit in one slot (computed, asserted == 1).
#define SCCL_MAX_WORK_ELEMENTS_REG ((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemReg))) / sizeof(scclWorkElemReg))
static_assert(SCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: SCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define SCCL_MAX_GROUPS (SCCL_MAX_NTHREADS / WARP_SIZE)
// One 256B work slot: a header plus a union of coll/p2p/registered payloads,
// padded to exactly SCCL_WORK_SIZE (asserted below).
struct scclWork {
  struct scclWorkHeader header;
  union {
    char pad[SCCL_WORK_SIZE - sizeof(struct scclWorkHeader)];
    struct scclWorkElem elems[SCCL_MAX_WORK_ELEMENTS];
    struct scclWorkElemP2p p2pElems[SCCL_MAX_WORK_ELEMENTS_P2P];
    struct scclWorkElemReg regElems[SCCL_MAX_WORK_ELEMENTS_REG];
  };
};
static_assert(sizeof(struct scclWork) == SCCL_WORK_SIZE, "Sanity check: sizeof(struct scclWork) == SCCL_WORK_SIZE");
static_assert(sizeof(struct scclWork) % 16 == 0, "Sanity check: sizeof(struct scclWork)%16 == 0");
struct scclDevChannelPeer {
  // Stripped version of scclChannelPeer where we only keep the scclConnInfo
  // instead of the full scclConnector.
  struct scclConnInfo send[SCCL_MAX_CONNS];
  struct scclConnInfo recv[SCCL_MAX_CONNS];
};
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
// Device-side profiling buffer: up to PROFILE_NUM_ITEMS (line, timestamp)
// probe records per launch; size is pinned to 256B by the assert below.
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct scclProf {
  uint32_t count; // number of valid entries in elem[]
  uint32_t seq; // only entry from first launch is used
  struct {
    uint64_t line : 16; // probe id / source line
    uint64_t timeStamp : 48;
  } elem[PROFILE_NUM_ITEMS];
};
static_assert(sizeof(struct scclProf) == 256, "scclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
// Record types for the device-side collective trace ring buffer. The two
// *ElemType values are flag bits combinable with the base types.
typedef enum {
  scclCollTraceNotReady = 0,
  scclCollTraceKernelLaunchType = 1,
  scclCollTraceKernelEndType = 2,
  scclCollTraceCollLaunchType = 3,
  scclCollTraceAbortType = 4,
  scclCollTraceDataType = 5,
  scclCollTraceCollElemType = (1 << 4),
  scclCollTraceP2pElemType = (1 << 5),
} scclCollTraceDataType_t;
// One 32-byte trace record (size asserted below to stay a power of two).
struct scclCollTrace {
  uint8_t type; // scclCollTraceDataType_t value (plus optional elem flags)
  uint8_t bid;
  int16_t funcIndex;
  uint32_t data_0;
  uint64_t timeStamp;
  union {
    uint64_t opCount;
    uint32_t p2pOpCount[2];
  };
  union {
    uint64_t data_1;
    struct {
      uint8_t nWarps;
      uint8_t bid;
      uint8_t nChannels;
    } coll; // payload when tracing a collective element
    struct {
      int16_t peer;
      uint8_t ngroups : 4;
      uint8_t connIndex : 4;
      uint8_t warpStart : 4;
      uint8_t nWarps : 4;
    } p2p[2]; // payload when tracing p2p elements
  };
};
static_assert(sizeof(struct scclCollTrace) == 8 * sizeof(int), "scclCollTrace must have a pow2 size");
// Trace consumer tail index, padded to its own page to avoid sharing.
union scclCollTraceTail {
  uint32_t tail;
  char padding[4096];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HCU_SDMA_FEATURE
// State of one SDMA copy queue, shared by the connections that use it
// (referenced from scclConnInfo::sdmaQueueCtx).
struct sdmaQueueContext {
  hsa_sdma_info_t* sdmaInfo; // underlying HSA SDMA queue descriptor
  uint64_t pkgIndex;
  uint32_t queueId;
  uint32_t sumSdmaCopyCount; // cumulative copies issued through SDMA
  uint32_t sumAllCopyCount; // cumulative copies overall
  uint32_t queueLock;
  uint32_t minCopySize; // copies smaller than this bypass SDMA (see INIT_PRIMS_SDMA)
  uint32_t copyCountEnable;
  uint32_t sdmaQueueDepth;
  uint32_t sdmaPkgLen;
  uint32_t sdmaQueueLen;
};
#endif
// Per-channel topology as seen from the device: peer connection table plus
// one entry per supported algorithm layout.
struct alignas(16) scclDevChannel {
  struct scclDevChannelPeer** peers;
  struct scclRing ring;
  struct scclTree tree;
  struct scclTree collnetChain;
  struct scclDirect collnetDirect;
  struct scclTree binTree;
  struct scclNvls nvls;
  uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};
// Device-resident communicator state passed to the kernels.
struct scclDevComm {
  int rank;
  int nRanks;
  int buffSizes[SCCL_NUM_PROTOCOLS]; // per-protocol buffer sizes
  // Operation list for aggregation
  int workFifoDepth;
  struct scclWork* workFifoHeap; // may be cudaHost or GDR memory
  // Flag to ask SCCL kernels to abort
  volatile uint32_t* abortFlag;
  // Channels, device side
  struct scclDevChannel* channels /*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
  NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
  struct scclCollTrace* collTrace; // device trace ring buffer
  union scclCollTraceTail* collTraceTail;
  pthread_t collTraceThread; // host thread draining the trace buffer
#endif
#ifdef ENABLE_PROFILING
  struct scclProf* devProf;
#endif
#if defined(ENABLE_TIMELINE)
  TimelineGpuEventContext* gpuEventContext;
#endif
#if defined(ENABLE_NPKIT) || defined(ENABLE_TIMELINE)
  uint64_t* cpuTimestamp;
#endif
};
// Device comm plus all channel structs packed in one 16B-aligned allocation.
struct alignas(16) scclDevCommAndChannels {
  struct scclDevComm comm;
  struct scclDevChannel channels[MAXCHANNELS];
};
// Arch value usable from host code too: 0 when not compiling device code.
#ifdef __CUDA_ARCH__
#define SCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define SCCL_CUDA_ARCH 0
#endif
// Variadic compile-time minimum: single-argument base case.
template <typename T>
__host__ __device__ constexpr T min_constexpr(T a) {
  return a;
}
// Fold the first two arguments, then recurse over the remainder.
template <typename T, typename... Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts... c) {
  return min_constexpr<T>((b < a ? b : a), c...);
}
// Variadic compile-time maximum: single-argument base case.
template <typename T>
__host__ __device__ constexpr T max_constexpr(T a) {
  return a;
}
// Fold the first two arguments, then recurse over the remainder.
template <typename T, typename... Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts... c) {
  return max_constexpr<T>((b > a ? b : a), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int scclCalcUnroll(int bytePerPack, int insns, int bytes) {
  return min_constexpr(insns, (bytes + bytePerPack - 1) / bytePerPack);
}
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to SCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int scclCollUnroll(int cudaArch = SCCL_CUDA_ARCH) {
  // Our collective unroll should move to the same bytes&insns model as NVLS.
  return cudaArch >= 800 ? 8 : 4;
}
// NVLS unroll targets: 64 in-flight bytes per iteration, at most 16 instructions.
__host__ __device__ constexpr int scclNvlsUnrollBytes(int cudaArch = SCCL_CUDA_ARCH) { return 4 * 16; }
__host__ __device__ constexpr int scclNvlsUnrollInsns(int cudaArch = SCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int scclNvlsUnroll(int bytePerPack, int cudaArch = SCCL_CUDA_ARCH) {
  return scclCalcUnroll(bytePerPack, scclNvlsUnrollInsns(cudaArch), scclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp: the worst case across all protocols,
// rounded up to a multiple of 16 bytes.
__host__ __device__ constexpr int scclShmemScratchWarpSize(int cudaArch = SCCL_CUDA_ARCH) {
  return (max_constexpr<int>(
              /*LL */ 0,
              /*LL128 */ (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * WARP_SIZE) * sizeof(uint64_t),
              /*SIMPLE*/ (scclCollUnroll(cudaArch) * WARP_SIZE + 1) * 16,
              // NVLS needs an extra 16B to read unaligned data.
              /*NVLS */ WARP_SIZE * (cudaArch >= 900 ? scclNvlsUnrollBytes(cudaArch) : 0) + 16) +
          15) &
         -16; // pad to 16 bytes
}
// The amount of dynamic shmem per block (zero below arch 700).
__host__ __device__ constexpr int scclShmemDynamicSize(int cudaArch = SCCL_CUDA_ARCH) {
  return cudaArch < 700 ? 0 : scclShmemScratchWarpSize(cudaArch) * (SCCL_MAX_NTHREADS / WARP_SIZE);
}
} // namespace sccl
#endif
#ifndef SCCL_GRAPH_H_
#define SCCL_GRAPH_H_
// #include "topo_utils.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
namespace sccl {
namespace hardware {
namespace topology {
#define MAX_XGMI_INTER_GPUS 4
// Input constraints and output of a topology search for one algorithm.
struct scclTopoGraph {
  // Input / output
  int id; // ring : 0, tree : 1, collnet : 2
  int pattern;
  int crossNic;
  int collNet;
  int minChannels;
  int maxChannels;
  // Output
  int nChannels; // number of channels the search produced
  float bwIntra;
  float bwInter;
  float latencyInter;
  int typeIntra; // intra-node path type (e.g. PATH_NVL)
  int typeInter;
  int sameChannels;
  int nHops;
  int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES]; // per-channel intra-node rank ordering
  int inter[MAXCHANNELS * 2];
  int nIntraChannels;
  int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
  char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
};
// Per-rank ring/tree endpoints for each channel, exchanged across ranks
// during setup and consumed by scclTopoPostset.
struct scclTopoRanks {
  int ringRecv[MAXCHANNELS];
  int ringSend[MAXCHANNELS];
  int ringPrev[MAXCHANNELS];
  int ringNext[MAXCHANNELS];
  int treeToParent[MAXCHANNELS];
  int treeToChild0[MAXCHANNELS];
  int treeToChild1[MAXCHANNELS];
  int nvlsHeads[MAXCHANNELS];
};
// struct sccl::hardware::topology::topo::scclTopoSystem;
// Sort the system topology structure
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
// Print the system topology structure
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Compute the paths within the system
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
// // Free the system topology structure
// void scclTopoFree(struct scclTopoSystem* system);
// // Trim the system topology structure
// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
// // Compute point-to-point channels
// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
// // Get the NVB-connected GPUs for the given rank
// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
// // Check whether all paths in the system go through NVLink
// int scclTopoPathAllNVLink(struct scclTopoSystem* system);
// // Get network device information
// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
// // Check whether a point-to-point connection exists between two devices
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
// // Check whether GDR is used
// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// // Get the intra-node network device information
// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
// // Get the link type between two CUDA devices
// scclResult_t scclTopoGetLinkType(
//     struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
// // Check whether a flush is needed
// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
// // Check whether two devices are on the same network
// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
// // Disable the PXN network
// int scclPxnDisable(struct scclComm* comm);
// // Get the intermediate ranks of the PXN network
// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
// // Get the local rank of the node
// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // Get the CPU affinity
// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// // Get CPU type information
// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// // Get the GPU count
// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// // Get the NVS count
// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// // Get local network device information
// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// // Get the local GPU index
// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// // Initialize the search; must run before scclTopoCompute
// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
// // Compute the topology graph
// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Print the topology graph
// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Dump the topology graphs
// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
// // Set the preset topology graphs
// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
// // Set the post-processed topology graphs
// scclResult_t scclTopoPostset(
//     struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
// // Tree-based post-processing of the topology graph
// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
// // Tune the model to the compute capability
// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);
// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
// #include "info.h"
// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
#include "channel.h"
#include "xml.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace graph {
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
// Fixed-capacity list of topology nodes; used as the BFS frontier by
// scclTopoSetPaths below.
struct scclTopoNodeList {
struct scclTopoNode* list[SCCL_TOPO_MAX_NODES];  // nodes in the current frontier
int count;                                       // number of valid entries in 'list'
};
// Locate, in 'node's path table, the entry describing the path towards the
// node of type 't' whose id is 'id'. Returns scclInternalError (with a
// warning) when no node of that type/id exists in the system.
static scclResult_t getPath(struct scclTopoSystem* system, struct scclTopoNode* node, int t, int64_t id, struct scclTopoLinkList** path) {
    const int count = system->nodes[t].count;
    for(int idx = 0; idx < count; idx++) {
        if(system->nodes[t].nodes[idx].id != id)
            continue;
        *path = node->paths[t] + idx;
        return scclSuccess;
    }
    WARN("Could not find node of type %d id %lx", t, id);
    return scclInternalError;
}
// Breadth-first search rooted at 'baseNode': for every node in the system,
// compute the best path (maximizing bandwidth, then minimizing hop count)
// leading back to 'baseNode'. Each node stores its result under
// paths[baseNode->type], indexed by baseNode's position in its type array.
static scclResult_t scclTopoSetPaths(struct scclTopoNode* baseNode, struct scclTopoSystem* system) {
if(baseNode->paths[baseNode->type] == NULL) {
// Lazily allocate baseNode's own path table for nodes of its type.
SCCLCHECK(scclCalloc(baseNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
// breadth-first search to set all paths to that node in the system
struct scclTopoNodeList nodeList;
struct scclTopoNodeList nextNodeList;
nodeList.count = 1;
nodeList.list[0] = baseNode;
nextNodeList.count = 0;
// The path from baseNode to itself: zero hops, local bandwidth/type.
struct scclTopoLinkList* basePath;
SCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->bw = LOC_BW;
basePath->type = PATH_LOC;
while(nodeList.count) {
nextNodeList.count = 0;
for(int n = 0; n < nodeList.count; n++) {
struct scclTopoNode* node = nodeList.list[n];
struct scclTopoLinkList* path;
// Path from the current frontier node back to baseNode.
SCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
for(int l = 0; l < node->nlinks; l++) {
struct scclTopoLink* link = node->links + l;
struct scclTopoNode* remNode = link->remNode;
if(remNode->paths[baseNode->type] == NULL) {
SCCLCHECK(scclCalloc(remNode->paths + baseNode->type, system->nodes[baseNode->type].count));
}
struct scclTopoLinkList* remPath;
SCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
// A path's bandwidth is limited by its narrowest hop.
float bw = std::min(path->bw, link->bw);
// allow routing through a GPU only as 1 hop
if(node != baseNode && node->type == GPU && (link->type != LINK_NVL || remNode->type != GPU || path->count > 1))
continue;
// Update remNode's path only when it strictly improves bandwidth and
// the existing path is either unset or longer.
if((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
// Find reverse link
for(int l = 0; l < remNode->nlinks; l++) {  // NOTE(review): shadows the outer 'l'; harmless but worth renaming.
if(remNode->links[l].remNode == node) {
remPath->list[0] = remNode->links + l;
break;
}
}
if(remPath->list[0] == NULL) {
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
remNode->type,
remNode->id,
remNode->nlinks,
node->type,
node->id);
return scclInternalError;
}
// Copy the rest of the path
for(int i = 0; i < path->count; i++)
remPath->list[i + 1] = path->list[i];
remPath->count = path->count + 1;
remPath->bw = bw;
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Differentiate between one and multiple PCI switches
if(node->type == PCI && remNode->type == PCI)
type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if(link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU))
type = PATH_PHB;
// Set 1 hop NVLink as NVB
// if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
// Overall path type is the worst (largest) type along the path.
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
// Disallow GPUs as intermediate steps for now
if(remNode->type != GPU) {
int i;
for(i = 0; i < nextNodeList.count; i++)
if(nextNodeList.list[i] == remNode)
break;
if(i == nextNodeList.count)
nextNodeList.list[nextNodeList.count++] = remNode;
}
}
}
}
// Advance the BFS frontier.
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
}
return scclSuccess;
}
/**
* 打印节点路径信息
*
* @param system 拓扑系统指针
* @param node 待打印路径的节点指针
*
* 该函数用于输出指定节点的路径信息,包括路径类型、目标节点ID、
* 路径跳数、带宽和路径类型字符串。输出格式为一行字符串。
*/
static void printNodePaths(struct scclTopoSystem* system, struct scclTopoNode* node) {
char line[1024];
sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
int offset = strlen(line);
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
if(node->paths[t] == NULL)
continue;
for(int n = 0; n < system->nodes[t].count; n++) {
sprintf(line + offset,
"%s/%lX (%d/%f/%s) ",
topoNodeTypeStr[t],
system->nodes[t].nodes[n].id,
node->paths[t][n].count,
node->paths[t][n].bw,
topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
}
}
}
// Find the CPU (NUMA node) with the fewest hops from GPU index 'gpu' and
// return its index through 'retCpu'.
static scclResult_t getLocalCpu(struct scclTopoSystem* system, int gpu, int* retCpu) {
    struct scclTopoLinkList* cpuPaths = system->nodes[GPU].nodes[gpu].paths[CPU];
    int bestCpu = -1;
    int bestHops = 0;  // 0 acts as "nothing selected yet"
    for(int c = 0; c < system->nodes[CPU].count; c++) {
        int hops = cpuPaths[c].count;
        if(bestHops == 0 || hops < bestHops) {
            bestCpu = c;
            bestHops = hops;
        }
    }
    if(bestCpu == -1) {
        WARN("Error : could not find CPU close to GPU %d", gpu);
        return scclInternalError;
    }
    *retCpu = bestCpu;
    return scclSuccess;
}
// Replace the direct path t1/i1 -> t2/i2 by a path routed through the
// intermediate node tx/ix: first the links of t1/i1 -> tx/ix, then the links
// of tx/ix -> t2/i2. The intermediate is typically a CPU (GDR/P2P fallback)
// or a GPU (PXN relay).
static scclResult_t addInterStep(struct scclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
// Named 'cpuNode' for the common case, but tx may also be GPU (PXN).
struct scclTopoNode* cpuNode = system->nodes[tx].nodes + ix;
struct scclTopoNode* srcNode = system->nodes[t1].nodes + i1;
int l = 0;
// Node 1 -> CPU
for(int i = 0; i < srcNode->paths[tx][ix].count; i++)
srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
// CPU -> Node 2
for(int i = 0; i < cpuNode->paths[t2][i2].count; i++)
srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
// Update path characteristics
srcNode->paths[t2][i2].count = l;
// Combined type is the worst of both halves; a GPU relay is marked PXN.
srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
if(tx == GPU)
srcNode->paths[t2][i2].type = PATH_PXN;
// Combined bandwidth is the narrower of the two halves.
srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
return scclSuccess;
}
// Remove/free paths for a given type
static void scclTopoRemovePathType(struct scclTopoSystem* system, int nodeType) {
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) {
// Remove links _to_ the given type
for(int n = 0; n < system->nodes[t].count; n++) {
struct scclTopoNode* node = system->nodes[t].nodes + n;
free(node->paths[nodeType]);
node->paths[nodeType] = NULL;
}
// Remove links _from_ the given type
for(int n = 0; n < system->nodes[nodeType].count; n++) {
struct scclTopoNode* node = system->nodes[nodeType].nodes + n;
free(node->paths[t]);
node->paths[t] = NULL;
}
}
}
// Mapping from the legacy numeric level setting (0..5) to PATH_* constants.
static const int levelsOldToNew[] = {PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS};
// Resolve a path-level setting from the environment, caching it in *level.
// On entry, *level == -1 means "not resolved yet" (anything else: no-op).
// On exit, *level is a PATH_* value (0 meaning disabled via 'disableEnv'),
// or -2 when neither environment variable is set.
// 'levelEnv' accepts either a PATH_* name or a legacy numeric level.
scclResult_t scclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
if(*level == -1) {
int l = -1;
// A "disable" variable set to 1 maps to level 0 (feature fully off).
if(disableEnv) {
char* str = getenv(disableEnv);
if(str) {
int disable = strtol(str, NULL, 0);
if(disable == 1)
l = 0;
}
}
if(l == -1) {
char* str = getenv(levelEnv);
if(str) {
// First try to match a PATH_* name (e.g. "PXB", "SYS").
for(int i = 0; i <= PATH_SYS; i++) {
if(strcmp(str, topoPathTypeStr[i]) == 0) {
l = i;
break;
}
}
// Old style numbering
// levelsOldToNew to is an array with each index corresponding to the
// "old level" int, and each value mapping to the correct value defined in topo.h
// maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
if(l == -1 && str[0] >= '0' && str[0] <= '9') {
int oldLevel = strtol(str, NULL, 0);
const int maxOldLevel = sizeof(levelsOldToNew) / sizeof(int) - 1;
if(oldLevel > maxOldLevel)
oldLevel = maxOldLevel;
l = levelsOldToNew[oldLevel];
}
}
}
if(l >= 0)
INFO(SCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
// -2 marks "resolved, but nothing set in the environment".
*level = l >= 0 ? l : -2;
}
return scclSuccess;
}
SCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
// Cached SCCL_NET_GDR_LEVEL setting; -1 = not resolved yet (see scclGetLevel).
int scclTopoUserGdrLevel = -1;
// Decide whether GPU Direct RDMA can be used between GPU 'busId' and NIC
// 'netDev'. 'read' distinguishes the send (read) direction, which has extra
// restrictions. *useGdr is set to 1 only when both devices support GDR and
// the GPU<->NIC path is close enough (<= the configured netGdrLevel).
scclResult_t scclTopoCheckGdr(struct scclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
*useGdr = 0;
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
struct scclTopoNode* net = system->nodes[NET].nodes + n;
SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
// Check that both the NIC and GPUs support it
if(net->net.gdrSupport == 0)
return scclSuccess;
if(gpu->gpu.gdrSupport == 0)
return scclSuccess;
if(read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = scclParamNetGdrRead();
if(gdrReadParam == 0)
return scclSuccess;
// Default (< 0): only enable GDR reads when the GPU has an NVLink peer.
if(gdrReadParam < 0) {
int nvlink = 0;
// Since we don't know whether there are other communicators,
// it's better to keep things local if we have a single GPU.
if(system->nodes[GPU].count == 1)
nvlink = 1;
for(int i = 0; i < system->nodes[GPU].count; i++) {
if(i == g)
continue;
if(gpu->paths[GPU][i].type == PATH_NVL) {
nvlink = 1;
break;
}
}
if(!nvlink)
return scclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
SCCLCHECK(scclGetLevel(&scclTopoUserGdrLevel, NULL, "SCCL_NET_GDR_LEVEL"));
if(scclTopoUserGdrLevel != -2)
netGdrLevel = scclTopoUserGdrLevel;
else {
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
// On AMD Rome, relax the level to PATH_PHB when the GPU and NIC sit under
// the same NUMA node (both 2 hops from the same CPU, matching bus segment).
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME) {
int i, d1 = -1, d2 = -1;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[GPU].nodes[g].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d1 = system->nodes[CPU].nodes[i].id;
for(i = 0; i < system->nodes[CPU].count; i++)
if(system->nodes[NET].nodes[n].paths[CPU][i].count == 2)
break;
if(i < system->nodes[CPU].count)
d2 = system->nodes[CPU].nodes[i].id;
if(d1 != -1 && d2 != -1 && d1 == d2 && (system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000)) {
netGdrLevel = PATH_PHB;
}
}
}
int distance = gpu->paths[NET][n].type;
if(distance == PATH_PXN) {
// In case of PXN, use the intermediate GPU distance instead
int proxyRank, g;  // NOTE(review): shadows the outer 'g' on purpose.
SCCLCHECK(scclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
SCCLCHECK(scclTopoRankToIndex(system, proxyRank, &g));
struct scclTopoNode* proxyGpu = system->nodes[GPU].nodes + g;
distance = proxyGpu->paths[NET][n].type;
}
if(distance > netGdrLevel) {
INFO(SCCL_NET, "GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
return scclSuccess;
}
*useGdr = 1;
INFO(SCCL_NET, "GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
return scclSuccess;
}
// Set to 0 to disable the flush on Hopper when using GDR
SCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
// Determine whether we need to flush the GDR recv buffers
scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush) {
    int gpuIndex;
    SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &gpuIndex));
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + gpuIndex;
    // Pre-Hopper parts (compute capability < 90) always require the flush;
    // newer parts follow the SCCL_NET_FORCE_FLUSH parameter (default: on).
    if(gpu->gpu.cudaCompCap < 90)
        *flush = 1;
    else
        *flush = scclParamNetForceFlush();
    return scclSuccess;
}
SCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
// Check whether going through the network would be faster than going through P2P/SHM.
scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
    // Intra-node NET can be globally disabled (default).
    if(scclParamNetDisableIntra() == 1) {
        *net = 0;
        return scclSuccess;
    }
    *net = 1;
    // First check the current GPU-to-GPU speed; if either GPU is unknown,
    // keep *net = 1.
    int g1, g2;
    if(scclTopoIdToIndex(system, GPU, id1, &g1) != scclSuccess || scclTopoIdToIndex(system, GPU, id2, &g2) != scclSuccess) {
        return scclSuccess;
    }
    struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
    struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
    float p2pBw = gpu1->paths[GPU][g2].bw;
    // Best NIC bandwidth each GPU can reach at PATH_PXB or better.
    float bestNet1 = 0;
    float bestNet2 = 0;
    for(int n = 0; n < system->nodes[NET].count; n++) {
        struct scclTopoLinkList* p1 = gpu1->paths[NET] + n;
        struct scclTopoLinkList* p2 = gpu2->paths[NET] + n;
        if(p1->type <= PATH_PXB && p1->bw > bestNet1)
            bestNet1 = p1->bw;
        if(p2->type <= PATH_PXB && p2->bw > bestNet2)
            bestNet2 = p2->bw;
    }
    // Use the network only when both GPUs reach it faster than direct P2P.
    if(!(bestNet1 > p2pBw && bestNet2 > p2pBw))
        *net = 0;
    return scclSuccess;
}
// For a PXN path from GPU 'rank' to NIC 'netDev', return the rank of the
// intermediate (relay) GPU through '*intermediateRank'; for non-PXN paths,
// return 'rank' itself.
scclResult_t scclTopoGetIntermediateRank(struct scclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
// Get GPU and NET
int n, g;
SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
SCCLCHECK(scclTopoRankToIndex(system, rank, &g));
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
struct scclTopoLinkList* path = gpu->paths[NET] + n;
if(path->type == PATH_PXN) {
// Walk the path, skipping NVSwitch hops; the first non-NVS node must be
// the relay GPU.
struct scclTopoNode* node;
int type = NVS;
for(int i = 0; i < path->count && type == NVS; i++) {
node = path->list[i]->remNode;
type = node->type;
}
if(type != GPU) {
WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
return scclInternalError;
}
*intermediateRank = node->gpu.rank;
} else {
*intermediateRank = rank;
}
return scclSuccess;
}
SCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
// remote proxies without risking deadlocks
int scclPxnDisable(struct scclComm* comm) {
static int pxnDisable = -1;
if(pxnDisable == -1) {
if(comm && scclNetVersion(comm) == 4) {
INFO(SCCL_INIT, "PXN Disabled as plugin is v4");
pxnDisable = 1;
} else {
pxnDisable = scclParamPxnDisable();
}
}
return pxnDisable;
}
// Collect the set of ranks that act as PXN relays (proxies) for our sends.
// '*intermediateRanks' is allocated here (caller frees); '*nranks' receives
// the number of distinct relay ranks (0 with a NULL array when there is no
// network or no relays).
scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks) {
struct scclTopoSystem* system = comm->topo;
*nranks = 0;
*intermediateRanks = NULL;
if(system->nodes[NET].count == 0)
return scclSuccess;
int nr = 0;
int* ranks = NULL;
for(int rank = 0; rank < comm->nRanks; rank++) {
int netDev, proxyRank;
SCCLCHECK(scclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
// Not a relay situation: we proxy for ourselves.
if(proxyRank == comm->rank)
continue;
// PXN is only useful when GDR works towards that NIC.
int useGdr;
SCCLCHECK(scclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
if(useGdr == 0)
continue;
// Deduplicate proxy ranks before appending.
int found = 0;
for(int r = 0; r < nr; r++) {
if(ranks[r] == proxyRank)
found = 1;
}
if(!found) {
SCCLCHECK(scclRealloc(&ranks, nr, nr + 1));
ranks[nr++] = proxyRank;
}
}
*nranks = nr;
*intermediateRanks = ranks;
return scclSuccess;
}
// When every GPU has a NIC whose bus id differs from the GPU's id by exactly
// 'distance', force all such GPU->NIC paths to PATH_PXB and return true;
// otherwise leave the paths untouched and return false.
static bool rcclPathOverride(struct scclTopoSystem* system, uint64_t distance) {
    auto matches = [&](int g, int j) -> bool {
        int64_t gpuId = system->nodes[GPU].nodes[g].id;
        int64_t netId = system->nodes[NET].nodes[j].net.busId;
        return (uint64_t)(netId - gpuId) == distance || (uint64_t)(gpuId - netId) == distance;
    };
    // Pass 1: verify every GPU has at least one NIC at the requested offset.
    bool allMatch = true;
    for(int g = 0; g < system->nodes[GPU].count && allMatch; g++) {
        bool found = false;
        for(int j = 0; j < system->nodes[NET].count && !found; j++)
            found = matches(g, j);
        allMatch = found;
    }
    if(!allMatch)
        return false;
    // Pass 2: apply the PXB override on every matching GPU/NIC pair.
    for(int g = 0; g < system->nodes[GPU].count; g++)
        for(int j = 0; j < system->nodes[NET].count; j++)
            if(matches(g, j))
                system->nodes[GPU].nodes[g].paths[NET][j].type = PATH_PXB;
    return true;
}
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);
// Trim the topology down to what this communicator actually uses:
//  - remove GPUs outside our P2P/SHM-reachable connectivity domain,
//  - zero out and remove slower duplicate ports of multi-port NICs,
//  - remove all NICs when the job is single-node and NET is not needed.
// Also sets system->type flags (XGMI_ALL / GDR_ALL / FORCE_INTRA) and
// comm->localRanks.
scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm) {
int* domains;
int64_t* ids;
SCCLCHECK(scclCalloc(&domains, system->nodes[GPU].count));
SCCLCHECK(scclCalloc(&ids, system->nodes[GPU].count));
// Union GPUs into connectivity domains: two GPUs share a domain when their
// path is better than PATH_NET; domains[g] is the smallest index in the domain.
int myDomain = 0;
for(int g = 0; g < system->nodes[GPU].count; g++) {
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
domains[g] = g;
ids[g] = gpu->id;
for(int p = 0; p < g; p++) {
if(gpu->paths[GPU][p].type < PATH_NET) {
domains[g] = std::min(domains[g], domains[p]);
}
}
if(gpu->gpu.rank == comm->rank)
myDomain = domains[g];
}
// Remove every GPU that is not in our own domain.
int ngpus = system->nodes[GPU].count;
for(int i = 0; i < ngpus; i++) {
if(domains[i] == myDomain)
continue;
struct scclTopoNode* gpu = NULL;
int g;
for(g = 0; g < system->nodes[GPU].count /* This one varies over the loops */; g++) {
gpu = system->nodes[GPU].nodes + g;
if(gpu->id == ids[i])
break;
else
gpu = NULL;
}
if(gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return scclInternalError;
}
SCCLCHECK(scclTopoRemoveNode(system, GPU, g));
}
// trim low speed port on same NIC
for(int i = 0; i < system->nodes[NET].count; i++) {
for(int j = 0; j < system->nodes[NET].count; j++) {
if(i == j)
continue;
// Same ASIC means same physical NIC: keep only the fastest port,
// mark the slower one with bw == 0 so the loop below removes it.
if(system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
if(system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw)
system->nodes[NET].nodes[j].net.bw = 0;
}
}
}
// Remove every NET node previously marked with bw == 0.
do {
int n;
for(n = 0; n < system->nodes[NET].count; n++) {
if(system->nodes[NET].nodes[n].net.bw == 0)
break;
}
if(n < system->nodes[NET].count) {
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
} else
break;
} while(system->nodes[NET].count);
int remove = 1;
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for(int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for(int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if(i == j)
continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
SCCLCHECK(scclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if(allXgmi)
system->type |= RCCL_TOPO_XGMI_ALL;
// Check that GDR works from every GPU towards its closest NIC.
for(int g = 0; g < system->nodes[GPU].count; g++) {
int net;
SCCLCHECK(scclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if(!gdr)
break;
}
// Keep the NICs when GDR works everywhere and the node is not fully XGMI.
if(gdr && !allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(SCCL_LOG_TOPO, "GDR is available on all GPUs");
}
// Special handling of gfx94x
if(rcclParamEnableIntranet() == 1 || (rcclParamEnableIntranet() == -2 && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
system->nodes[GPU].count == 8 && system->nodes[NET].count == 8)) {
remove = 0;
system->type |= RCCL_TOPO_FORCE_INTRA;
}
comm->localRanks = system->nodes[GPU].count;
// Single-node job and NICs not needed: drop them all.
if(system->nodes[GPU].count == comm->nRanks && remove) {
for(int n = system->nodes[NET].count - 1; n >= 0; n--)
SCCLCHECK(scclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return scclSuccess;
}
// Release every path table attached to the system, then the system itself.
void scclTopoFree(struct scclTopoSystem* system) {
    for(int type = 0; type < SCCL_TOPO_NODE_TYPES; type++) {
        scclTopoRemovePathType(system, type);
    }
    free(system);
}
SCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
SCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", 4);
// Compute how many P2P channels to use between local GPU 'g' and 'peerRank'.
// *nChannels is -1 for self, an XGMI/PCIe-derived count for peers on this
// node, and SCCL_NCHANNELS_PER_NET_PEER for remote (network) peers.
static scclResult_t scclTopoGetNchannels(struct scclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
int peer;
struct scclTopoLinkList* path = NULL;
if(scclTopoRankToIndex(system, peerRank, &peer) == scclSuccess) {
// Same rank
if(g == peer) {
*nChannels = -1;
return scclSuccess;
}
// Local rank
path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
if(path->type == PATH_NVL) {
// Scale with the path bandwidth relative to a single XGMI link;
// gfx94x uses a higher base multiplier (4 instead of 2).
float nvlBw = scclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
*nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2) * std::max(1, (int)(path->bw / nvlBw));
} else {
*nChannels = 2;
}
} else {
// Remote rank, use network
*nChannels = scclParamNChannelsPerNetPeer();
}
return scclSuccess;
}
SCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 4);
SCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
// Smallest power of two >= v (returns 1 for v <= 1).
static int nextPow2(int v) {
    int p = 1;
    for(; p < v; p <<= 1) {
    }
    return p;
}
// Compute comm->p2pnChannels / p2pnChannelsPerPeer and fill comm->p2pChannels
// with a bit-mirrored channel permutation that spreads P2P traffic across the
// channel space. Honors SCCL_MIN/MAX_P2P_NCHANNELS and SCCL_NCHANNELS_PER_PEER,
// with higher defaults (32) when all GPU pairs are NVLink/XGMI connected.
scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm) {
/* here we already honor comm->max/minCTAs for p2pnChannels. */
int MinP2pNchannels = (int)scclParamMinP2pNChannels();
int MaxP2pNchannels = (int)scclParamMaxP2pNChannels();
int NchannelsPerPeer = (int)scclParamNChannelsPerPeer();
// All-NVLink systems default to 32 channels unless explicitly overridden.
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_P2P_NCHANNELS") == NULL)
MinP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_P2P_NCHANNELS") == NULL)
MaxP2pNchannels = 32;
if(scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_NCHANNELS_PER_PEER") == NULL)
NchannelsPerPeer = 32;
int scclMinP2pNchannels = MinP2pNchannels;
// Non-owners of the shared resources are additionally capped by the owner's
// P2P channel count.
if(comm->sharedRes->owner != comm) {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, scclMinP2pNchannels), comm->sharedRes->tpP2pNChannels);
} else {
comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
comm->p2pnChannels = std::max(comm->p2pnChannels, scclMinP2pNchannels);
}
int minChannels = comm->p2pnChannels;
// We need to loop through all local GPUs to have a global picture
for(int g = 0; g < comm->topo->nodes[GPU].count; g++) {
for(int r = 0; r < comm->nRanks; r++) {
int nChannels;
SCCLCHECK(scclTopoGetNchannels(comm->topo, g, r, &nChannels));
if(nChannels >= 0)
minChannels = std::min(minChannels, nChannels);
}
}
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));
// Round to next pow2 nChannelsPerPeer and nChannels
if(getNumaMaxGpus() == 1 && !scclTopoPathAllNVLink(comm->topo)) {
comm->p2pnChannelsPerPeer = nextPow2(comm->p2pnChannels);
} else {
// NchannelsPerPeer == -2 means "auto": derive from the per-peer minimum.
comm->p2pnChannelsPerPeer = (NchannelsPerPeer == -2 ? nextPow2(minChannels) : NchannelsPerPeer);
}
comm->p2pnChannels = nextPow2(comm->p2pnChannels);
// Init channels that weren't used so far
for(int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++)
SCCLCHECK(initChannel(comm, c));
// We want to spread channels used when there aren't many and progressively
// fill the whole space of nChannels. To do so we mirror the bits in the
// nChannels space.
for(int c = 0; c < comm->p2pnChannels; c++) {
int mirror = 0;
for(int b = 1, mb = (comm->p2pnChannels >> 1); b < comm->p2pnChannels; b <<= 1, mb >>= 1)
if(c & b)
mirror |= mb;
comm->p2pChannels[c] = mirror;
}
return scclSuccess;
}
// Collect the ranks of all GPUs reachable from 'rank' over a PATH_NVB path.
// '*ranks' is allocated here (sized for all GPUs) and owned by the caller;
// '*nranks' receives the number of valid entries.
scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks) {
    int ngpus = system->nodes[GPU].count;
    SCCLCHECK(scclCalloc(ranks, ngpus));
    int count = 0;
    for(int g = 0; g < ngpus; g++) {
        struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
        // Only scan the path table of our own GPU.
        if(gpu->gpu.rank != rank)
            continue;
        for(int p = 0; p < ngpus; p++)
            if(gpu->paths[GPU][p].type == PATH_NVB)
                (*ranks)[count++] = system->nodes[GPU].nodes[p].gpu.rank;
    }
    *nranks = count;
    return scclSuccess;
}
int scclTopoPathAllNVLink(struct scclTopoSystem* system) {
int minPath = PATH_DIS;
for(int i = 0; i < system->nodes[GPU].count; i++) {
struct scclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
for(int j = 0; j < system->nodes[GPU].count; j++) {
if(i == j)
continue;
minPath = std::min(minPath, paths[j].type);
}
}
return minPath >= PATH_PIX ? 0 : 1;
}
} // namespace graph
// Dump the computed path tables of every GPU node, then every NIC node.
scclResult_t scclTopoPrintPaths(struct scclTopoSystem* system) {
    for(int g = 0; g < system->nodes[GPU].count; g++)
        graph::printNodePaths(system, system->nodes[GPU].nodes + g);
    for(int n = 0; n < system->nodes[NET].count; n++)
        graph::printNodePaths(system, system->nodes[NET].nodes + n);
    return scclSuccess;
}
// Cached SCCL_P2P_LEVEL setting; -1 = not resolved yet (see scclGetLevel).
int scclTopoUserP2pLevel = -1;
// Decide whether P2P can be used between GPUs 'id1' and 'id2'.
// *p2p is set to 1 when the path type is within the allowed p2pLevel;
// *read (optional) is set when P2P read is beneficial; *intermediateRank
// (optional) receives the relay GPU rank for 2-hop routed paths, else -1.
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank) {
*p2p = 0;
if(read)
*read = 0;
if(intermediateRank)
*intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
SCCLCHECK(scclTopoIdToIndex(system, GPU, id1, &g1));
struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
if(scclTopoIdToIndex(system, GPU, id2, &g2) == scclInternalError) {
// GPU not found, we can't use p2p.
return scclSuccess;
}
int intermediateIndex = -1;
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct scclTopoLinkList* path = gpu1->paths[GPU] + g2;
if(path->count == 2) {
struct scclTopoNode* intermediateNode = path->list[0]->remNode;
if(intermediateNode->type == GPU) {
intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
if(intermediateRank)
*intermediateRank = intermediateNode->gpu.rank;
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
// User override
if(scclTopoUserP2pLevel == -1)
SCCLCHECK(scclGetLevel(&scclTopoUserP2pLevel, "SCCL_P2P_DISABLE", "SCCL_P2P_LEVEL"));
// -2 means "no environment override": fall through to CPU-arch heuristics.
if(scclTopoUserP2pLevel != -2) {
p2pLevel = scclTopoUserP2pLevel;
goto compare;
}
// Don't use P2P through ARM CPUs
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
if(arch == SCCL_TOPO_CPU_ARCH_ARM)
p2pLevel = PATH_PXB;
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
p2pLevel = PATH_PXB;
}
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
if(path->type <= p2pLevel)
*p2p = 1;
if(path->type == PATH_NVL) {
struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
// Enable P2P Read for Ampere/NVLink only
if(read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80))
*read = 1;
}
return scclSuccess;
}
// (Re)compute all path tables of the system: BFS from every CPU, GPU, NIC
// and NVSwitch, then post-process for P2P/GDR restrictions, PXN relays and
// platform-specific overrides. 'comm' may be NULL (topology-only mode), in
// which case peer reachability is not checked.
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm) {
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
for(int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
graph::scclTopoRemovePathType(system, t);
// Set direct paths to CPUs. We need them in many cases.
for(int c = 0; c < system->nodes[CPU].count; c++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[CPU].nodes + c, system));
}
// Set direct paths to GPUs.
for(int g = 0; g < system->nodes[GPU].count; g++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[GPU].nodes + g, system));
}
// Set direct paths to NICs.
for(int n = 0; n < system->nodes[NET].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NET].nodes + n, system));
}
// Set direct paths to NVSwitches.
for(int n = 0; n < system->nodes[NVS].count; n++) {
SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NVS].nodes + n, system));
}
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
for(int g = 0; g < system->nodes[GPU].count; g++) {
for(int p = 0; p < system->nodes[GPU].count; p++) {
int p2p;
SCCLCHECK(scclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
if(p2p == 0) {
// Divert all traffic through the CPU
int cpu;
SCCLCHECK(getLocalCpu(system, g, &cpu));
SCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
if(comm == NULL)
continue;
// Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
struct scclPeerInfo* dstInfo = comm->peerInfo + system->nodes[GPU].nodes[g].gpu.rank;
for(int p = 0; p < system->nodes[GPU].count; p++) {
if(p == g)
continue;
struct scclPeerInfo* srcInfo = comm->peerInfo + system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
SCCLCHECK(scclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if(p2p == 0) {
int shm;
SCCLCHECK(scclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
if(shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
}
}
}
}
// Special handling of gfx94x
// (when TOPO_EXPL is not defined, only applied under a Hyper-V UEFI BIOS)
#if !defined(TOPO_EXPL)
char strValue[1024];
SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
if(strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
int arch, vendor, model;
SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
// Force PXB GPU->NIC paths on matching Intel + gfx94x platforms, trying
// the two known GPU/NIC bus-id offsets in turn.
if(arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
(system->nodes[GPU].count != system->nRanks))) {
if(!rcclPathOverride(system, 0x100000))
rcclPathOverride(system, 0x1000);
}
#if !defined(TOPO_EXPL)
}
#endif
// Update paths for NICs (no GPU Direct, PXN, ...)
for(int n = 0; n < system->nodes[NET].count; n++) {
struct scclTopoNode* netNode = system->nodes[NET].nodes + n;
for(int g = 0; g < system->nodes[GPU].count; g++) {
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
if(scclPxnDisable(comm) != 1) {
int localGpuIndex;
SCCLCHECK(scclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
if(localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct scclTopoNode* peerNode = system->nodes[GPU].nodes + localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if(peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
SCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
// Update path when we dont want to / can't use GPU Direct RDMA.
int gdr;
SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
if(gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
SCCLCHECK(getLocalCpu(system, g, &localCpu));
SCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
SCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
return scclSuccess;
}
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN + 5 * MAXWIDTH)
// Log one line containing up to MAXWIDTH integer values preceded by 'prefix'.
// NOTE(review): assumes strlen(prefix) <= PREFIXLEN; a longer prefix would
// make the sprintf below start past the strncpy'd region — confirm callers.
void dumpLine(int* values, int nranks, const char* prefix) {
int prefixlen = strlen(prefix);
char line[STRLENGTH + 1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
// Each value takes 4 characters (" %3d"); sprintf re-terminates the line.
for(int i = 0; i < nranks && i < MAXWIDTH; i++)
sprintf(line + prefixlen + 4 * i, " %3d", values[i]);
INFO(SCCL_INIT, "%s", line);
}
// Build 'nrings' rings of 'nranks' ranks each by following the 'next'
// successor table starting at our own 'rank', then validate that every ring
// is a proper cycle containing all ranks. Rank 0 logs each ring. 'prev' is
// only used by the (commented-out) debug dump and kept for the caller ABI.
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
    for(int r = 0; r < nrings; r++) {
        int* ring = rings + r * nranks;
        /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
        dumpLine(prev+r*nranks, nranks, prefix);
        sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
        dumpLine(next+r*nranks, nranks, prefix);*/
        // Walk the successor chain starting at our rank.
        int cur = rank;
        for(int i = 0; i < nranks; i++) {
            ring[i] = cur;
            cur = next[r * nranks + cur];
        }
        char prefix[40];
        sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
        if(rank == 0)
            dumpLine(ring, nranks, prefix);
        // After nranks steps we must be back where we started.
        if(cur != rank) {
            WARN("Error : ring %d does not loop back to start (%d != %d)", r, cur, rank);
            return scclInternalError;
        }
        // Check that all ranks are there
        for(int target = 0; target < nranks; target++) {
            bool present = false;
            for(int j = 0; j < nranks; j++) {
                if(ring[j] == target) {
                    present = true;
                    break;
                }
            }
            if(!present) {
                WARN("Error : ring %d does not contain rank %d", r, target);
                return scclInternalError;
            }
        }
    }
    return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include <algorithm>
#include <string.h>
#include "rome_models.h"
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
// Static description of one known "Rome" machine topology. Detection code
// matches the discovered system against these tables (see the rome_model_*
// initializers below for the encoding conventions).
struct scclRomeModel {
int nGpus; // number of GPUs in this model
int nCpus; // number of CPU (NUMA) nodes
int nNics; // number of NICs (0 when the model has none)
int nLinks; // inter-GPU links per GPU
int64_t gpuIds[SCCL_TOPO_MAX_NODES]; // GPU PCI bus IDs, in model order
int64_t nicIds[SCCL_TOPO_MAX_NODES]; // NIC PCI bus IDs, in model order
int64_t gpuNuma[SCCL_TOPO_MAX_NODES]; // NUMA node index for each GPU
int64_t nicNuma[SCCL_TOPO_MAX_NODES]; // NUMA node index for each NIC
uint8_t connMatrix[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES]; // nGpus x nGpus GPU link matrix, row-major; nonzero values look like link counts (0/1/2/4 in the data) — TODO confirm
uint8_t gdrLevel[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES]; // PATH_* level for each GPU/NIC pair; initializers contain nGpus x nNics entries
const char* pattern; // one <gpuCount><nicCount> digit pair per NUMA node (tallies match gpuNuma/nicNuma in the initializers below)
const char* ringBase; // '|'-separated ring orderings of GPU indices; "N<i>" tokens name NIC endpoints
const char* options; // comma-separated key=value tuning flags (e.g. "tuning=1,treeDefined=1"); empty string if none
const char* treeBase; // '|'-separated tree layouts; set only by models whose options include treeDefined=1
};
// 8 GPUs over 4 NUMA nodes (pattern 1/3/2/2), one NIC on NUMA 2 (GDR at
// PATH_PHB from GPUs 3 and 6, PATH_SYS elsewhere), 2 links per GPU, two rings.
static struct scclRomeModel rome_model_22 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 2,
.gpuIds =
{
0x3000,
0x43000,
0x26000,
0xc3000,
0x83000,
0x23000,
0xc6000,
0xa3000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
0,
1,
2,
3,
1,
2,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
.options = "",
};
static struct scclRomeModel rome_model_25 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
.options = "",
};
static struct scclRomeModel rome_model_27 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma =
{
0,
3,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11303011",
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_29 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
.options = "",
};
static struct scclRomeModel rome_model_31 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
.options = "",
};
static struct scclRomeModel rome_model_33 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 2,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds =
{
0x61000,
0xa1000,
},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma =
{
0,
6,
},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "0110201010200110",
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
.options = "",
};
// 8 GPUs spread over 8 NUMA nodes (nodes 1,2,3,4,5,7 populated), no NICs,
// 2 links per GPU, two rings.
static struct scclRomeModel rome_model_30 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5",
.options = "",
};
static struct scclRomeModel rome_model_32 = {
.nGpus = 8,
.nCpus = 8,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
4,
5,
5,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "0010201010200010",
.ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_24 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6",
.options = "",
};
static struct scclRomeModel rome_model_26 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xe3000,
0xc3000,
0xc6000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
2,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
},
.gdrLevel = {},
.pattern = "10303010",
.ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4",
.options = "",
};
static struct scclRomeModel rome_model_23 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1",
.options = "",
};
static struct scclRomeModel rome_model_38 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 2,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6",
.options = "",
};
static struct scclRomeModel rome_model_28 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10302020",
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
.options = "",
};
static struct scclRomeModel rome_model_40 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
0,
1,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "10302120",
.ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
.options = "",
};
static struct scclRomeModel rome_model_42 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
.options = "",
};
static struct scclRomeModel rome_model_44 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds =
{
0xc4000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
2,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_PHB,
PATH_SYS,
PATH_SYS,
},
.pattern = "20202120",
.ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
.options = "",
};
static struct scclRomeModel rome_model_45 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "10201000201010",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "",
};
static struct scclRomeModel rome_model_46 = {
.nGpus = 8,
.nCpus = 7,
.nNics = 1,
.nLinks = 3,
.gpuIds =
{
0x43000,
0x23000,
0x26000,
0x3000,
0xc3000,
0xc6000,
0xa3000,
0x83000,
},
.nicIds =
{
0xe1000,
},
.gpuNuma =
{
1,
2,
2,
3,
5,
5,
6,
7,
},
.nicNuma =
{
4,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
},
.pattern = "10201001201010",
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_48 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
.options = "",
};
static struct scclRomeModel rome_model_49 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma =
{
0,
1,
2,
3,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
// Single-NUMA box: all 8 GPUs on node 0, no NICs, 3 links per GPU, four rings.
static struct scclRomeModel rome_model_52 = {
.nGpus = 8,
.nCpus = 1,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc5000,
0xc9000,
0xcd000,
0xd1000,
0xd5000,
0xd9000,
0xdd000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "80",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_53 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x4a000,
0x50000,
0xa000,
0xf000,
0xcb000,
0xd1000,
0x8a000,
0x90000,
},
.nicIds =
{
0x45000,
0x13000,
0xc6000,
0x85000,
},
.gpuNuma =
{
1,
1,
3,
3,
5,
5,
7,
7,
},
.nicNuma =
{
1,
3,
5,
7,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
.options = "",
};
static struct scclRomeModel rome_model_43 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x63000,
0x43000,
0x27000,
0x3000,
0xe3000,
0xc3000,
0xa3000,
0x83000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 "
"7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
.options = "treeDefined=1",
.treeBase =
"(2(5(6(7(4))))(3(0(1))))|(2(5(7(6(4))))(0(1(3))))|(2(5(7(4(6))))(1(3(0))))|(6(1(0(2(3))))(7(4(5))))|(6(1(2(0(3))))(4(5(7))))|(6(1(0(3(2))))(5(7(4))))|"
"(1(6(7(5(4))))(2(3(0))))|(1(6(4(7(5))))(3(2(0))))|(1(6(5(4(7))))(3(0(2))))|(5(2(3(1(0))))(4(6(7))))|(5(2(0(3(1))))(6(4(7))))|(5(2(1(0(3))))(4(7(6))))",
};
static struct scclRomeModel rome_model_55 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x100000,
0x200000,
0x300000,
0x400000,
0x500000,
0x600000,
0x700000,
0x800000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|2 3 0 1 6 7 4 5|5 4 7 6 1 0 3 2",
.options = "",
};
static struct scclRomeModel rome_model_56 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 0,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel = {},
.pattern = "40404040",
.ringBase = "0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4|0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4|0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1|4 5 13 12 8 9 11 10 14 15 7 "
"6 2 3 1 0|4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0|1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0",
.options = "pivotA2AEnabled=1,pivotA2ANumBiRings=3,tuning=1,mscclEnabled=1,treeDefined=1",
.treeBase = "(0(1(3(2(6(7(15(14(10))))))))(4(5(13(12(8(9(11))))))))|(2(3(7(6(13(12(8(9(10))))))))(1(0(4(5(14(15(11))))))))|(14(15(11(10(8(9(13(12(4))))))))"
"(6(7(3(2(0(1(5))))))))|(10(11(9(8(12(13(5(4(0))))))))(14(15(7(6(2(3(1))))))))|(10(11(15(14(5(4(0(1(2))))))))(9(8(12(13(6(7(3))))))))|(4(5(1(0("
"2(3(7(6(14))))))))(12(13(9(8(10(11(15))))))))|(6(7(15(14(10(11(9(8(12))))))))(2(3(1(0(4(5(13))))))))|(13(12(8(9(10(11(15(14(5))))))))(6(7(3(2("
"1(0(4))))))))|(8(9(13(12(4(5(1(0(2))))))))(10(11(15(14(6(7(3))))))))|(12(13(5(4(0(1(3(2(6))))))))(8(9(11(10(14(15(7))))))))|(5(4(0(1(2(3(7(6("
"13))))))))(14(15(11(10(9(8(12))))))))|(2(3(7(6(14(15(11(10(8))))))))(0(1(5(4(12(13(9))))))))",
};
// 8 GPUs over 3 NUMA nodes (4 on node 0, 2 on node 1, 2 on node 3), no NICs,
// 3 links per GPU, four rings.
static struct scclRomeModel rome_model_58 = {
.nGpus = 8,
.nCpus = 3,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
0,
0,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "402020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_59 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
static struct scclRomeModel rome_model_62 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_63 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_65 = {
.nGpus = 16,
.nCpus = 4,
.nNics = 8,
.nLinks = 4,
.gpuIds =
{
0x4e000,
0x51000,
0x56000,
0x59000,
0xe000,
0x11000,
0x16000,
0x19000,
0xcf000,
0xd2000,
0xd7000,
0xda000,
0x8f000,
0x92000,
0x97000,
0x9a000,
},
.nicIds =
{
0x4b000,
0x5a000,
0xb000,
0x1a000,
0xcc000,
0xdb000,
0x8c000,
0x9b000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
2,
2,
2,
2,
3,
3,
3,
3,
},
.nicNuma =
{
0,
0,
1,
1,
2,
2,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "42424242",
.ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
"14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
"0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
"4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
"10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
"2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
"15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
"3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
"N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
.options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
// 8 GPUs on 2 NUMA nodes (4 on node 1, 4 on node 3), no NICs, 3 links per
// GPU; connMatrix carries mixed link counts (1/2/4). Options disable NUMA
// matching and select tuning profile 2.
static struct scclRomeModel rome_model_66 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds = {},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 6 7 5 4 2 3 1|1 3 2 4 5 7 6 0|0 1 7 6 2 3 5 4|4 5 3 2 6 7 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_67 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x29000,
0x2c000,
0x2f000,
0x32000,
0xad000,
0xb0000,
0xb3000,
0xb6000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa1000,
0xa2000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
3,
3,
},
.connMatrix =
{
0, 4, 0, 0, 2, 0, 1, 0, 4, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 4, 1, 0, 2, 0, 0, 1, 4, 0, 0, 1, 0, 0,
2, 0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
},
.pattern = "4242",
.ringBase = "N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N1 2 3 5 4 0 1 7 6 N3|N2 4 5 3 2 6 7 1 0 N0|N1 3 2 4 5 7 6 0 1 N0|N0 1 0 6 7 5 4 2 3 N1|N0 0 1 7 "
"6 2 3 5 4 N2|N3 6 7 1 0 4 5 3 2 N1",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_68 = {
.nGpus = 16,
.nCpus = 1,
.nNics = 16,
.nLinks = 3,
.gpuIds =
{
0xcf000,
0xd4000,
0xd5000,
0xd6000,
0xd0000,
0xd1000,
0xd2000,
0xd3000,
0xf0000,
0xf1000,
0xf2000,
0xf3000,
0xf4000,
0xf5000,
0xf6000,
0xf7000,
},
.nicIds =
{
0xcd000,
0xc8000,
0xc9000,
0xcb000,
0xcc000,
0xce000,
0xc7000,
0xca000,
0xe8000,
0xe9000,
0xea000,
0xeb000,
0xec000,
0xed000,
0xee000,
0xef000,
},
.gpuNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.nicNuma =
{
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PIX,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB,
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PIX, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX,
},
.pattern = "@@",
.ringBase = "N0 0 1 2 3 N3 N4 4 5 6 7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 "
"N2 N5 5 7 4 6 N6 N9 9 11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1|N0 0 1 2 3 N3 N4 4 5 6 "
"7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 N2 N5 5 7 4 6 N6 N9 9 "
"11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_71 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 2 3 7 6|6 7 3 2 4 5 1 0",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_72 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x1d000,
0x1e000,
0xa0000,
0xa1000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
1,
1,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
},
.pattern = "4242",
.ringBase = "N0 0 1 3 2 4 5 7 6 N3|N1 2 3 1 0 6 7 5 4 N2|N3 7 6 0 1 5 4 2 3 N1|N0 1 0 6 7 3 2 4 5 N2|N2 4 5 7 6 0 1 3 2 N1|N3 6 7 5 4 2 3 1 0 N0|N2 5 4 2 "
"3 7 6 0 1 N0|N1 3 2 4 5 1 0 6 7 N3",
.options = "disableNumaMatching=1,tuning=2",
};
static struct scclRomeModel rome_model_73 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 0,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds = {},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma = {},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel = {},
.pattern = "20202020",
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
.options = "",
};
static struct scclRomeModel rome_model_74 = {
.nGpus = 8,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0xc1000,
0xc6000,
0xc9000,
0xce000,
0xd1000,
0xd6000,
0xd9000,
0xde000,
},
.nicIds =
{
0xc5000,
0xcd000,
0xd5000,
0xdd000,
},
.gpuNuma =
{
3,
3,
1,
1,
0,
0,
2,
2,
},
.nicNuma =
{
3,
1,
0,
2,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
},
.pattern = "21212121",
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
"1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
.options = "tuning=3",
};
static struct scclRomeModel rome_model_76 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 3,
.gpuIds =
{
0x32000,
0x35000,
0x11000,
0x14000,
0xae000,
0xb3000,
0x8e000,
0x93000,
},
.nicIds =
{
0x26000,
0x2d000,
0x5000,
0xc000,
0xab000,
0xb4000,
0x8b000,
0x94000,
},
.gpuNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.nicNuma =
{
1,
1,
1,
1,
3,
3,
3,
3,
},
.connMatrix =
{
0, 4, 1, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 1,
0, 0, 2, 0, 0, 4, 1, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 4, 0, 0, 0, 1, 0, 1, 4, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 3 2 4 5 7 6 N6|N2 2 3 1 0 6 7 5 4 N4|N5 5 4 2 3 7 6 0 1 N1|N1 1 0 6 7 3 2 4 5 N5|N4 4 5 7 6 0 1 3 2 N2|N2 2 3 1 0 6 7 5 4 N4|N0 0 1 5 "
"4 2 3 7 6 N6|N3 3 2 4 5 1 0 6 7 N7|N4 4 5 7 6 0 1 3 2 N2|N6 6 7 5 4 2 3 1 0 N0|N7 7 6 0 1 5 4 2 3 N3|N6 6 7 3 2 4 5 1 0 N0|N3 3 2 0 1 5 4 6 7 "
"N7|N1 1 0 2 3 7 6 4 5 N5|N5 5 4 6 7 3 2 0 1 N1|N7 7 6 4 5 1 0 2 3 N3",
.options = "disableNumaMatching=1,tuning=3",
};
static struct scclRomeModel rome_model_79 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 0,
.nLinks = 7,
.gpuIds =
{
0x1d000,
0x2e000,
0x3f000,
0x61000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds = {},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma = {},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel = {},
.pattern = "4040",
.ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 "
"0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
.options = "noCpuCheck=1,mscclEnabled=1",
};
static struct scclRomeModel rome_model_80 = {
.nGpus = 4,
.nCpus = 4,
.nNics = 4,
.nLinks = 3,
.gpuIds =
{
0x82000,
0xc2000,
0x2000,
0x42000,
},
.nicIds =
{
0x81000,
0xc1000,
0x1000,
0x41000,
},
.gpuNuma =
{
2,
3,
0,
1,
},
.nicNuma =
{
2,
3,
0,
1,
},
.connMatrix =
{
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
2,
2,
2,
2,
0,
},
.gdrLevel =
{
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_SYS,
PATH_PHB,
},
.pattern = "11111111",
.ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 "
"N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
.options = "",
};
static struct scclRomeModel rome_model_81 = {
.nGpus = 8,
.nCpus = 2,
.nNics = 8,
.nLinks = 7,
.gpuIds =
{
0xc000,
0x22000,
0x38000,
0x5c000,
0x9f000,
0xaf000,
0xbf000,
0xdf000,
},
.nicIds =
{
0x7000,
0x1d000,
0x33000,
0x57000,
0x9a000,
0xaa000,
0xba000,
0xda000,
},
.gpuNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.nicNuma =
{
0,
0,
0,
0,
1,
1,
1,
1,
},
.connMatrix =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
},
.gdrLevel =
{
PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS,
PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB,
PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB,
},
.pattern = "4444",
.ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 "
"4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5|N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 "
"N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5",
.options = "noCpuCheck=1,mscclEnabled=1",
};
// All known reference topologies. parseRome4P2H() scans this table in order and
// uses the first model whose GPU/NIC layout can be permuted onto the live system.
static struct scclRomeModel romeTopoModels[] = {
    rome_model_22, rome_model_25, rome_model_27, rome_model_29, rome_model_31, rome_model_33, rome_model_30, rome_model_32, rome_model_24,
    rome_model_26, rome_model_23, rome_model_38, rome_model_28, rome_model_40, rome_model_42, rome_model_44, rome_model_45, rome_model_46,
    rome_model_48, rome_model_49, rome_model_52, rome_model_53, rome_model_43, rome_model_55, rome_model_56, rome_model_58, rome_model_59,
    rome_model_62, rome_model_63, rome_model_65, rome_model_66, rome_model_67, rome_model_68, rome_model_71, rome_model_72, rome_model_73,
    rome_model_74, rome_model_76, rome_model_79, rome_model_80, rome_model_81,
};
/* Parse user defined rings. Format is like :
* "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
* Network interfaces can be optionally specified by N prefix.
* Rings with a non-matching number of gpus are ignored so we can provide
* rings for multiple cases.
*/
// Parse a user-defined ring string (see format comment above) into channels on
// 'graph'.
// - str:     '|'-separated channels; each lists GPU device numbers, optionally
//            bracketed by 'N<idx>' network interfaces.
// - gpu_map: optional remapping applied to parsed GPU numbers before rank lookup.
// - net_map: optional remapping applied to parsed NET indices.
// Channels whose GPU count does not match the system, that contain out-of-range
// or duplicate GPUs, or that have an odd NET count are silently skipped so a
// single string can cover several system shapes.
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map) {
    int gpus[SCCL_TOPO_MAX_NODES];
    int nChannels = 0;
    int gpu = 0;
    int offset = 0;
    int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET, 3: inside NET
    int nets[SCCL_TOPO_MAX_NODES * 2];
    int net_offset = 0, net_count = 0;
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    do {
        if(str[offset] == 'N') {
            if(status == 0) {
                status = 2;
            }
        } else {
            int digit = str[offset] - '0';
            if(digit >= 0 && digit <= 9) {
                switch(status) {
                    case 0:
                        // Starting a new GPU number. Guard the write: 'gpu' may already
                        // equal SCCL_TOPO_MAX_NODES after the previous number was closed
                        // (the post-increment check below uses '>', so without this guard
                        // gpus[SCCL_TOPO_MAX_NODES] could be written out of bounds).
                        if(gpu >= SCCL_TOPO_MAX_NODES)
                            goto end;
                        gpus[gpu] = digit;
                        status = 1;
                        break;
                    case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
                    case 2:
                        // NET indices are stored biased by 'N' so they remain
                        // distinguishable from GPU numbers. Same bounds guard as above,
                        // against the capacity of nets[].
                        if(net_offset >= SCCL_TOPO_MAX_NODES * 2)
                            goto end;
                        nets[net_offset] = digit + 'N';
                        status = 3;
                        break;
                    case 3: nets[net_offset] = (nets[net_offset] - 'N') * 10 + digit + 'N'; break;
                }
            } else {
                if(status == 1) {
                    gpu++;
                    net_offset = 2 * gpu - 1; // a trailing NET for gpu k-1 lands at slot 2k-1
                    if(gpu > SCCL_TOPO_MAX_NODES)
                        goto end;
                } else if(status == 2 || status == 3) {
                    net_offset++;
                    net_count++;
                    if(net_offset > ngpus * 2)
                        goto end;
                }
                status = 0;
                if(str[offset] == '|' || str[offset] == '\0') {
                    // Ignore if ngpus doesn't match
                    if(gpu != ngpus)
                        goto newchannel;
                    // Ignore if net_count is not 0 or odd number
                    if(net_count && net_count % 2)
                        goto newchannel;
                    for(int r = 0; r < ngpus; r++) {
                        int g = gpus[r];
                        // Ignore if gpus are out of bounds
                        if(g < 0 || g >= ngpus)
                            goto newchannel;
                        // Ignore if gpus are duplicate
                        for(int i = 0; i < r; i++)
                            if(gpus[i] == g)
                                goto newchannel;
                        // remap if needed
                        if(gpu_map)
                            g = gpu_map[g];
                        // Translate gpu numbers into ranks
                        int j = 0;
                        for(j = 0; j < ngpus; j++)
                            if(g == system->nodes[GPU].nodes[j].gpu.dev)
                                break;
                        if(j < ngpus)
                            graph->intra[nChannels * ngpus + r] = system->nodes[GPU].nodes[j].gpu.rank;
                        else
                            return scclInternalError;
                    }
                    if(net_count) {
                        // Remap explicit NET indices; out-of-range entries are left as-is.
                        for(int i = 0; net_map && i < ngpus * 2; i++) {
                            if(nets[i] - 'N' < 0 || nets[i] - 'N' >= nnets)
                                continue;
                            nets[i] = net_map[nets[i] - 'N'] + 'N';
                        }
                        memcpy(&graph->intraNets[ngpus * nChannels * 2], nets, ngpus * 2 * sizeof(int));
                        graph->nIntraChannels++;
                        if(nets[0] - 'N' >= nnets || nets[ngpus * 2 - 1] - 'N' >= nnets)
                            goto newchannel;
                        graph->inter[nChannels * 2] = nets[0] - 'N';
                        graph->inter[nChannels * 2 + 1] = nets[ngpus * 2 - 1] - 'N';
                    } else if(nnets) {
                        // No explicit NETs in the string: round-robin over the system's NICs.
                        graph->inter[nChannels * 2] = system->nodes[NET].nodes[nChannels % nnets].id;
                        graph->inter[nChannels * 2 + 1] = system->nodes[NET].nodes[(nChannels + 1) % nnets].id;
                    }
                    nChannels++;
                newchannel:
                    gpu = 0;
                    net_offset = 0;
                    net_count = 0;
                }
            }
        }
    } while(str[offset++] != 0);
end:
    graph->nChannels = nChannels;
    // Guard against division by zero when no channel in the string matched this system.
    graph->bwIntra = graph->bwInter = nChannels ? system->totalBw / nChannels : system->totalBw;
    if(graph->id == 1) {
        for(int i = 0; i < graph->nChannels; i++) {
            int net;
            // NOTE(review): return value ignored here in the original; 'net' is assumed
            // to be set on success — confirm scclTopoGetLocalNet cannot fail silently.
            scclTopoGetLocalNet(system, graph->intra[i * ngpus + 1], i, &net);
            graph->inter[i * 2 + 1] = net;
        }
    }
#if 0
  for (int i=0; i<graph->nChannels; i++) {
    printf("%d: ", i);
    printf ("NET/%d ", graph->inter[i*2]);
    for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
    printf ("NET/%d ", graph->inter[i*2+1]);
    printf("\n");
  }
#endif
    return scclSuccess;
}
/* Parse user defined treeBase for complicated trees. Format is like :
* "(4(2(3)(1))(6(5)))"
*
 * Trees with a non-matching number of gpus are ignored so we can provide
 * tree definitions for multiple cases.
*/
// Parse a parenthesized tree description like "(4(2(3)(1))(6(5)))" into
// graph->treeBase[channel][]. Parentheses are copied through verbatim; each GPU
// number is remapped via gpu_map (when given), validated against the system's
// GPU device list, and re-emitted as text. Channels are separated by '|'.
// An empty input string just produces an empty first tree.
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map) {
    int gpus[SCCL_TOPO_MAX_NODES]; // transcribe/change according to gpu_map
    int nChannels = 0;             // NOTE(review): unused — 'x' below tracks the channel count
    int gpu = 0;
    int offset = 0;
    int start_offset = offset; // start of the current '|'-separated channel
    if(str[0] == 0) {
        graph->treeBase[0][0] = 0;
        return scclSuccess;
    }
    int status = 0; // 0 : between numbers, 1 : inside number
    int ngpus = system->nodes[GPU].count;
    int x = 0, y = 0; // x: output channel index (this outer 'y' is shadowed below)
    do {
        int digit = str[offset] - '0';
        if(digit >= 0 && digit <= 9) {
            switch(status) {
                case 0:
                    gpus[gpu] = digit;
                    status = 1;
                    break;
                case 1: gpus[gpu] = gpus[gpu] * 10 + digit; break;
            }
        } else {
            if(status == 1) {
                gpu++;
            }
            status = 0;
            if(str[offset] == '|' || str[offset] == 0) {
                // End of one channel: re-scan it from start_offset and emit it into
                // graph->treeBase[x], substituting the remapped GPU numbers.
                int r = 0, y = 0;
                while(start_offset < offset) {
                    // for (int r=0; r<gpu; r++) {
                    if(str[start_offset] == '(' || str[start_offset] == ')') {
                        graph->treeBase[x][y] = str[start_offset];
                        y++;
                        start_offset++;
                    } else {
                        int g = gpus[r];
                        // remap if needed
                        if(gpu_map)
                            g = gpu_map[g];
                        r++;
                        int j = 0;
                        // Translate gpu numbers into ranks
                        for(j = 0; j < ngpus; j++)
                            if(g == system->nodes[GPU].nodes[j].gpu.dev)
                                break;
                        if(j < ngpus) {
                            // NOTE(review): 'j' (the rank index) is only used to validate that
                            // device 'g' exists; the remapped device number 'g' — not the rank —
                            // is what gets written into treeBase. Confirm that is intended.
                            while(str[start_offset] != '(' && str[start_offset] != ')')
                                start_offset++;
                            char number_str[10];
                            sprintf(number_str, "%d", g);
                            int k = 0;
                            while(number_str[k] != 0) {
                                graph->treeBase[x][y] = number_str[k];
                                y++;
                                k++;
                            }
                        } else
                            return scclInternalError;
                    }
                }
                graph->treeBase[x][y] = 0; // terminate this channel's string
                x++;
                gpu = 0;
                start_offset = offset + 1; // next channel starts after the '|'
            }
        }
    } while(str[offset++] != 0);
    graph->treeBase[x][0] = 0; // empty string marks the end of the channel list
    return scclSuccess;
}
#define MAX_OPT_TOKENS 10
extern const char* topoPathTypeStr[];
// Apply the "key=value[,key=value...]" pairs of a model's options string to the
// corresponding tunable fields on 'system'. Unknown keys are ignored; an invalid
// netGdrLevel value falls back to -2 with a warning. An empty string is a no-op.
static void parseOptions(struct scclTopoSystem* system, const char* options) {
    if(strcmp(options, "")) {
        char* str_temp = (char*)malloc(strlen(options) + 1);
        if(str_temp == NULL)
            return; // allocation failure: leave the system's defaults untouched
        strcpy(str_temp, options);
        char* tokens[MAX_OPT_TOKENS];
        int numTokens = 0;
        char* state;
        tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
        numTokens++;
        while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
            tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
        for(int i = 0; i < numTokens / 2; i++) {
            // A trailing key with no value would otherwise feed NULL to strcmp/atol below.
            if(tokens[i * 2] == NULL || tokens[i * 2 + 1] == NULL)
                break;
            if(strcmp(tokens[i * 2], "netGdrLevel") == 0) {
                // Translate the symbolic path type (e.g. "PXB") into its index.
                int j;
                for(j = 0; j <= PATH_SYS; j++) {
                    if(strcmp(tokens[i * 2 + 1], topoPathTypeStr[j]) == 0)
                        break;
                }
                if(j <= PATH_SYS)
                    system->netGdrLevel = j;
                else {
                    system->netGdrLevel = -2;
                    WARN("invalid netGdrLevel: %s", tokens[i * 2 + 1]);
                }
            } else if(strcmp(tokens[i * 2], "pivotA2AEnabled") == 0) {
                system->pivotA2AEnabled = (bool)atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "pivotA2ANumBiRings") == 0) {
                system->pivotA2ANumBiRings = atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "tuning") == 0) {
                system->tuning = atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "ll128Enabled") == 0) {
                system->ll128Enabled = (bool)atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "baseBw") == 0) {
                system->baseBw = std::stof(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "mscclEnabled") == 0) {
                system->mscclEnabled = (bool)atol(tokens[i * 2 + 1]);
            } else if(strcmp(tokens[i * 2], "treeDefined") == 0) {
                system->treeDefined = (bool)atol(tokens[i * 2 + 1]);
            }
        }
        free(str_temp);
    }
}
#ifndef MAX_OPT_TOKENS
#define MAX_OPT_TOKENS 10
#endif
// Return the boolean value of option 'name' inside the "key=value,..." string
// 'options', or false when the option (or its value) is absent.
// Fix: the original returned from inside the loop without freeing str_temp,
// leaking the duplicated string on every hit; it could also pass a NULL value
// token to atol for a malformed trailing "key" with no "=value".
static bool checkOption(const char* options, const char* name) {
    bool result = false;
    if(strcmp(options, "")) {
        char* str_temp = (char*)malloc(strlen(options) + 1);
        if(str_temp == NULL)
            return false;
        strcpy(str_temp, options);
        char* tokens[MAX_OPT_TOKENS];
        int numTokens = 0;
        char* state;
        tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
        numTokens++;
        while(tokens[numTokens - 1] != NULL && numTokens < MAX_OPT_TOKENS)
            tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
        for(int i = 0; i < numTokens / 2; i++) {
            if(tokens[i * 2] == NULL || tokens[i * 2 + 1] == NULL)
                break;
            if(strcmp(tokens[i * 2], name) == 0) {
                result = (bool)atol(tokens[i * 2 + 1]);
                break; // was an early return that leaked str_temp
            }
        }
        free(str_temp);
    }
    return result;
}
// Detect an 8-GPU chordal-ring XGMI topology and, if found, program 'graph' with
// the canonical chordal-ring channels from 'ringBase', remapping the reference
// GPU numbers onto the actual device ordering. Returns scclSuccess without
// touching 'graph' whenever the system does not match. For multi-node runs with
// NICs, each channel is additionally rotated so the ring starts at the GPU
// closest to its NIC.
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
    static const char* ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
    int id[8], dist[8];
    int i;
    int ngpus = system->nodes[GPU].count;
    if(ngpus != 8)
        return scclSuccess;
    // validate chordal ring and calculate distance
    // NOTE(review): GPUs whose paths[GPU] is NULL are skipped here, leaving their
    // dist[i] entry uninitialized, yet the remap loop below reads every dist[i] —
    // confirm paths are always populated by the time this runs.
    for(i = 0; i < ngpus; i++) {
        struct scclTopoNode* node = system->nodes[GPU].nodes + i;
        if(node->paths[GPU] == NULL)
            continue;
        // In a chordal ring each GPU links to all peers except exactly one; after
        // subtracting its own dev and every linked peer's dev from the total,
        // 'sum' is the device id of the single GPU it is NOT directly linked to.
        int sum = ngpus * (ngpus - 1) / 2 - node->gpu.dev;
        int count = 0;
        for(int n = 0; n < ngpus; n++) {
            struct scclTopoLink* link;
            for(link = node->links; link->remNode; link++) {
                if(link->remNode->gpu.dev == n)
                    break;
            }
            if(!link->remNode)
                continue;
            if(link->type != LINK_NVL)
                continue;
            sum -= system->nodes[GPU].nodes[n].gpu.dev;
            count++;
        }
        if(count != ngpus - 2 || sum < 0 || sum > ngpus - 1) {
            // Not a chordal ring — bail out quietly.
            return scclSuccess;
        }
        dist[i] = sum;
    }
    // remap GPU ids
    for(i = 0; i < ngpus; i++)
        id[i] = i;
    for(i = 0; i < ngpus; i++) {
        if(dist[i] == ngpus - 1 - i)
            continue;
        int j, m, n, temp;
        for(j = i + 1; j < ngpus; j++)
            if(dist[j] == ngpus - 1 - i)
                break;
        m = dist[i];
        n = dist[j];
        dist[i] = n;
        dist[j] = m;
        temp = id[m];
        id[m] = id[n];
        id[n] = temp;
        temp = dist[m];
        dist[m] = dist[n];
        dist[n] = temp;
    }
    // create chordal ring based on reference and remapped ids
    system->type |= RCCL_TOPO_CR8G;
    SCCLCHECK(parseGraph(ringBase, system, graph, id, NULL));
    // Multi-node case: one channel per NIC, each rotated to start at the closest
    // GPU that no earlier NIC has claimed yet.
    if(system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
        int *intra, *used;
        graph->nChannels = system->nodes[NET].count;
        SCCLCHECK(scclCalloc(&intra, ngpus));
        SCCLCHECK(scclCalloc(&used, system->nodes[NET].count));
        for(int n = 0; n < system->nodes[NET].count; n++) {
            graph->inter[n * 2] = graph->inter[n * 2 + 1] = n;
            struct scclTopoNode* net = system->nodes[NET].nodes + n;
            struct scclTopoLinkList* paths = net->paths[GPU];
            // find the first unused GPU that is closest to NIC
            int f, m;
            for(f = 0; f < ngpus; f++) {
                int j = 0;
                for(j = 0; j < n; j++)
                    if(used[j] == system->nodes[GPU].nodes[f].gpu.rank)
                        break;
                if(j >= n)
                    break;
            }
            for(int i = 0; i < ngpus; i++) {
                int j = 0;
                for(j = 0; j < n; j++)
                    if(used[j] == system->nodes[GPU].nodes[i].gpu.rank)
                        break;
                if(j < n)
                    continue;
                if(paths[i].count < paths[f].count)
                    f = i;
            }
            // Rotate channel n so it starts at GPU f's rank.
            for(m = 0; m < ngpus; m++)
                if(graph->intra[n * ngpus + m] == system->nodes[GPU].nodes[f].gpu.rank)
                    break;
            used[n] = graph->intra[n * ngpus + m];
            for(int i = 0; i < ngpus; i++)
                intra[i] = graph->intra[n * ngpus + ((i + m) % ngpus)];
            for(int i = 0; i < ngpus; i++)
                graph->intra[n * ngpus + i] = intra[i];
        }
        free(used);
        free(intra);
    }
    return scclSuccess;
}
// Build a scclRomeModel snapshot of the live system so it can be matched against
// the reference models above: GPU bus ids sorted by HIP device id, NIC bus ids
// sorted by id, the nearest NUMA node for every device, the GPU-to-GPU XGMI
// connection matrix, and the NIC-to-GPU path-type ("GDR level") matrix.
// 'pattern' receives one "<#GPUs><#NICs>" digit pair per NUMA node (sorted by
// numa id) as a cheap first-pass filter; it stays "" for invalid topologies.
// When RCCL_DUMP_ROME_MODEL_FILE is set, the snapshot is also dumped as C source
// in the same shape as the rome_model_* tables in this file.
static scclResult_t parseRomeSystem(struct scclTopoSystem* system, struct scclRomeModel* romeTopo, char* pattern) {
    pattern[0] = 0; // pattern will be NULL for invalid topology
    romeTopo->nGpus = system->nodes[GPU].count;
    romeTopo->nCpus = system->nodes[CPU].count;
    romeTopo->nNics = system->nodes[NET].count;
    romeTopo->nLinks = 0;
    // Small index/key records plus captureless-lambda comparators for qsort below.
    struct scclGpuIdHIP {
        int g;   // index into system->nodes[GPU]
        int dev; // HIP device id
    };
    auto cmpIds = [](const void* g1, const void* g2) {
        struct scclGpuIdHIP* s1 = (struct scclGpuIdHIP*)g1;
        struct scclGpuIdHIP* s2 = (struct scclGpuIdHIP*)g2;
        return s1->dev - s2->dev;
    };
    struct scclCpuNuma {
        int c;         // index into system->nodes[CPU]
        uint64_t numa; // NUMA node id
    };
    auto cmpNuma = [](const void* g1, const void* g2) {
        struct scclCpuNuma* s1 = (struct scclCpuNuma*)g1;
        struct scclCpuNuma* s2 = (struct scclCpuNuma*)g2;
        return (int)(s1->numa - s2->numa);
    };
    struct scclNetId {
        int n;       // index into system->nodes[NET]
        uint64_t id; // NET node id
    };
    auto cmpNets = [](const void* g1, const void* g2) {
        struct scclNetId* s1 = (struct scclNetId*)g1;
        struct scclNetId* s2 = (struct scclNetId*)g2;
        return (int)(s1->id - s2->id);
    };
    // sort GPU devices by HIP device ID
    struct scclGpuIdHIP gpu_scores[SCCL_TOPO_MAX_NODES];
    for(int i = 0; i < romeTopo->nGpus; i++) {
        gpu_scores[i].g = i;
        gpu_scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev;
    }
    qsort(gpu_scores, romeTopo->nGpus, sizeof(struct scclGpuIdHIP), cmpIds);
    // sort CPU devices by NUMA id
    struct scclCpuNuma cpu_scores[SCCL_TOPO_MAX_NODES];
    for(int i = 0; i < romeTopo->nCpus; i++) {
        cpu_scores[i].c = i;
        cpu_scores[i].numa = system->nodes[CPU].nodes[i].id;
    }
    qsort(cpu_scores, romeTopo->nCpus, sizeof(struct scclCpuNuma), cmpNuma);
    // sort NET devices by id
    struct scclNetId net_scores[SCCL_TOPO_MAX_NODES];
    for(int i = 0; i < romeTopo->nNics; i++) {
        net_scores[i].n = i;
        net_scores[i].id = system->nodes[NET].nodes[i].id;
    }
    qsort(net_scores, romeTopo->nNics, sizeof(struct scclNetId), cmpNets);
    // Record each GPU's bus id, its nearest NUMA node (shortest path to a CPU
    // node) and its XGMI connectivity row.
    for(int i = 0; i < romeTopo->nGpus; i++) {
        int gpu, n, m, distance;
        gpu = gpu_scores[i].g;
        romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
        m = 0;
        distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
        for(n = 1; n < romeTopo->nCpus; n++) {
            if(system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
                distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
                m = n;
            }
        }
        if(m < romeTopo->nCpus)
            romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
        struct scclTopoNode* node = system->nodes[GPU].nodes + gpu;
        if(node->paths[GPU] == NULL)
            continue;
        int count = 0;
        for(n = 0; n < romeTopo->nGpus; n++) {
            romeTopo->connMatrix[i * romeTopo->nGpus + n] = 0;
            struct scclTopoLink* link;
            for(link = node->links; link->remNode; link++) {
                if(link->remNode->gpu.dev == n)
                    break;
            }
            if(!link->remNode)
                continue;
            if(link->type != LINK_NVL)
                continue;
            // Aggregate XGMI bandwidth divided by per-link speed — presumably the
            // number of parallel links between the two GPUs.
            romeTopo->connMatrix[i * romeTopo->nGpus + n] = link->bw / scclTopoXGMISpeed(node->gpu.gcn);
            count++;
        }
        if(romeTopo->nLinks < count)
            romeTopo->nLinks = count;
    }
    // Record each NIC's bus id and nearest NUMA node the same way.
    for(int i = 0; i < romeTopo->nNics; i++) {
        int n, m, distance;
        m = 0;
        int net = net_scores[i].n;
        romeTopo->nicIds[i] = system->nodes[NET].nodes[net].net.busId;
        distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
        for(n = 0; n < romeTopo->nCpus; n++)
            if(system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
                distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
                m = n;
            }
        if(m < romeTopo->nCpus)
            romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
        else
            return scclSuccess;
    }
    // number of GPUs and NICs on each numa node is used as first screening pattern
    for(int i = 0; i < romeTopo->nCpus; i++) {
        uint64_t id = system->nodes[CPU].nodes[cpu_scores[i].c].id;
        int g = 0, n = 0;
        for(int j = 0; j < romeTopo->nGpus; j++)
            if(romeTopo->gpuNuma[j] == id)
                g++;
        for(int j = 0; j < romeTopo->nNics; j++)
            if(romeTopo->nicNuma[j] == id)
                n++;
        pattern[i * 2] = '0' + g;
        pattern[i * 2 + 1] = '0' + n;
    }
    pattern[romeTopo->nCpus * 2] = 0;
    // compute gdr level matrix
    for(int i = 0; i < romeTopo->nNics; i++) {
        int n = net_scores[i].n;
        for(int j = 0; j < romeTopo->nGpus; j++) {
            int g = gpu_scores[j].g;
            romeTopo->gdrLevel[i * romeTopo->nGpus + j] = system->nodes[GPU].nodes[g].paths[NET][n].type;
        }
    }
    // Optional debug dump of the snapshot in rome_model_* source form.
    const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
    if(romeModelFile) {
        INFO(SCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
        FILE* file = fopen(romeModelFile, "w");
        if(file == NULL) {
            WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
            return scclSuccess;
        }
        fprintf(file, "static struct scclRomeModel rome_model_ = {\n");
        fprintf(file, "  .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
        fprintf(file, "  .gpuIds = { ");
        for(int i = 0; i < romeTopo->nGpus; i++)
            fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .nicIds = { ");
        for(int i = 0; i < romeTopo->nNics; i++)
            fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .gpuNuma = { ");
        for(int i = 0; i < romeTopo->nGpus; i++)
            fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .nicNuma = { ");
        for(int i = 0; i < romeTopo->nNics; i++)
            fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
        fprintf(file, "},\n");
        fprintf(file, "  .connMatrix = { ");
        for(int i = 0; i < romeTopo->nGpus; i++)
            for(int n = 0; n < romeTopo->nGpus; n++)
                fprintf(file, "%d, ", romeTopo->connMatrix[i * romeTopo->nGpus + n]);
        fprintf(file, "},\n");
        fprintf(file, "  .gdrLevel = { ");
        for(int i = 0; i < romeTopo->nNics; i++)
            for(int n = 0; n < romeTopo->nGpus; n++)
                fprintf(file, "PATH_%s, ", topoPathTypeStr[romeTopo->gdrLevel[i * romeTopo->nGpus + n]]);
        fprintf(file, "},\n");
        fprintf(file, "  .pattern = \"%s\",\n", pattern);
        fprintf(file, "  .ringBase = \"\",\n");
        fprintf(file, "  .options = \"\",\n");
        fprintf(file, "};\n");
        fclose(file);
    }
    return scclSuccess;
}
// Depth-first search for a permutation g[] of the live system's GPUs (topo) that
// matches the reference model layout (ref). Slots [0, n) are already fixed and
// [n, last] are still free. When every slot is assigned the candidate is
// validated against NUMA placement (unless ignore_numa), the XGMI connection
// matrix (including relative bus-id ordering) and, when 'nbio' is set, NBIO
// pairing. On success, g[] is left holding the matching permutation.
// 'time' counts recursive calls, for diagnostics.
static bool permuteGpuIds(int* g, int n, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool nbio, bool ignore_numa) {
    (*time)++;
    if(n != last) {
        // Place each remaining candidate at slot n and recurse. The swap is undone
        // only on failure so a successful mapping survives in g[].
        for(int pick = n; pick <= last; pick++) {
            std::swap(g[n], g[pick]);
            if(permuteGpuIds(g, n + 1, last, ref, topo, time, nbio, ignore_numa))
                return true;
            std::swap(g[n], g[pick]);
        }
        return false;
    }
    // All slots assigned: validate this permutation.
    if(!ignore_numa) {
        for(int i = 0; i < ref->nGpus; i++)
            if(ref->gpuNuma[i] != topo->gpuNuma[g[i]])
                return false;
    }
    for(int i = 0; i < ref->nGpus; i++) {
        for(int j = 0; j < ref->nGpus; j++) {
            // XGMI link count between every pair must match the reference...
            if(ref->connMatrix[i * ref->nGpus + j] != topo->connMatrix[g[i] * ref->nGpus + g[j]])
                return false;
            // ...and the pair's relative bus-id ordering must agree as well.
            if((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0)
                return false;
        }
    }
    if(nbio) {
        for(int i = 0; i < ref->nGpus; i++) {
            for(int j = 0; j < ref->nGpus; j++) {
                if(i == j)
                    continue;
                // Two GPUs are treated as sharing an NBIO when their bus ids agree
                // in the 0xf0000 bits; the pairing pattern must match the reference.
                bool sameNbioRef = (ref->gpuIds[i] & 0xf0000) == (ref->gpuIds[j] & 0xf0000);
                bool sameNbioSys = (topo->gpuIds[g[i]] & 0xf0000) == (topo->gpuIds[g[j]] & 0xf0000);
                if(sameNbioRef != sameNbioSys)
                    return false;
                if(sameNbioRef && ((ref->gpuIds[i] - ref->gpuIds[j]) * (topo->gpuIds[g[i]] - topo->gpuIds[g[j]]) < 0))
                    return false;
            }
        }
    }
    return true;
}
// Counterpart of permuteGpuIds for NICs: depth-first search for a permutation
// n[] of the live system's NICs that matches the reference model, given an
// already-fixed GPU permutation g[]. A complete candidate is validated against
// NIC NUMA placement (unless ignore_numa) and the NIC-to-GPU GDR-level matrix.
// On success, n[] is left holding the matching permutation; 'time' counts calls.
static bool permuteNetIds(int* n, int* g, int s, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool ignore_numa) {
    (*time)++;
    if(s != last) {
        // Try every remaining NIC at slot s; undo the swap only on failure so a
        // successful mapping survives in n[].
        for(int pick = s; pick <= last; pick++) {
            std::swap(n[s], n[pick]);
            if(permuteNetIds(n, g, s + 1, last, ref, topo, time, ignore_numa))
                return true;
            std::swap(n[s], n[pick]);
        }
        return false;
    }
    // All slots assigned: validate this permutation.
    if(!ignore_numa) {
        for(int i = 0; i < ref->nNics; i++)
            if(ref->nicNuma[i] != topo->nicNuma[n[i]])
                return false;
    }
    for(int i = 0; i < ref->nNics; i++)
        for(int j = 0; j < ref->nGpus; j++)
            if(ref->gdrLevel[i * ref->nGpus + j] != topo->gdrLevel[n[i] * ref->nGpus + g[j]])
                return false;
    return true;
}
// Try to recognize the system as one of the known Rome 4P2H topology models
// and, on a match, materialize the model's precomputed ring (and optional
// tree) graph with the detected GPU/NIC index remapping applied.
// Returns scclSuccess whether or not a model matched; a non-match simply
// leaves `graph` untouched.
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
    int i;
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    const int numModels = (int)(sizeof(romeTopoModels) / sizeof(romeTopoModels[0]));
    if(ngpus > 8)
        return scclSuccess;
    // only valid on Rome
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    // number of GPUs and NICs on each numa node is used as first screening pattern
    struct scclRomeModel romeTopo;
    char pattern[256];
    SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
    // recognize system as Rome 4P2H even if no matching model
    if(ngpus > 4 && romeTopo.nLinks)
        system->type |= RCCL_TOPO_4P2H_ROME;
    int g[SCCL_TOPO_MAX_NODES], n[SCCL_TOPO_MAX_NODES];
    int time = 0;
    struct timeval tvs, tve;
    gettimeofday(&tvs, NULL);
    // check if GPUs are directly connected to CPU (GPU->CPU path of at most 2 hops)
    bool match_nbio = true;
    for(i = 0; i < romeTopo.nGpus; i++) {
        int cpu, gpu;
        SCCLCHECK(scclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu));
        SCCLCHECK(scclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu));
        if(system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2)
            break;
    }
    if(i < romeTopo.nGpus)
        match_nbio = false;
    for(i = 0; i < numModels; i++) {
        bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
        if(!ignore_cpu && (arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME))
            continue;
        bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
        if(!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus)
            continue;
        // cheap count-based screening before any permutation search
        if(romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks)
            continue;
        if(!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern))
            continue;
        // permute GPU IDs, starting from a rotated identity mapping
        for(int j = 0; j < ngpus; j++)
            g[j] = (j + 2) % ngpus;
        if(!permuteGpuIds(g, 0, ngpus - 1, romeTopoModels + i, &romeTopo, &time, ignore_cpu ? false : match_nbio, ignore_numa))
            continue;
        if(nnets > 1) {
            // permute NET IDs
            for(int j = 0; j < nnets; j++)
                n[j] = (j + 2) % nnets;
            if(permuteNetIds(n, g, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, ignore_numa))
                break;
        } else
            break;
    }
    gettimeofday(&tve, NULL);
    float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
    (void)t; // elapsed ms, used only by the commented-out diagnostics below
    if(i >= numModels) {
        // printf("No solution in %.2fms (%d iter)\n", t, time);
        return scclSuccess;
    }
    char line[1024];
    // sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time);
    sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
    int offset = strlen(line);
    for(int k = 0; k < ngpus; k++) {
        sprintf(line + offset, "%d ", g[k]);
        offset = strlen(line);
    }
    if(nnets > 1) {
        sprintf(line + offset, "NET mapping: ");
        offset = strlen(line);
        for(int k = 0; k < nnets; k++) {
            sprintf(line + offset, "%d ", n[k]);
            offset = strlen(line);
        }
    }
    INFO(SCCL_GRAPH, "%s", line);
    parseOptions(system, romeTopoModels[i].options);
    // create 4P2H based on reference and remapped ids
    SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL));
    if(romeTopoModels[i].treeBase != nullptr)
        SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
    return scclSuccess;
}
// Try to recognize a 16-GPU / 4-NUMA-node ("1H16P") Rome system by
// brute-forcing the per-NUMA GPU permutations (24 per node, 24^4 total)
// against the known models' XGMI connectivity matrices, then remapping NET
// ids via permuteNetIds. On a match the model's ring/tree graphs are
// instantiated with the resulting mapping. Returns scclSuccess whether or
// not a model matched.
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUMA_CPUS 4
#define NUMA_GPUS 4
#define NUMA_PERMUTE_COUNT 24
#define TOTAL_PERMUTE_COUNT (NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT)
    int i;
    int ngpus = system->nodes[GPU].count;
    int ncpus = system->nodes[CPU].count;
    int nnets = system->nodes[NET].count;
    const int numModels = (int)(sizeof(romeTopoModels) / sizeof(romeTopoModels[0]));
    // only valid on Rome
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
        return scclSuccess;
    // number of GPUs and NICs on each numa node is used as first screening pattern
    struct scclRomeModel romeTopo;
    char pattern[256];
    SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
    // only match for system with 16 GPUs on 4 NUMA nodes
    if(ngpus != 16 || ncpus != NUMA_CPUS)
        return scclSuccess;
    int* g16 = NULL;
    int n[SCCL_TOPO_MAX_NODES];
    // scratch space for all 24^4 candidate full-system GPU mappings
    int* all_gpu_permutations = (int*)malloc(TOTAL_PERMUTE_COUNT * NUMA_CPUS * NUMA_GPUS * sizeof(int));
    if(all_gpu_permutations == NULL)
        return scclSuccess; // cannot search without scratch space; treat as no match
    struct timeval tvs, tve;
    gettimeofday(&tvs, NULL);
    for(i = 0; i < numModels; i++) {
        // cheap count-based screening before any permutation search
        if(romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics ||
           romeTopo.nLinks != romeTopoModels[i].nLinks)
            continue;
        if(strcmp(romeTopoModels[i].pattern, pattern))
            continue;
        int j, g[16];
        int numa_gpu_permutations[NUMA_CPUS][NUMA_PERMUTE_COUNT][NUMA_GPUS];
        // enumerate the 24 GPU permutations within each CPU NUMA node
        for(j = 0; j < ncpus; j++) {
            int ngpusPerNuma = 0, cnt = 0, npermute = 0;
            for(int k = 0; k < ngpus; k++) {
                if(romeTopoModels[i].gpuNuma[k] == j)
                    ngpusPerNuma++;
            }
            // every NUMA node must hold exactly NUMA_GPUS GPUs; anything else
            // (including zero) would leave numa_gpu_permutations[j] partially
            // uninitialized for the memcpy expansion below
            if(ngpusPerNuma != NUMA_GPUS)
                break;
            // init GPU mapping from the detected topology's NUMA placement
            for(int k = 0; k < ngpus; k++) {
                if(romeTopo.gpuNuma[k] != j)
                    continue;
                g[(2 + cnt++) % ngpusPerNuma] = k;
            }
            std::sort(g, g + ngpusPerNuma);
            do {
                for(int m = 0; m < ngpusPerNuma; m++)
                    numa_gpu_permutations[j][npermute][m] = g[m];
                npermute++;
            } while(std::next_permutation(g, g + ngpusPerNuma));
            if(npermute != NUMA_PERMUTE_COUNT)
                break;
        }
        if(j < ncpus)
            continue;
        // expand the per-NUMA permutations into all full-system candidates
        for(int a = 0; a < NUMA_PERMUTE_COUNT; a++) {
            for(int b = 0; b < NUMA_PERMUTE_COUNT; b++) {
                for(int c = 0; c < NUMA_PERMUTE_COUNT; c++) {
                    for(int d = 0; d < NUMA_PERMUTE_COUNT; d++) {
                        uint64_t offset = ((a * NUMA_PERMUTE_COUNT + b) * NUMA_PERMUTE_COUNT + c) * NUMA_PERMUTE_COUNT + d;
                        offset *= (NUMA_CPUS * NUMA_GPUS);
                        memcpy(all_gpu_permutations + offset, &numa_gpu_permutations[0][a][0], NUMA_GPUS * sizeof(int));
                        memcpy(all_gpu_permutations + offset + NUMA_GPUS, &numa_gpu_permutations[1][b][0], NUMA_GPUS * sizeof(int));
                        memcpy(all_gpu_permutations + offset + NUMA_GPUS * 2, &numa_gpu_permutations[2][c][0], NUMA_GPUS * sizeof(int));
                        memcpy(all_gpu_permutations + offset + NUMA_GPUS * 3, &numa_gpu_permutations[3][d][0], NUMA_GPUS * sizeof(int));
                    }
                }
            }
        }
        // match all GPUs' XGMI connection matrix against the model
        int p;
        for(p = 0; p < TOTAL_PERMUTE_COUNT; p++) {
            g16 = all_gpu_permutations + p * NUMA_CPUS * NUMA_GPUS;
            int k;
            for(k = 0; k < romeTopoModels[i].nGpus; k++) {
                int m;
                for(m = 0; m < romeTopoModels[i].nGpus; m++) {
                    if(romeTopoModels[i].connMatrix[k * romeTopoModels[i].nGpus + m] != romeTopo.connMatrix[g16[k] * romeTopoModels[i].nGpus + g16[m]])
                        break;
                }
                if(m < romeTopoModels[i].nGpus)
                    break;
            }
            if(k < romeTopoModels[i].nGpus)
                continue;
            if(nnets > 1) {
                // permute NET IDs
                int time = 0;
                for(int m = 0; m < nnets; m++)
                    n[m] = (m + 2) % nnets;
                if(permuteNetIds(n, g16, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, false))
                    break;
            } else
                break;
        }
        if(p < TOTAL_PERMUTE_COUNT)
            break;
    }
    gettimeofday(&tve, NULL);
    float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
    (void)t; // elapsed ms, used only by the commented-out diagnostics below
    if(i >= numModels) {
        // printf("No solution in %.2fms\n", t);
        free(all_gpu_permutations); // was leaked on this path before
        return scclSuccess;
    }
    char line[1024];
    // sprintf(line, "Found matching Rome model index %d in %.2fms with GPU mapping: ", i, t);
    sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
    int offset = strlen(line);
    for(int k = 0; k < ngpus; k++) {
        sprintf(line + offset, "%d ", g16[k]);
        offset = strlen(line);
    }
    if(nnets > 1) {
        sprintf(line + offset, "NET mapping: ");
        offset = strlen(line);
        for(int k = 0; k < nnets; k++) {
            sprintf(line + offset, "%d ", n[k]);
            offset = strlen(line);
        }
    }
    INFO(SCCL_GRAPH, "%s", line);
    system->type |= RCCL_TOPO_16P1H;
    parseOptions(system, romeTopoModels[i].options);
    // create 16P1H based on reference and remapped ids (g16 points into
    // all_gpu_permutations, so the buffer must stay alive until here)
    SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g16, nnets > 1 ? n : NULL));
    if(romeTopoModels[i].treeBase != nullptr)
        SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g16));
    // clean up
    free(all_gpu_permutations);
    return scclSuccess;
}
// Try to recognize a "4H4P" Rome system: 16 GPUs grouped into 4 hives of 4
// (clustered via the XGMI connectivity matrix) with one NIC per GPU (paired
// by GDR level). On a match the fixed rome_model_68 ring layout is
// instantiated with the resulting GPU/NIC remapping. Returns scclSuccess
// whether or not the system matched.
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUM_HIVES 4
#define HIVE_GPUS 4
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    // only valid on Rome
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    if(arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)
        return scclSuccess;
    // number of GPUs and NICs on each numa node is used as first screening pattern
    struct scclRomeModel romeTopo;
    char pattern[256];
    SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
    // only match for systems with 16 GPUs and 16 NICs
    if(ngpus != NUM_HIVES * HIVE_GPUS || nnets != NUM_HIVES * HIVE_GPUS)
        return scclSuccess;
    int g_hives[ngpus], n_hives[nnets];
    int ng_hives[NUM_HIVES];
    // try to sort GPUs into hives
    for(int i = 0; i < NUM_HIVES; i++)
        ng_hives[i] = 0;
    for(int i = 0; i < nnets; i++)
        n_hives[i] = -1;
    for(int i = 0; i < ngpus; i++)
        g_hives[i] = -1;
    for(int i = 0; i < ngpus; i++) {
        int j, h;
        for(j = 0; j < NUM_HIVES; j++) {
            if(ng_hives[j]) {
                // GPU i joins hive j if it is connected to the hive's first
                // member (assumes fully-connected hives — TODO confirm)
                if(romeTopo.connMatrix[i * ngpus + g_hives[j * HIVE_GPUS]]) {
                    g_hives[j * HIVE_GPUS + ng_hives[j]] = i;
                    ng_hives[j]++;
                    break;
                }
            }
        }
        if(j >= NUM_HIVES) {
            // no existing hive accepted GPU i: start a new one
            for(h = 0; h < NUM_HIVES; h++) {
                if(ng_hives[h] == 0) {
                    g_hives[h * HIVE_GPUS] = i;
                    ng_hives[h]++;
                    break;
                }
            }
            if(h >= NUM_HIVES)
                return scclSuccess; // more than NUM_HIVES clusters: no match
        }
    }
    // every hive must have been filled with exactly 4 GPUs
    for(int i = 0; i < NUM_HIVES; i++)
        if(ng_hives[i] != 4)
            return scclSuccess;
    // remap NET ids: NIC i is paired with the GPU it reaches at GDR level 3
    for(int i = 0; i < nnets; i++) {
        int j;
        for(j = 0; j < ngpus; j++) {
            // gdrLevel is indexed [nic * stride + gpu]; nnets == ngpus here
            if(romeTopo.gdrLevel[i * nnets + g_hives[j]] == 3) {
                n_hives[j] = i;
                break;
            }
        }
        if(j >= ngpus)
            return scclSuccess; // NIC with no GDR-capable GPU: no match
    }
    // validation: every slot must have been assigned
    for(int i = 0; i < nnets; i++)
        if(n_hives[i] == -1)
            return scclSuccess;
    for(int i = 0; i < ngpus; i++)
        if(g_hives[i] == -1)
            return scclSuccess;
    char line[1024];
    sprintf(line, "Found matching Rome model 4P4H with GPU mapping: ");
    int offset = strlen(line);
    for(int k = 0; k < ngpus; k++) {
        sprintf(line + offset, "%d ", g_hives[k]);
        offset = strlen(line);
    }
    if(nnets > 1) {
        sprintf(line + offset, "NET mapping: ");
        offset = strlen(line);
        for(int k = 0; k < nnets; k++) {
            sprintf(line + offset, "%d ", n_hives[k]);
            offset = strlen(line);
        }
    }
    INFO(SCCL_GRAPH, "%s", line);
    // the Rome-only early return above guarantees this CPU check; set the flag
    system->type |= RCCL_TOPO_4P2H_ROME;
    parseOptions(system, rome_model_68.options);
    // create 4P4H based on reference and remapped ids
    SCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
    return scclSuccess;
}
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#ifndef SCCL_ROME_MODELS_H_
#define SCCL_ROME_MODELS_H_
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
// Parse a model's ring-base description string into `graph`, remapping GPU
// indices through gpu_map and (when non-NULL) NET indices through net_map.
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map);
// Like parseGraph but for a model's tree-base string; GPU remapping only.
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map);
// Detect a Rome 4P2H topology and, on a match, populate `graph` from the
// matching model. All detectors return scclSuccess on a clean non-match.
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// Detect a chordal-ring topology and populate `graph` accordingly.
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// Detect a 16-GPU / 4-NUMA (1H16P) topology and populate `graph`.
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// Detect a 4-hive x 4-GPU (4H4P) topology and populate `graph`.
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment