Commit d9d23f34 authored by lishen

Initial Code for SCCL_v1

parent 57df3737
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "base.h"
#include "alloc.h"
#include "topo.h"
#include "xml.h"
#include "mpi.h"
#include "net.h"
#include "comm.h"
#include "graph.h"
using namespace sccl;
int main(int argc, char** argv) {
// struct sccl::hardware::topology::topo::scclXml* xml;
// SCCLCHECK(sccl::scclCalloc(&xml, 1));
// std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// struct sccl::hardware::topology::topo::scclTopoSystem* topoSystem;
// SCCLCHECK(sccl::hardware::topology::topo::scclTopoGetSystemFromXml(xml, &topoSystem));
// printf("topoSystem net.gdrSupport:%d\n", topoSystem->nodes[0].nodes[0].net.gdrSupport);
int rank, nranks;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nranks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
struct sccl::hardware::scclComm* comm = nullptr; // initialize so an indeterminate pointer is not passed to scclTopoComputePaths below
struct sccl::hardware::topology::topo::scclTopoSystem* topoSystem;
SCCLCHECK(sccl::hardware::topology::topo::scclTopoGetSystem(&topoSystem));
printf("topoSystem net.gdrSupport:%d\n", topoSystem->nodes[0].nodes[0].net.gdrSupport);
topoSystem->nRanks = nranks;
topoSystem->netGdrLevel = -2;
topoSystem->pivotA2AEnabled = false;
topoSystem->pivotA2ANumBiRings = 0;
topoSystem->ll128Enabled = false;
topoSystem->mscclEnabled = false;
topoSystem->treeDefined = false;
SCCLCHECK(sccl::hardware::topology::scclTopoComputePaths(topoSystem, comm));
MPI_Finalize();
return 0;
}
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include <string>
#include "base.h"
#include "alloc.h"
#include "xml.h"
using namespace sccl;
int main(int argc, char** argv) {
struct sccl::hardware::topology::topo::scclXml* xml;
SCCLCHECK(sccl::scclCalloc(&xml, 1));
std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
SCCLCHECK(scclTopoDumpXmlToFile("test_xml.xml", xml));
return 0;
} // main pass
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
hipcc /public/home/lishen/Code/rocSHMEM/SCCL_v1/examples/2_topo/1_demo_rocm/test_rocm_smi.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/rocm_smi_wrap.cc \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo_utils.cpp \
-o test_topo \
-std=c++17 -g -O3 -fopenmp -D__HIP_PLATFORM_HCC__ \
-I ./ -I /usr/include -I /opt/dtk/include \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/topo \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/ \
-L /usr/lib/x86_64-linux-gnu \
-L /usr/lib/ \
-lamdhip64 -lrocm_smi64
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "base.h"
#include "rocm_smi_wrap.h"
#include "topo_utils.h"
using namespace std;
using namespace sccl;
int main(int argc, char** argv) {
printf("hello world\n");
(void)rocm_smi_init();
uint32_t num_devs;
(void)rocm_smi_getNumDevice(&num_devs);
printf("num_devs=%d\n", num_devs);
uint32_t deviceIndex = 0;
char bus0[100] = "bus0";
(void)rocm_smi_getDevicePciBusIdString(deviceIndex, bus0, 100);
printf("bus0=%s\n", bus0);
RSMI_IO_LINK_TYPE rsmi_type;
int hops, count;
(void)rocm_smi_getLinkInfo(0, 8, &rsmi_type, &hops, &count);
printf("rsmi_type=%d, hops=%d, count=%d\n", rsmi_type, hops, count);
// struct sccl::hardware::topology::topo::scclXml* xml;
// SCCLCHECK(sccl::scclCalloc(&xml, 1));
// std::string xmlPath = "/opt/dtk/rccl/lib/built-in-BW-topo-input.xml";
// SCCLCHECK(scclTopoGetXmlFromFile(xmlPath.c_str(), xml, 1));
// struct sccl::hardware::topology::topo::scclTopoSystem* topoSystem;
// SCCLCHECK(sccl::hardware::topology::topo::scclTopoGetSystemFromXml(xml, &topoSystem));
return 0;
}
\ No newline at end of file
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "mpi.h"
#include "net.h"
#include "bootstrap_net.h"
using namespace sccl;
int main(int argc, char* argv[]) {
int rank, nranks;
int tag1, src, dst, cnt;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nranks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
printf("rank=%d, nranks=%d\n", rank, nranks);
// ----------------------------------------------------------------------- //
INFO(SCCL_LOG_TOPO, "Bootstrap ...\n");
(void)sccl::hardware::topology::bootstrap::bootstrap_net::bootstrapNetInit();
MPI_Finalize();
}
/*
Single-node run:
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root -np 2 1_mpi_init
SCCL_DEBUG_LEVEL=SCCL_LOG_INFO SCCL_DEBUG_POS=SCCL_LOG_CODEALL mpirun --allow-run-as-root -np 2 1_mpi_init
Multi-node run:
SCCL_DEBUG_LEVEL=SCCL_LOG_ABORT mpirun --allow-run-as-root --hostfile hostfile -np 16 ./1_mpi_init
*/
hipcc ./1_mpi_init.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvsymbols.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ibvwrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/net_ib.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/socket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/net_socket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/net_utils.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/rocm_wrap.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/param.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/bootstrap_net.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/ipcsocket.cpp \
/public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/proxy.cpp \
-o 1_mpi_init \
-std=c++17 -g -O3 -fopenmp -DROC_SHMEM -D__HIP_PLATFORM_HCC__ \
-I ./ -I /usr/include -I /opt/dtk/include \
-I /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/include/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/utils/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/include/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/device/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/host/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/net/ \
-I /public/home/lishen/Code/rocSHMEM/SCCL_v1/src/hardware/topology/bootstrap/ \
-L /public/home/lishen/Code/rocSHMEM/SCCL_v1 \
-L /usr/lib/x86_64-linux-gnu -libverbs -lrdmacm \
-L /public/home/lishen/Code/rocSHMEM/3rd_party/install/ompi/lib -lmpi
# gather functionality
Includes `gather` and `all-gather`.
# reduce functionality
Includes `reduce` and `all-reduce`.
# scatter functionality
Includes `scatter` and `reduce-scatter`.
#pragma once
#include <stdint.h>
#include "base.h"
#include "topo.h"
namespace sccl {
namespace hardware {
// scclUniqueInfo stores the per-rank information for each communication endpoint
struct scclUniqueInfo {
int rank; // global rank of this endpoint
int nRanks; // total number of ranks
int localRank; // rank of this endpoint within its local node
int localRanks; // total number of ranks on the local node
int cudaDev; // CUDA device ID
int gdrSupport; // whether GPUDirect RDMA (GDR) is supported
uint64_t hostHash; // host hash value
uint64_t pidHash; // process ID hash value
int64_t busId; // PCI bus ID
};
// // scclCommBase stores the basic communicator state
// struct scclCommBase {
// struct scclUniqueInfo* peerInfo; // pointer to the peerInfo entries for all ranks
// sccl::hardware::net::scclNet_t* scclNet; // pointer to the network backend used for communication
// };
} // namespace hardware
} // namespace sccl
This diff is collapsed.
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_DEVICE_H_
#define SCCL_DEVICE_H_
#include "check.h"
#include "sccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined(ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HCU_SDMA_FEATURE
#include "hsa/hsa_ext_amd.h"
#include "hsa_extra.h"
// #define HCU_PRINT_DEBUG
#endif
namespace sccl {
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#if defined(ENABLE_NPKIT) && defined(HCU_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) NpKit::CollectGpuEvent(event, size, cost, tm, scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HCU_SDMA_FEATURE
#define INIT_PRIMS_SDMA(prims, args) \
{ \
prims.rank = scclShmem.comm.rank; \
prims.useSdmaConfig = args->useSdma; \
prims.useSdmaCopy = args->useSdma && prims.sdmaQueueCtx; \
prims.preFnOps = args->preFnOps; \
prims.sdmaMinCopySize = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->minCopySize : 0; \
prims.sdmaCountEnable = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->copyCountEnable : 0; \
prims.sdmaCopyCount = 0; \
prims.allCopyCount = 0; \
}
#endif
#define SCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
typedef enum {
scclFuncBroadcast,
scclFuncReduce,
scclFuncAllGather,
scclFuncReduceScatter,
scclFuncAllReduce,
scclFuncSendRecv,
scclFuncSend,
scclFuncRecv,
scclFuncAllToAllPivot,
scclNumFuncs
} scclFunc_t;
extern const char* scclFuncStr[SCCL_NUM_FUNCTIONS + 2];
#define SCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define SCCL_ALGO_TREE 0 // tree algorithm
#define SCCL_ALGO_RING 1 // ring algorithm
#define SCCL_ALGO_COLLNET_DIRECT 2 // direct CollNet algorithm
#define SCCL_ALGO_COLLNET_CHAIN 3 // chained CollNet algorithm
#define SCCL_ALGO_NVLS 4 // NVLS (NVLink SHARP) algorithm
#define SCCL_ALGO_NVLS_TREE 5 // NVLS tree algorithm
extern const char* scclAlgoStr[SCCL_NUM_ALGORITHMS];
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define SCCL_PROTO_LL 0
#define SCCL_PROTO_LL128 1
#define SCCL_PROTO_SIMPLE 2
extern const char* scclProtoStr[SCCL_NUM_PROTOCOLS];
#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8
union scclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
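// Minimal illustrative sketch (assumption: helper added for exposition, not part of the original header):
// a receiver can trust data1/data2 only once both flags carry the expected value, which relies on the
// flags being stored after the data words as described in the comment above.
inline bool scclLLLineReady(const union scclLLFifoLine& line, uint32_t expectedFlag) { return line.flag1 == expectedFlag && line.flag2 == expectedFlag; }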
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_SIMPLE_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define SCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define SCCL_LL_FLAG_MAX 0x100
#define SCCL_LL_FLAG(a) ((uint32_t)((a) % SCCL_LL_FLAG_MAX))
#else
#define SCCL_LL_CLEAN_MASK 0x7ffffff8
#define SCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least SCCL_STEPS
static_assert(SCCL_LL_CLEAN_MASK % SCCL_STEPS == 0, "Invalid SCCL_LL_CLEAN_MASK value");
#define SCCL_LL128_LINESIZE 64
#define SCCL_LL128_LINEELEMS (SCCL_LL128_LINESIZE / sizeof(uint64_t))
#define SCCL_LL128_DATAELEMS (SCCL_LL128_LINEELEMS - 1)
#define SCCL_LL128_MAX_NTHREADS 256
#define SCCL_LL128_ELEMS_PER_THREAD 28
#define SCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define SCCL_LL128_SHMEM_SIZE (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * SCCL_LL128_MAX_NTHREADS)
#define SCCL_DIRECT_WRITE 0x01
#define SCCL_DIRECT_READ 0x02
#define SCCL_DIRECT_NIC 0x04
#define SCCL_IPC_WRITE 0x08
#define SCCL_IPC_READ 0x10
#define SCCL_NVLS_MIN_POLL 0x20
#ifdef HCU_SDMA_FEATURE
#define SDMA_CTX_VALID_MAGIC 0xD65A
#endif
struct scclConnInfo {
// Regular comm mechanism
char* buffs[SCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
uint64_t* tail; // Local for recv, remote for send
uint64_t* head; // Local for send, remote for recv
int flags; // Direct communication / other flags
int shared; // Buffers are shared
void** ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
int* sizesFifo; // Sizes fifo from GPU to proxy
int* offsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Current GPU's HDP register
#ifdef HCU_SDMA_FEATURE
struct sdmaQueueContext* sdmaQueueCtx;
uint32_t sdmaCtxValidMagic;
#endif
};
struct scclProxyConnector {
int tpRank;
int tpLocalRank;
int sameProcess;
struct scclProxyConnection* connection;
};
struct scclConnector {
int connected;
struct scclProxyConnector proxyConn;
struct scclTransportComm* transportComm;
void* transportResources;
struct scclConnInfo conn;
};
struct scclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal sccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int index; // This rank's index in the ring
};
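// Illustrative sketch (assumption: helper added for exposition, not an SCCL API):
// per the "Shortcuts for userRanks[1] and userRanks[n-1]" comment above, the ring
// neighbours of this device are the first and last entries of userRanks.
inline void scclRingNeighbors(const struct scclRing& ring, int nRanks, int* prev, int* next) {
// requires nRanks >= 2 so that userRanks[1] and userRanks[nRanks - 1] exist
*next = ring.userRanks[1];
*prev = ring.userRanks[nRanks - 1];
}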
// The root of each tree only has one node down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY 3
struct scclTree {
int depth;
int up;
int down[SCCL_MAX_TREE_ARITY];
};
#define SCCL_MAX_DIRECT_ARITY 7
struct scclDirect {
int depth;
int out;
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[SCCL_MAX_DIRECT_ARITY];
int down[SCCL_MAX_DIRECT_ARITY];
};
#define SCCL_CONN_IDX_P2P_NET 2
#define SCCL_MAX_NVLS_ARITY 8
#define SCCL_MAX_NVLS_TREE_ARITY 3
struct scclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
int up[SCCL_MAX_NVLS_ARITY];
int down;
int treeUp;
int treeDown[SCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#define SCCL_MAX_CONNS 3
struct scclChannelPeer {
struct scclConnector send[SCCL_MAX_CONNS];
struct scclConnector recv[SCCL_MAX_CONNS];
int refCount;
};
struct scclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8) /* set alignment to 8 bytes boundary */
/* scclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of scclWorkElem. */
#define SCCL_WORK_SIZE 256
enum scclWorkType : uint8_t {
scclWorkTypeUnused = 0,
scclWorkTypeColl = 1,
scclWorkTypeP2p = 2,
scclWorkTypeRegColl = 3
};
enum scclWorkP2PType : uint8_t {
scclWorkP2pTypeUnused = 0,
scclWorkP2pTypeSend,
scclWorkP2pTypeRecv
};
struct scclWorkHeader {
union {
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
};
uint16_t funcIndex;
uint8_t isLast : 1; // last work for this kernel
uint8_t inFifo : 1; // is this work in the fifo
enum scclWorkType type;
};
struct scclWorkElem {
union {
uint8_t flagBits;
struct {
uint8_t isUsed : 1, redOpArgIsPtr : 1, regUsed : 1, nWarps : 5;
};
};
uint8_t direct;
uint8_t bid;
uint8_t nChannels;
struct {
uint32_t root : 28;
uint32_t preFnOps : 1;
uint32_t useSdma : 1;
uint32_t connIndex : 2;
};
const void* sendbuff;
void* recvbuff;
size_t count;
union {
size_t lastChunkSize;
// Pivot A2A kernel computes chunk size itself.
// Instead, it needs the number of bidirectional rings.
size_t pivotA2ANumBiRings;
};
uint64_t redOpArg;
uint64_t opCount;
};
static_assert((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElem))) / sizeof(scclWorkElem) == 4,
"Sanity check: SCCL_MAX_WORK_ELEMENTS == 4");
#define SCCL_MAX_WORK_ELEMENTS 1
struct scclWorkElemP2p {
struct {
int32_t peer : 26;
uint32_t preFnOps : 1;
uint32_t useSdma : 1;
uint32_t connIndex : 2;
int32_t proto : 2;
};
union {
uint16_t flagBits;
struct {
enum scclWorkP2PType p2pType : 4;
uint16_t nWarps : 4;
uint16_t warpStart : 4;
uint16_t ngroups : 4;
};
};
uint16_t opCount;
// Important not to use any fields with greater than 4-byte alignment since
// we need sizeof(scclWorkElemP2p)==28, but that would be padded up to 32 if
// there were 8-byte fields.
// void* buff;
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
// size_t count;
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
int chunkSize;
};
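// Illustrative sketch (assumption: helpers added for exposition, not SCCL APIs):
// reassemble the 64-bit buffer pointer and count from the split 32-bit fields,
// per the "buff = buffHi32<<32 | buffLo32" comments above.
inline void* scclWorkElemP2pBuff(const struct scclWorkElemP2p& e) { return reinterpret_cast<void*>((static_cast<uint64_t>(e.buffHi32) << 32) | e.buffLo32); }
inline size_t scclWorkElemP2pCount(const struct scclWorkElemP2p& e) { return (static_cast<size_t>(e.countHi32) << 32) | e.countLo32; }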
static_assert(((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemP2p))) / sizeof(scclWorkElemP2p)) == 8,
"Sanity check: SCCL_MAX_WORK_ELEMENTS_P2P == 8");
#define SCCL_MAX_WORK_ELEMENTS_P2P 2
struct scclWorkElemReg {
struct scclWorkElem elem;
void* dnInputs[SCCL_MAX_DIRECT_ARITY + 1];
void* dnOutputs[SCCL_MAX_DIRECT_ARITY + 1];
void* upOutputs[SCCL_MAX_DIRECT_ARITY + 1];
};
#define SCCL_MAX_WORK_ELEMENTS_REG ((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemReg))) / sizeof(scclWorkElemReg))
static_assert(SCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: SCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define SCCL_MAX_GROUPS (SCCL_MAX_NTHREADS / WARP_SIZE)
struct scclWork {
struct scclWorkHeader header;
union {
char pad[SCCL_WORK_SIZE - sizeof(struct scclWorkHeader)];
struct scclWorkElem elems[SCCL_MAX_WORK_ELEMENTS];
struct scclWorkElemP2p p2pElems[SCCL_MAX_WORK_ELEMENTS_P2P];
struct scclWorkElemReg regElems[SCCL_MAX_WORK_ELEMENTS_REG];
};
};
static_assert(sizeof(struct scclWork) == SCCL_WORK_SIZE, "Sanity check: sizeof(struct scclWork) == SCCL_WORK_SIZE");
static_assert(sizeof(struct scclWork) % 16 == 0, "Sanity check: sizeof(struct scclWork)%16 == 0");
struct scclDevChannelPeer {
// Stripped version of scclChannelPeer where we only keep the scclConnInfo
// instead of the full scclConnector.
struct scclConnInfo send[SCCL_MAX_CONNS];
struct scclConnInfo recv[SCCL_MAX_CONNS];
};
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct scclProf {
uint32_t count;
uint32_t seq; // only entry from first launch is used
struct {
uint64_t line : 16;
uint64_t timeStamp : 48;
} elem[PROFILE_NUM_ITEMS];
};
static_assert(sizeof(struct scclProf) == 256, "scclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
typedef enum {
scclCollTraceNotReady = 0,
scclCollTraceKernelLaunchType = 1,
scclCollTraceKernelEndType = 2,
scclCollTraceCollLaunchType = 3,
scclCollTraceAbortType = 4,
scclCollTraceDataType = 5,
scclCollTraceCollElemType = (1 << 4),
scclCollTraceP2pElemType = (1 << 5),
} scclCollTraceDataType_t;
struct scclCollTrace {
uint8_t type;
uint8_t bid;
int16_t funcIndex;
uint32_t data_0;
uint64_t timeStamp;
union {
uint64_t opCount;
uint32_t p2pOpCount[2];
};
union {
uint64_t data_1;
struct {
uint8_t nWarps;
uint8_t bid;
uint8_t nChannels;
} coll;
struct {
int16_t peer;
uint8_t ngroups : 4;
uint8_t connIndex : 4;
uint8_t warpStart : 4;
uint8_t nWarps : 4;
} p2p[2];
};
};
static_assert(sizeof(struct scclCollTrace) == 8 * sizeof(int), "scclCollTrace must have a pow2 size");
union scclCollTraceTail {
uint32_t tail;
char padding[4096];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HCU_SDMA_FEATURE
struct sdmaQueueContext {
hsa_sdma_info_t* sdmaInfo;
uint64_t pkgIndex;
uint32_t queueId;
uint32_t sumSdmaCopyCount;
uint32_t sumAllCopyCount;
uint32_t queueLock;
uint32_t minCopySize;
uint32_t copyCountEnable;
uint32_t sdmaQueueDepth;
uint32_t sdmaPkgLen;
uint32_t sdmaQueueLen;
};
#endif
struct alignas(16) scclDevChannel {
struct scclDevChannelPeer** peers;
struct scclRing ring;
struct scclTree tree;
struct scclTree collnetChain;
struct scclDirect collnetDirect;
struct scclTree binTree;
struct scclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};
struct scclDevComm {
int rank;
int nRanks;
int buffSizes[SCCL_NUM_PROTOCOLS];
// Operation list for aggregation
int workFifoDepth;
struct scclWork* workFifoHeap; // may be cudaHost or GDR memory
// Flag to ask SCCL kernels to abort
volatile uint32_t* abortFlag;
// Channels, device side
struct scclDevChannel* channels /*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
struct scclCollTrace* collTrace;
union scclCollTraceTail* collTraceTail;
pthread_t collTraceThread;
#endif
#ifdef ENABLE_PROFILING
struct scclProf* devProf;
#endif
#if defined(ENABLE_TIMELINE)
TimelineGpuEventContext* gpuEventContext;
#endif
#if defined(ENABLE_NPKIT) || defined(ENABLE_TIMELINE)
uint64_t* cpuTimestamp;
#endif
};
struct alignas(16) scclDevCommAndChannels {
struct scclDevComm comm;
struct scclDevChannel channels[MAXCHANNELS];
};
#ifdef __CUDA_ARCH__
#define SCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define SCCL_CUDA_ARCH 0
#endif
template <typename T>
__host__ __device__ constexpr T min_constexpr(T a) {
return a;
}
template <typename T, typename... Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts... c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template <typename T>
__host__ __device__ constexpr T max_constexpr(T a) {
return a;
}
template <typename T, typename... Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts... c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int scclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack - 1) / bytePerPack);
}
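// Illustrative sanity checks (assumption: added for exposition; the values are examples only):
// with 16-byte packs, a 16-instruction cap and 64 in-flight bytes, the unroll is ceil(64/16) = 4.
static_assert(min_constexpr(3, 1, 2) == 1, "min_constexpr picks the smallest argument");
static_assert(max_constexpr(3, 1, 2) == 3, "max_constexpr picks the largest argument");
static_assert(scclCalcUnroll(/*bytePerPack=*/16, /*insns=*/16, /*bytes=*/64) == 4, "unroll = ceil(bytes/bytePerPack), capped at insns");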
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to SCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int scclCollUnroll(int cudaArch = SCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int scclNvlsUnrollBytes(int cudaArch = SCCL_CUDA_ARCH) { return 4 * 16; }
__host__ __device__ constexpr int scclNvlsUnrollInsns(int cudaArch = SCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int scclNvlsUnroll(int bytePerPack, int cudaArch = SCCL_CUDA_ARCH) {
return scclCalcUnroll(bytePerPack, scclNvlsUnrollInsns(cudaArch), scclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int scclShmemScratchWarpSize(int cudaArch = SCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */ 0,
/*LL128 */ (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * WARP_SIZE) * sizeof(uint64_t),
/*SIMPLE*/ (scclCollUnroll(cudaArch) * WARP_SIZE + 1) * 16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */ WARP_SIZE * (cudaArch >= 900 ? scclNvlsUnrollBytes(cudaArch) : 0) + 16) +
15) &
-16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int scclShmemDynamicSize(int cudaArch = SCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : scclShmemScratchWarpSize(cudaArch) * (SCCL_MAX_NTHREADS / WARP_SIZE);
}
} // namespace sccl
#endif
#ifndef SCCL_GRAPH_H_
#define SCCL_GRAPH_H_
// #include "topo_utils.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
namespace sccl {
namespace hardware {
namespace topology {
#define MAX_XGMI_INTER_GPUS 4
struct scclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
int pattern;
int crossNic;
int collNet;
int minChannels;
int maxChannels;
// Output
int nChannels;
float bwIntra;
float bwInter;
float latencyInter;
int typeIntra;
int typeInter;
int sameChannels;
int nHops;
int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES];
int inter[MAXCHANNELS * 2];
int nIntraChannels;
int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
};
struct scclTopoRanks {
int ringRecv[MAXCHANNELS];
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
// struct sccl::hardware::topology::topo::scclTopoSystem;
// Sort the system topology
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
// Print the system topology
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Compute the paths through the system
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
// // Free the system topology
// void scclTopoFree(struct scclTopoSystem* system);
// // Trim the system topology
// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
// // Compute the point-to-point channels
// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
// // Get the NVB GPU info for a given rank
// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
// // Check whether all paths in the system go over NVLink
// int scclTopoPathAllNVLink(struct scclTopoSystem* system);
// // Get the network device info
// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
// // Check whether a point-to-point connection exists between two devices
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
// // Check whether GDR is used
// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// // Get the intra-node network device info
// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
// // Get the link type between two CUDA devices
// scclResult_t scclTopoGetLinkType(
// struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
// // Check whether a flush is needed
// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
// // Check whether two devices are on the same network
// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
// // Check whether PXN is disabled
// int scclPxnDisable(struct scclComm* comm);
// // Get the intermediate ranks used for PXN
// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
// // Get the local rank within this node
// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // Get the CPU affinity
// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// // Get the CPU type info
// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// // Get the GPU count
// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// // Get the NVSwitch count
// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// // Get the local network device info
// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// // Get the local GPU index
// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// // Initialize the search; must be called before scclTopoCompute
// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
// // Compute the topology graph
// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Print the topology graph
// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // Dump the topology graphs
// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
// // Preset the topology graphs
// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
// // Post-process the topology graphs
// scclResult_t scclTopoPostset(
// struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
// // Tree-based post-processing of the topology graph
// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
// // Tune the model for the compute capability range
// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);
// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
// #include "info.h"
// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
This diff is collapsed.
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
This diff is collapsed.