通过线程池实现ipcsocket，满足节点内通信

a4ac3320 · lishen · d9d23f34 · d9d23f34 · d9d23f34 · d9d23f34
Commit a4ac3320 authored Jul 07, 2025 by lishen
20 changed files
--- a/src/hardware/graph/connect.cc
+++ b/src/hardware/graph/connect.cc
--- a/src/hardware/graph/devcomm.h
+++ b/src/hardware/graph/devcomm.h
--- a/src/hardware/graph/graph.h
+++ b/src/hardware/graph/graph.h
-#ifndef SCCL_GRAPH_H_
-#define SCCL_GRAPH_H_
-// #include "topo_utils.h"
-#include "devcomm.h"
-#include <limits.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <sched.h>
-namespace sccl {
-namespace hardware {
-namespace topology {
-#define MAX_XGMI_INTER_GPUS 4
-struct scclTopoGraph {
-    // Input / output
-    int id; // ring : 0, tree : 1, collnet : 2
-    int pattern;
-    int crossNic;
-    int collNet;
-    int minChannels;
-    int maxChannels;
-    // Output
-    int nChannels;
-    float bwIntra;
-    float bwInter;
-    float latencyInter;
-    int typeIntra;
-    int typeInter;
-    int sameChannels;
-    int nHops;
-    int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES];
-    int inter[MAXCHANNELS * 2];
-    int nIntraChannels;
-    int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
-    char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
-};
-struct scclTopoRanks {
-    int ringRecv[MAXCHANNELS];
-    int ringSend[MAXCHANNELS];
-    int ringPrev[MAXCHANNELS];
-    int ringNext[MAXCHANNELS];
-    int treeToParent[MAXCHANNELS];
-    int treeToChild0[MAXCHANNELS];
-    int treeToChild1[MAXCHANNELS];
-    int nvlsHeads[MAXCHANNELS];
-};
-// struct sccl::hardware::topology::topo::scclTopoSystem;
-// 对系统拓扑结构进行排序
-scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
-// 打印系统拓扑结构
-scclResult_t scclTopoPrint(struct scclTopoSystem* system);
-// 计算系统中的路径
-scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
-// // 释放系统拓扑结构
-// void scclTopoFree(struct scclTopoSystem* system);
-// // 裁剪系统拓扑结构
-// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
-// // 计算点对点通道
-// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
-// // 获取指定rank的Nvidia GPU信息
-// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
-// // 检查系统中是否所有路径都通过NVLink
-// int scclTopoPathAllNVLink(struct scclTopoSystem* system);
-// // 获取网络设备信息
-// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
-// // 检查两个设备之间是否存在点对点连接
-scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
-// // 检查是否使用GDR
-// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
-// // 获取内部网络设备信息
-// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
-// // 获取两个CUDA设备之间的连接类型
-// scclResult_t scclTopoGetLinkType(
-//     struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
-// // 检查是否需要刷新
-// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
-// // 检查两个设备是否在同一网络中
-// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
-// // Disable PXN
-// int scclPxnDisable(struct scclComm* comm);
-// // Get the intermediate ranks used by PXN
-// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
-// // 获取本地节点的rank
-// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
-// // 获取CPU亲和性
-// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
-// // 获取CPU类型信息
-// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
-// // 获取GPU数量
-// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
-// // 获取NVS数量
-// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
-// // 获取本地网络设备信息
-// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
-// // 获取本地GPU索引
-// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
-// // 初始化搜索，调用scclTopoCompute之前需要执行
-// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
-// // 计算拓扑图
-// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
-// // 打印拓扑图
-// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
-// // 导出拓扑图
-// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
-// // 设置预定义拓扑图
-// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
-// // 设置后处理拓扑图
-// scclResult_t scclTopoPostset(
-//     struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
-// // 设置基于树的后处理拓扑图
-// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
-// // 调整模型以适应计算能力
-// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);
-// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
-// #include "info.h"
-// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
-} // namespace topology
-} // namespace hardware
-} // namespace sccl
-#endif
--- a/src/hardware/graph/paths.cc
+++ b/src/hardware/graph/paths.cc
--- a/src/hardware/graph/rings.cc
+++ b/src/hardware/graph/rings.cc
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "core.h"
-namespace sccl {
-namespace hardware {
-namespace topology {
-namespace detect {
-#define MAXWIDTH 20
-#define PREFIXLEN 15
-#define STRLENGTH (PREFIXLEN + 5 * MAXWIDTH)
-void dumpLine(int* values, int nranks, const char* prefix) {
-    int prefixlen = strlen(prefix);
-    char line[STRLENGTH + 1];
-    line[STRLENGTH] = '\0';
-    memset(line, ' ', STRLENGTH);
-    strncpy(line, prefix, PREFIXLEN);
-    for(int i = 0; i < nranks && i < MAXWIDTH; i++)
-        sprintf(line + prefixlen + 4 * i, " %3d", values[i]);
-    INFO(SCCL_INIT, "%s", line);
-}
-scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
-    for(int r = 0; r < nrings; r++) {
-        char prefix[40];
-        /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
-        dumpLine(prev+r*nranks, nranks, prefix);
-        sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
-        dumpLine(next+r*nranks, nranks, prefix);*/
-        int current = rank;
-        for(int i = 0; i < nranks; i++) {
-            rings[r * nranks + i] = current;
-            current               = next[r * nranks + current];
-        }
-        sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
-        if(rank == 0)
-            dumpLine(rings + r * nranks, nranks, prefix);
-        if(current != rank) {
-            WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
-            return scclInternalError;
-        }
-        // Check that all ranks are there
-        for(int i = 0; i < nranks; i++) {
-            int found = 0;
-            for(int j = 0; j < nranks; j++) {
-                if(rings[r * nranks + j] == i) {
-                    found = 1;
-                    break;
-                }
-            }
-            if(found == 0) {
-                WARN("Error : ring %d does not contain rank %d", r, i);
-                return scclInternalError;
-            }
-        }
-    }
-    return scclSuccess;
-}
-} // namespace detect
-} // namespace topology
-} // namespace hardware
-} // namespace sccl
--- a/src/hardware/graph/rings.h
+++ b/src/hardware/graph/rings.h
-/*************************************************************************
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-namespace sccl {
-namespace hardware {
-namespace topology {
-namespace detect {
-scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
-} // namespace detect
-} // namespace topology
-} // namespace hardware
-} // namespace sccl
--- a/src/hardware/graph/rome_models.cc
+++ b/src/hardware/graph/rome_models.cc
--- a/src/hardware/graph/rome_models.h
+++ b/src/hardware/graph/rome_models.h
-#ifndef SCCL_ROME_MODELS_H_
-#define SCCL_ROME_MODELS_H_
-namespace sccl {
-namespace hardware {
-namespace topology {
-namespace detect {
-scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map);
-scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map);
-scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph);
-scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph);
-scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
-scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
-} // namespace detect
-} // namespace topology
-} // namespace hardware
-} // namespace sccl
-#endif
\ No newline at end of file
--- a/src/hardware/graph/sccl_bfloat16.h
+++ b/src/hardware/graph/sccl_bfloat16.h
-/**
- * MIT License
- *
- * Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-/*!\file
- * \brief sccl_bfloat16.h provides struct for sccl_bfloat16 typedef
- */
-#ifndef _SCCL_BFLOAT16_H_
-#define _SCCL_BFLOAT16_H_
-#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
-// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
-// include a minimal definition of sccl_bfloat16
-#include <stdint.h>
-/*! \brief Struct to represent a 16 bit brain floating point number. */
-namespace sccl {
-typedef struct {
-    uint16_t data;
-} sccl_bfloat16;
-} // namespace sccl
-#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <hip/hip_runtime.h>
-#include <ostream>
-#include <type_traits>
-namespace sccl {
-struct sccl_bfloat16 {
-    uint16_t data;
-    enum truncate_t {
-        truncate
-    };
-    __host__ __device__ sccl_bfloat16() = default;
-    // round upper 16 bits of IEEE float to convert to bfloat16
-    explicit __host__ __device__ sccl_bfloat16(float f) : data(float_to_bfloat16(f)) {}
-    explicit __host__ __device__ sccl_bfloat16(float f, truncate_t) : data(truncate_float_to_bfloat16(f)) {}
-    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
-    __host__ __device__ operator float() const {
-        union {
-            uint32_t int32;
-            float fp32;
-        } u = {uint32_t(data) << 16};
-        return u.fp32;
-    }
-    private:
-    static __host__ __device__ uint16_t float_to_bfloat16(float f) {
-        union {
-            float fp32;
-            uint32_t int32;
-        } u = {f};
-        if(~u.int32 & 0x7f800000) {
-            // When the exponent bits are not all 1s, then the value is zero, normal,
-            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-            // least significant bits of the float mantissa are greater than 0x8000,
-            // or if they are equal to 0x8000 and the least significant bit of the
-            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-            // has the value 0x7f, then incrementing it causes it to become 0x00 and
-            // the exponent is incremented by one, which is the next higher FP value
-            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
-            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-            // incrementing it causes it to become an exponent of 0xFF and a mantissa
-            // of 0x00, which is Inf, the next higher value to the unrounded value.
-            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
-        } else if(u.int32 & 0xffff) {
-            // When all of the exponent bits are 1, the value is Inf or NaN.
-            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-            // bit being 1. Signaling NaN is indicated by the most significant
-            // mantissa bit being 0 but some other bit(s) being 1. If any of the
-            // lower 16 bits of the mantissa are 1, we set the least significant bit
-            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-            // the bfloat16's mantissa bits are all 0.
-            u.int32 |= 0x10000; // Preserve signaling NaN
-        }
-        return uint16_t(u.int32 >> 16);
-    }
-    // Truncate instead of rounding, preserving SNaN
-    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f) {
-        union {
-            float fp32;
-            uint32_t int32;
-        } u = {f};
-        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
-    }
-};
-typedef struct {
-    uint16_t data;
-} sccl_bfloat16_public;
-static_assert(std::is_standard_layout<sccl_bfloat16>{},
-              "sccl_bfloat16 is not a standard layout type, and thus is "
-              "incompatible with C.");
-static_assert(std::is_trivial<sccl_bfloat16>{},
-              "sccl_bfloat16 is not a trivial type, and thus is "
-              "incompatible with C.");
-static_assert(sizeof(sccl_bfloat16) == sizeof(sccl_bfloat16_public) && offsetof(sccl_bfloat16, data) == offsetof(sccl_bfloat16_public, data),
-              "internal sccl_bfloat16 does not match public sccl_bfloat16");
-inline std::ostream& operator<<(std::ostream& os, const sccl_bfloat16& bf16) { return os << float(bf16); }
-inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a) { return a; }
-inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a) {
-    a.data ^= 0x8000;
-    return a;
-}
-inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) + float(b)); }
-inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) - float(b)); }
-inline __host__ __device__ sccl_bfloat16 operator*(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) * float(b)); }
-inline __host__ __device__ sccl_bfloat16 operator/(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) / float(b)); }
-inline __host__ __device__ bool operator<(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) < float(b); }
-inline __host__ __device__ bool operator==(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) == float(b); }
-inline __host__ __device__ bool operator>(sccl_bfloat16 a, sccl_bfloat16 b) { return b < a; }
-inline __host__ __device__ bool operator<=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a > b); }
-inline __host__ __device__ bool operator!=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a == b); }
-inline __host__ __device__ bool operator>=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a < b); }
-inline __host__ __device__ sccl_bfloat16& operator+=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a + b; }
-inline __host__ __device__ sccl_bfloat16& operator-=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a - b; }
-inline __host__ __device__ sccl_bfloat16& operator*=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a * b; }
-inline __host__ __device__ sccl_bfloat16& operator/=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a / b; }
-inline __host__ __device__ sccl_bfloat16& operator++(sccl_bfloat16& a) { return a += sccl_bfloat16(1.0f); }
-inline __host__ __device__ sccl_bfloat16& operator--(sccl_bfloat16& a) { return a -= sccl_bfloat16(1.0f); }
-inline __host__ __device__ sccl_bfloat16 operator++(sccl_bfloat16& a, int) {
-    sccl_bfloat16 orig = a;
-    ++a;
-    return orig;
-}
-inline __host__ __device__ sccl_bfloat16 operator--(sccl_bfloat16& a, int) {
-    sccl_bfloat16 orig = a;
-    --a;
-    return orig;
-}
-namespace std {
-constexpr __host__ __device__ bool isinf(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && !(a.data & 0x7f); }
-constexpr __host__ __device__ bool isnan(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && +(a.data & 0x7f); }
-constexpr __host__ __device__ bool iszero(sccl_bfloat16 a) { return !(a.data & 0x7fff); }
-inline sccl_bfloat16 sin(sccl_bfloat16 a) { return sccl_bfloat16(sinf(float(a))); }
-inline sccl_bfloat16 cos(sccl_bfloat16 a) { return sccl_bfloat16(cosf(float(a))); }
-} // namespace std
-} // namespace sccl
-#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
-#endif // _SCCL_BFLOAT16_H_
--- a/src/hardware/graph/search.cc
+++ b/src/hardware/graph/search.cc
--- a/src/hardware/graph/trees.cc
+++ b/src/hardware/graph/trees.cc
-#include "sccl.h"
-namespace sccl {
-namespace hardware {
-namespace topology {
-namespace detect {
-#define RANK_TO_INDEX(r) (rank > root ? rank - 1 : rank)
-/* Btree which alternates leaves and nodes.
- * Assumes root is 0, which conveniently builds a tree on powers of two,
- * (because we have pow2-1 ranks) which lets us manipulate bits.
- * Find first non-zero bit, then :
- * Find the parent :
- *   xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
- *   xx11[0] -> xx10[0] (3,7,11 below)
- * Find the children :
- *   xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
- *   xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
- *
- * Illustration :
- * 0---------------8
- *          ______/ \______
- *         4               12
- *       /   \            /  \
- *     2       6       10     \
- *    / \     / \     /  \     \
- *   1   3   5   7   9   11    13
- */
-scclResult_t scclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
-    int up, down0, down1;
-    int bit;
-    for(bit = 1; bit < nranks; bit <<= 1) {
-        if(bit & rank)
-            break;
-    }
-    if(rank == 0) {
-        *u  = -1;
-        *d0 = -1;
-        // Child rank is > 0 so it has to be our child 1, not 0.
-        *d1 = nranks > 1 ? bit >> 1 : -1;
-        return scclSuccess;
-    }
-    up = (rank ^ bit) | (bit << 1);
-    // if smaller than the parent, we are his first child, otherwise we're his second
-    if(up >= nranks)
-        up = (rank ^ bit);
-    *parentChildType = (rank < up) ? 0 : 1;
-    *u               = up;
-    int lowbit = bit >> 1;
-    // down0 is always within bounds
-    down0 = lowbit == 0 ? -1 : rank - lowbit;
-    down1 = lowbit == 0 ? -1 : rank + lowbit;
-    // Make sure down1 is within bounds
-    while(down1 >= nranks) {
-        down1 = lowbit == 0 ? -1 : rank + lowbit;
-        lowbit >>= 1;
-    }
-    *d0 = down0;
-    *d1 = down1;
-    return scclSuccess;
-}
-/* Build a double binary tree. Take the previous tree for the first tree.
- * For the second tree, we use a mirror tree (if nranks is even)
- *
- * 0---------------8                   3----------------11
- *          ______/ \                 / \______
- *         4         \               /         7
- *       /   \        \             /        /   \
- *     2       6       10         1        5      9
- *    / \     / \     /  \       / \      / \    / \
- *   1   3   5   7   9   11     0   2    4   6  8   10
- *
- * or shift it by one rank (if nranks is odd).
- *
- * 0---------------8            1---------------9
- *          ______/ \______              ______/ \______
- *         4               12           5                0
- *       /   \            /           /   \            /
- *     2       6       10           3       7       11
- *    / \     / \     /  \         / \     / \     /  \
- *   1   3   5   7   9   11       2   4   6   8  10   12
- */
-scclResult_t scclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
-    // First tree ... use a btree
-    scclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
-    // Second tree ... mirror or shift
-    if(nranks % 2 == 1) {
-        // shift
-        int shiftrank = (rank - 1 + nranks) % nranks;
-        int u, d0, d1;
-        scclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
-        *s1   = u == -1 ? -1 : (u + 1) % nranks;
-        *d1_0 = d0 == -1 ? -1 : (d0 + 1) % nranks;
-        *d1_1 = d1 == -1 ? -1 : (d1 + 1) % nranks;
-    } else {
-        // mirror
-        int u, d0, d1;
-        scclGetBtree(nranks, nranks - 1 - rank, &u, &d0, &d1, parentChildType1);
-        *s1   = u == -1 ? -1 : nranks - 1 - u;
-        *d1_0 = d0 == -1 ? -1 : nranks - 1 - d0;
-        *d1_1 = d1 == -1 ? -1 : nranks - 1 - d1;
-    }
-    return scclSuccess;
-}
-} // namespace detect
-} // namespace topology
-} // namespace hardware
-} // namespace sccl
--- a/src/hardware/graph/tuning.cc
+++ b/src/hardware/graph/tuning.cc
--- a/src/hardware/hardware_utils.cpp
+++ b/src/hardware/hardware_utils.cpp
+#include <stdint.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include "base.h"
+#include "hardware_utils.h"
+namespace sccl {
+namespace hardware {} // namespace hardware
+} // namespace sccl
--- a/src/hardware/hardware_utils.h
+++ b/src/hardware/hardware_utils.h
@@ -2,7 +2,13 @@
 #include <stdint.h>
 #include "base.h"
+#include "comm.h"
 namespace sccl {
-namespace hardware {} // namespace hardware
+namespace hardware {
+namespace ops {
+////
+} // namespace ops
+} // namespace hardware
 } // namespace sccl
--- a/src/hardware/net/device/net_ib.h
+++ b/src/hardware/net/device/net_ib.h
--- a/src/hardware/net/host/net_socket.h
+++ b/src/hardware/net/host/net_socket.h
--- a/src/hardware/net/ipc_socket/ipc_socket.cpp
+++ b/src/hardware/net/ipc_socket/ipc_socket.cpp
--- a/src/hardware/net/ipc_socket/ipc_socket.h
+++ b/src/hardware/net/ipc_socket/ipc_socket.h
--- a/src/hardware/net/net.cpp
+++ b/src/hardware/net/net.cpp
--- a/src/hardware/net/net.h
+++ b/src/hardware/net/net.h