Update RCCL source code and apply tuning optimizations

08b50ba4 · wanghan · f351fea1 · 08b50ba4 · 08b50ba4 · 08b50ba4
Commit 08b50ba4 authored Apr 02, 2026 by wanghan
8 changed files
--- a/rccl-dtk-25.04/rtest.xml
+++ b/rccl-dtk-25.04/rtest.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Test-set definition: named runs of the rccl-UnitTests command, grouped into the "psdb" and "osdb" sets. -->
+<testset>
+    <!-- Typically run with environment variables NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 -->
+    <!-- NOTE: despite its name, GTEST_FILTER holds the whole command prefix (command name, color flag
+         and the start of the filter option); each run below places its pattern right after the '='.
+         Presumably {GTEST_FILTER} is substituted by the test runner; confirm against the runner. -->
+    <var name="GTEST_FILTER">rccl-UnitTests  --gtest_color=yes --gtest_filter=</var>
+    <!-- psdb: only tests matching *sum_float32*; gtest XML report written to output_psdb.xml -->
+    <test sets="psdb">
+        <run name="all-psdb">{GTEST_FILTER}*sum_float32* --gtest_output=xml:output_psdb.xml </run>
+    </test>
+    <!-- osdb: filter '*' (every test); gtest XML report written to output_osdb.xml -->
+    <test sets="osdb">
+        <run name="all-osdb">{GTEST_FILTER}* --gtest_output=xml:output_osdb.xml </run>
+    </test>
+</testset>
--- a/rccl-dtk-25.04/src/Makefile
+++ b/rccl-dtk-25.04/src/Makefile
@@ -9,7 +9,7 @@ include ../makefiles/version.mk

 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc tuner.cc \
+LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
 		misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
 		misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
 		misc/ipcsocket.cc \

--- a/rccl-dtk-25.04/src/enqueue.cc
+++ b/rccl-dtk-25.04/src/enqueue.cc
@@ -9,7 +9,6 @@
 #include "argcheck.h"
 #include "coll_net.h"
 #include "graph/topo.h"
-#include "tuner.h"
 #include <hip/hip_runtime.h>
 #include <hip/hip_ext.h>
 #include <dlfcn.h>
@@ -22,7 +21,6 @@
 #include "hsa_extra.h"
 #include <cstring> // std::memcpy
 #include <cinttypes> // PRIx64
-#include <chrono>
 #if defined(ENABLE_NPKIT)
 #include "npkit/npkit.h"
 #endif
@@ -1753,35 +1751,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
      info->comm->localRankToRank[info->comm->localRank]);
  TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast<int64_t>(info->sendbuff), reinterpret_cast<int64_t>(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream);

-  // Runtime tuner: get configuration for this workload
-  if (info->comm->tuner.enabled && info->coll != ncclFuncSend && info->coll != ncclFuncRecv) {
-    int tunerAlgo, tunerProto;
-    bool needsTuning = false;
-    auto startTime = std::chrono::high_resolution_clock::now();
-    
-    NCCLCHECKGOTO(ncclTunerGetConfig(info->comm, (struct ncclInfo*)info, &tunerAlgo, &tunerProto, &needsTuning), ret, fail);
-    
-    if (needsTuning) {
-      // Override algorithm and protocol for testing
-      // Note: This is a simplified approach - in production you'd need to modify
-      // the algorithm selection logic in computeColl
-      INFO(NCCL_TUNER, "Rank %d: Will test algo=%d proto=%d", info->comm->rank, tunerAlgo, tunerProto);
-    }
-    
-    NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);
-    
-    // If tuning, measure performance
-    if (needsTuning) {
-      // Synchronize to measure time
-      CUDACHECKGOTO(hipStreamSynchronize(info->stream), ret, fail);
-      auto endTime = std::chrono::high_resolution_clock::now();
-      float elapsed = std::chrono::duration<float, std::milli>(endTime - startTime).count();
-      
-      NCCLCHECKGOTO(ncclTunerRecordPerformance(info->comm, elapsed), ret, fail);
-    }
-  } else {
-    NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);
-  }
+  NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);

 exit:
  if (devOld != -1) CUDACHECK(cudaSetDevice(devOld));

--- a/rccl-dtk-25.04/src/graph/tuning.cc
+++ b/rccl-dtk-25.04/src/graph/tuning.cc
@@ -554,6 +554,56 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
  float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
  float lat = info->comm->latencies[info->coll][algorithm][protocol];
+  // Per-nRanks size-range overrides derived from the qz.txt performance data. A matching
+  // (coll, algorithm, protocol, nBytes) reports time 0 (presumably so the caller's selection
+  // prefers that pair; confirm against the caller). Each range is the "original" range widened:
+  //   - diff≥8%: size_range = [original×0.5, original×2.0]
+  //   - diff<8%: size_range = [original×0.75, original×1.5]
+  // Guarded by bw != 0 so a pair with zero bandwidth still gets time = -1 from the check below.
+  // Tuned configuration: nRanks == 8. Lower bounds are exclusive, upper bounds are inclusive.
+  if (bw != 0 && info->comm->nRanks == 8) {
+    // ncclFuncAllReduce NCCL_ALGO_TREE NCCL_PROTO_LL: (8, 1024] bytes (original 16~512)
+    if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_TREE && protocol == NCCL_PROTO_LL && info->nBytes > 8 && info->nBytes <= 1024) {
+      *time = 0;
+      return ncclSuccess;
+    }
+    // ncclFuncAllReduce NCCL_ALGO_TREE NCCL_PROTO_LL: (1536, 196608] bytes (original 2048~131072)
+    if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_TREE && protocol == NCCL_PROTO_LL && info->nBytes > 1536 && info->nBytes <= 196608) {
+      *time = 0;
+      return ncclSuccess;
+    }
+    // ncclFuncAllReduce NCCL_ALGO_RING NCCL_PROTO_LL: (262144, 4194304] bytes (original 524288~2097152)
+    if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_LL && info->nBytes > 262144 && info->nBytes <= 4194304) {
+      *time = 0;
+      return ncclSuccess;
+    }
+    // ncclFuncAllReduce NCCL_ALGO_RING NCCL_PROTO_SIMPLE: (2097152, 2147483648] bytes (original 4194304~1073741824)
+    if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->nBytes > 2097152 && info->nBytes <= 2147483648) {
+      *time = 0;
+      return ncclSuccess;
+    }
+    // ncclFuncBroadcast NCCL_ALGO_RING NCCL_PROTO_SIMPLE: (6144, 262144] bytes (original 8192~131072)
+    if (info->coll == ncclFuncBroadcast && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->nBytes > 6144 && info->nBytes <= 262144) {
+      *time = 0;
+      return ncclSuccess;
+    }
+    // ncclFuncReduce NCCL_ALGO_RING NCCL_PROTO_SIMPLE: (6144, 262144] bytes (original 8192~131072)
+    if (info->coll == ncclFuncReduce && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->nBytes > 6144 && info->nBytes <= 262144) {
+      *time = 0;
+      return ncclSuccess;
+    }
+    // ncclFuncAllGather NCCL_ALGO_RING NCCL_PROTO_LL: (524288, 2097152] bytes
+    if (info->coll == ncclFuncAllGather && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_LL && info->nBytes > 524288 && info->nBytes <= 2097152) {
+      *time = 0;
+      return ncclSuccess;
+    }
+    // ncclFuncReduceScatter NCCL_ALGO_RING NCCL_PROTO_LL: (524288, 2097152] bytes
+    if (info->coll == ncclFuncReduceScatter && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_LL && info->nBytes > 524288 && info->nBytes <= 2097152) {
+      *time = 0;
+      return ncclSuccess;
+    }
+  }
+
  if (bw == 0) {
    *time = -1.0; return ncclSuccess;
  }

--- a/rccl-dtk-25.04/src/include/comm.h
+++ b/rccl-dtk-25.04/src/include/comm.h
@@ -13,8 +13,6 @@
 #include "collectives.h"
 #include "proxy.h"
 #include "strongstream.h"
-#include <map>
-#include <chrono>
 #if defined (ENABLE_TIMELINE)
 #include "timeline/timeline.h"
 #endif
@@ -404,26 +402,6 @@ struct ncclComm {

  // Whether this comm is compatible with MSCCL
  bool mscclCompatible;
-
-  // Runtime tuner for algorithm and protocol selection
-  struct {
-    bool enabled;                    // Whether tuning is enabled
-    std::map<uint64_t, int>* workloadCache;  // workload hash -> best config index
-    
-    // Current testing configuration
-    int currentAlgo;                 // NCCL_ALGO_RING/TREE/COLLNET
-    int currentProto;                // NCCL_PROTO_SIMPLE/LL/LL128
-    
-    // Performance tracking
-    float bestTime;
-    int bestAlgo;
-    int bestProto;
-    
-    // Search state
-    int searchStep;                  // Current search step
-    bool isSearching;                // Whether currently searching
-    uint64_t currentWorkloadHash;    // Current workload being tuned
-  } tuner;
 };

 enum ncclLaunchMode {

--- a/rccl-dtk-25.04/src/include/tuner.h
+++ b/rccl-dtk-25.04/src/include/tuner.h
-/*************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Runtime tuner header
- ************************************************************************/
-
-#ifndef NCCL_TUNER_H_
-#define NCCL_TUNER_H_
-
-#include "nccl.h"
-#include "comm.h"
-
-// Get configuration for current workload (either cached or for testing)
-ncclResult_t ncclTunerGetConfig(struct ncclComm* comm, struct ncclInfo* info, 
-                                 int* algo, int* proto, bool* needsTuning);
-
-// Record performance of last executed configuration
-ncclResult_t ncclTunerRecordPerformance(struct ncclComm* comm, float elapsedMs);
-
-#endif
--- a/rccl-dtk-25.04/src/init.cc
+++ b/rccl-dtk-25.04/src/init.cc
@@ -34,7 +34,6 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
-#include <cfloat>
 #include "graph/topo.h"
 #include "graph/xml.h"
 #include "archinfo.h"
@@ -1966,19 +1965,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  // update communicator state
  comm->initState = ncclSuccess;

-  // Initialize runtime tuner
-  comm->tuner.enabled = (getenv("RCCL_TUNER_ENABLE") != NULL);
-  if (comm->tuner.enabled) {
-    comm->tuner.workloadCache = new std::map<uint64_t, int>();
-    comm->tuner.isSearching = false;
-    comm->tuner.searchStep = 0;
-    comm->tuner.currentWorkloadHash = 0;
-    comm->tuner.bestTime = FLT_MAX;
-    INFO(NCCL_INIT, "Runtime tuner enabled for comm %p", comm);
-  } else {
-    comm->tuner.workloadCache = NULL;
-  }
-
  // Trace this call for replay tool
  if (job->parent) {
    /* unlink child abort flag. */

--- a/rccl-dtk-25.04/src/tuner.cc
+++ b/rccl-dtk-25.04/src/tuner.cc
-/*************************************************************************
- * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Runtime tuner for RCCL - Simple brute force search for algorithm and protocol
- ************************************************************************/
-
-#include "comm.h"
-#include "core.h"
-#include <cfloat>
-
-// Hash function for workload identification
-static uint64_t hashWorkload(ncclFunc_t coll, size_t count, ncclDataType_t datatype) {
-  uint64_t hash = 0xdeadbeef;
-  hash = (hash << 8) | (uint64_t)coll;
-  hash ^= (uint64_t)count;
-  hash = (hash << 8) | (uint64_t)datatype;
-  return hash;
-}
-
-// Get algorithm and protocol configuration for testing
-static void getTestConfig(int step, int* algo, int* proto) {
-  // Test configurations: Ring+Simple, Ring+LL, Tree+Simple, Tree+LL
-  int algoList[] = {NCCL_ALGO_RING, NCCL_ALGO_TREE};
-  int protoList[] = {NCCL_PROTO_SIMPLE, NCCL_PROTO_LL};
-  
-  int algoIdx = step / 2;
-  int protoIdx = step % 2;
-  
-  *algo = algoList[algoIdx];
-  *proto = protoList[protoIdx];
-}
-
-ncclResult_t ncclTunerGetConfig(struct ncclComm* comm, struct ncclInfo* info, 
-                                 int* algo, int* proto, bool* needsTuning) {
-  if (!comm->tuner.enabled) {
-    *needsTuning = false;
-    return ncclSuccess;
-  }
-  
-  // Calculate workload hash
-  uint64_t workloadHash = hashWorkload(info->coll, info->count, info->datatype);
-  
-  // Check if we have a cached result
-  auto it = comm->tuner.workloadCache->find(workloadHash);
-  
-  if (it != comm->tuner.workloadCache->end()) {
-    // Use cached configuration
-    int bestConfig = it->second;
-    *algo = bestConfig / 10;
-    *proto = bestConfig % 10;
-    *needsTuning = false;
-    
-    INFO(NCCL_TUNER, "Rank %d: Using cached config for workload %llx: algo=%d proto=%d",
-         comm->rank, (unsigned long long)workloadHash, *algo, *proto);
-    return ncclSuccess;
-  }
-  
-  // New workload - start tuning
-  if (!comm->tuner.isSearching || comm->tuner.currentWorkloadHash != workloadHash) {
-    // Start new search
-    comm->tuner.isSearching = true;
-    comm->tuner.currentWorkloadHash = workloadHash;
-    comm->tuner.searchStep = 0;
-    comm->tuner.bestTime = FLT_MAX;
-    
-    INFO(NCCL_INIT, "Rank %d: New workload %llx detected, starting tuning (coll=%d count=%zu dtype=%d)",
-         comm->rank, (unsigned long long)workloadHash, info->coll, info->count, info->datatype);
-  }
-  
-  const int totalConfigs = 4;  // 2 algos × 2 protos
-  
-  if (comm->tuner.searchStep < totalConfigs) {
-    // Get current test configuration
-    getTestConfig(comm->tuner.searchStep, algo, proto);
-    comm->tuner.currentAlgo = *algo;
-    comm->tuner.currentProto = *proto;
-    *needsTuning = true;
-    
-    INFO(NCCL_TUNER, "Rank %d: Testing config %d/%d: algo=%d proto=%d",
-         comm->rank, comm->tuner.searchStep + 1, totalConfigs, *algo, *proto);
-    
-    return ncclSuccess;
-  }
-  
-  // Search complete - use best configuration
-  *algo = comm->tuner.bestAlgo;
-  *proto = comm->tuner.bestProto;
-  *needsTuning = false;
-  
-  return ncclSuccess;
-}
-
-ncclResult_t ncclTunerRecordPerformance(struct ncclComm* comm, float elapsedMs) {
-  if (!comm->tuner.enabled || !comm->tuner.isSearching) {
-    return ncclSuccess;
-  }
-  
-  INFO(NCCL_TUNER, "Rank %d: Config %d (algo=%d proto=%d) time: %.3f ms",
-       comm->rank, comm->tuner.searchStep, 
-       comm->tuner.currentAlgo, comm->tuner.currentProto, elapsedMs);
-  
-  // Update best configuration
-  if (elapsedMs < comm->tuner.bestTime) {
-    comm->tuner.bestTime = elapsedMs;
-    comm->tuner.bestAlgo = comm->tuner.currentAlgo;
-    comm->tuner.bestProto = comm->tuner.currentProto;
-  }
-  
-  comm->tuner.searchStep++;
-  
-  const int totalConfigs = 4;
-  if (comm->tuner.searchStep >= totalConfigs) {
-    // Tuning complete
-    INFO(NCCL_INIT, "Rank %d: Tuning complete for workload %llx! Best: algo=%d proto=%d time=%.3f ms",
-         comm->rank, (unsigned long long)comm->tuner.currentWorkloadHash,
-         comm->tuner.bestAlgo, comm->tuner.bestProto, comm->tuner.bestTime);
-    
-    // Cache the result
-    int bestConfig = comm->tuner.bestAlgo * 10 + comm->tuner.bestProto;
-    (*comm->tuner.workloadCache)[comm->tuner.currentWorkloadHash] = bestConfig;
-    
-    comm->tuner.isSearching = false;
-  }
-  
-  return ncclSuccess;
-}