Commit 08b50ba4 authored by wanghan's avatar wanghan
Browse files

Update RCCL source code and apply tuning optimizations

parent f351fea1
<?xml version="1.0" encoding="UTF-8"?>
<testset>
<!-- Typically run with environment variables NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 -->
<var name="GTEST_FILTER">rccl-UnitTests --gtest_color=yes --gtest_filter=</var>
<test sets="psdb">
<run name="all-psdb">{GTEST_FILTER}*sum_float32* --gtest_output=xml:output_psdb.xml </run>
</test>
<test sets="osdb">
<run name="all-osdb">{GTEST_FILTER}* --gtest_output=xml:output_osdb.xml </run>
</test>
</testset>
......@@ -9,7 +9,7 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc tuner.cc \
LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
misc/ipcsocket.cc \
......
......@@ -9,7 +9,6 @@
#include "argcheck.h"
#include "coll_net.h"
#include "graph/topo.h"
#include "tuner.h"
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include <dlfcn.h>
......@@ -22,7 +21,6 @@
#include "hsa_extra.h"
#include <cstring> // std::memcpy
#include <cinttypes> // PRIx64
#include <chrono>
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
......@@ -1753,35 +1751,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
info->comm->localRankToRank[info->comm->localRank]);
TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast<int64_t>(info->sendbuff), reinterpret_cast<int64_t>(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream);
// Runtime tuner: get configuration for this workload
if (info->comm->tuner.enabled && info->coll != ncclFuncSend && info->coll != ncclFuncRecv) {
int tunerAlgo, tunerProto;
bool needsTuning = false;
auto startTime = std::chrono::high_resolution_clock::now();
NCCLCHECKGOTO(ncclTunerGetConfig(info->comm, (struct ncclInfo*)info, &tunerAlgo, &tunerProto, &needsTuning), ret, fail);
if (needsTuning) {
// Override algorithm and protocol for testing
// Note: This is a simplified approach - in production you'd need to modify
// the algorithm selection logic in computeColl
INFO(NCCL_TUNER, "Rank %d: Will test algo=%d proto=%d", info->comm->rank, tunerAlgo, tunerProto);
}
NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);
// If tuning, measure performance
if (needsTuning) {
// Synchronize to measure time
CUDACHECKGOTO(hipStreamSynchronize(info->stream), ret, fail);
auto endTime = std::chrono::high_resolution_clock::now();
float elapsed = std::chrono::duration<float, std::milli>(endTime - startTime).count();
NCCLCHECKGOTO(ncclTunerRecordPerformance(info->comm, elapsed), ret, fail);
}
} else {
NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);
}
NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);
exit:
if (devOld != -1) CUDACHECK(cudaSetDevice(devOld));
......
......@@ -554,6 +554,56 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
float lat = info->comm->latencies[info->coll][algorithm][protocol];
// Hand-tuned overrides derived from the performance data in qz.txt.
// For selected (collective, algorithm, protocol, message size) combinations
// the predicted time is set to 0 and the function returns early, skipping
// the bandwidth/latency computation that follows this block.
// NOTE(review): presumably the caller picks the combination with the lowest
// predicted time, so a 0 forces this choice — confirm against the caller.
//
// The table is organised per nRanks value so that more rank counts can be
// added later. Each size window is the originally measured window widened by:
// - diff>=8%: size_range = [original x 0.5,  original x 2.0]
// - diff<8%:  size_range = [original x 0.75, original x 1.5]
// In code, every window is exclusive at the lower bound and inclusive at the
// upper bound.
// NOTE(review): "diff" is not defined in this file; presumably the measured
// performance gap between candidate configurations — confirm.
//
// Tuned configuration: nRanks == 8
if (info->comm->nRanks == 8) {
// ncclFuncAllReduce NCCL_ALGO_TREE NCCL_PROTO_LL: 8~1024 bytes (original 16~512)
if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_TREE && protocol == NCCL_PROTO_LL && info->nBytes > 8 && info->nBytes <= 1024) {
*time = 0;
return ncclSuccess;
}
// ncclFuncAllReduce NCCL_ALGO_TREE NCCL_PROTO_LL: 1536~196608 bytes (original 2048~131072)
// Sizes in (1024, 1536] match neither TREE/LL window and fall through to the model.
if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_TREE && protocol == NCCL_PROTO_LL && info->nBytes > 1536 && info->nBytes <= 196608) {
*time = 0;
return ncclSuccess;
}
// ncclFuncAllReduce NCCL_ALGO_RING NCCL_PROTO_LL: 262144~4194304 bytes (original 524288~2097152)
if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_LL && info->nBytes > 262144 && info->nBytes <= 4194304) {
*time = 0;
return ncclSuccess;
}
// ncclFuncAllReduce NCCL_ALGO_RING NCCL_PROTO_SIMPLE: 2097152~2147483648 bytes (original 4194304~1073741824)
// NOTE(review): this window overlaps the RING/LL window above on
// (2097152, 4194304]; both combinations report 0 there, so the winner
// depends on how the caller breaks ties — confirm this is intended.
if (info->coll == ncclFuncAllReduce && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->nBytes > 2097152 && info->nBytes <= 2147483648) {
*time = 0;
return ncclSuccess;
}
// ncclFuncBroadcast NCCL_ALGO_RING NCCL_PROTO_SIMPLE: 6144~262144 bytes (original 8192~131072)
if (info->coll == ncclFuncBroadcast && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->nBytes > 6144 && info->nBytes <= 262144) {
*time = 0;
return ncclSuccess;
}
// ncclFuncReduce NCCL_ALGO_RING NCCL_PROTO_SIMPLE: 6144~262144 bytes (original 8192~131072)
if (info->coll == ncclFuncReduce && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->nBytes > 6144 && info->nBytes <= 262144) {
*time = 0;
return ncclSuccess;
}
// ncclFuncAllGather NCCL_ALGO_RING NCCL_PROTO_LL: 524288~2097152 bytes
// NOTE(review): unlike the entries above, this window does not look widened — confirm.
if (info->coll == ncclFuncAllGather && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_LL && info->nBytes > 524288 && info->nBytes <= 2097152) {
*time = 0;
return ncclSuccess;
}
// ncclFuncReduceScatter NCCL_ALGO_RING NCCL_PROTO_LL: 524288~2097152 bytes
// NOTE(review): same un-widened window as AllGather — confirm.
if (info->coll == ncclFuncReduceScatter && algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_LL && info->nBytes > 524288 && info->nBytes <= 2097152) {
*time = 0;
return ncclSuccess;
}
}
if (bw == 0) {
*time = -1.0; return ncclSuccess;
}
......
......@@ -13,8 +13,6 @@
#include "collectives.h"
#include "proxy.h"
#include "strongstream.h"
#include <map>
#include <chrono>
#if defined (ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
......@@ -404,26 +402,6 @@ struct ncclComm {
// Whether this comm is compatible with MSCCL
bool mscclCompatible;
// Runtime tuner for algorithm and protocol selection.
// State for a per-communicator brute-force search: each new workload is run
// once per candidate (algorithm, protocol) pair, timed, and the fastest pair
// is cached for later calls with the same workload hash.
struct {
bool enabled; // Whether tuning is enabled (set at init when RCCL_TUNER_ENABLE is present in the environment)
std::map<uint64_t, int>* workloadCache; // workload hash -> best config encoded as algo * 10 + proto; allocated with new when enabled, NULL otherwise
// Configuration currently being tested (valid while isSearching is true)
int currentAlgo; // NCCL_ALGO_RING/TREE/COLLNET
int currentProto; // NCCL_PROTO_SIMPLE/LL/LL128
// Performance tracking for the workload currently being searched
float bestTime; // Lowest elapsed time recorded so far, in milliseconds; FLT_MAX until the first measurement
int bestAlgo; // Algorithm that produced bestTime
int bestProto; // Protocol that produced bestTime
// Search state
int searchStep; // Index of the next candidate configuration to test; reset to 0 for each new workload
bool isSearching; // Whether a search is currently in progress
uint64_t currentWorkloadHash; // Hash of the workload currently being tuned
} tuner;
};
enum ncclLaunchMode {
......
/*************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Runtime tuner header
************************************************************************/
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_
#include "nccl.h"
#include "comm.h"
// Get configuration for current workload (either cached or for testing).
//
// comm        - communicator whose tuner state is consulted and updated.
// info        - describes the collective; its coll, count and datatype fields
//               identify the workload.
// algo, proto - receive the algorithm/protocol to use. They are left
//               untouched when the tuner is disabled on this communicator.
// needsTuning - set to true when the returned pair is a candidate that still
//               has to be timed; the caller is then expected to report the
//               elapsed time through ncclTunerRecordPerformance(). Set to
//               false when the pair comes from the cache or the tuner is off.
//
// Returns ncclSuccess.
ncclResult_t ncclTunerGetConfig(struct ncclComm* comm, struct ncclInfo* info,
int* algo, int* proto, bool* needsTuning);
// Record performance of last executed configuration.
//
// comm      - communicator whose search is in progress.
// elapsedMs - measured time, in milliseconds, of the configuration most
//             recently handed out by ncclTunerGetConfig().
//
// Does nothing when the tuner is disabled or no search is in progress.
// Otherwise it updates the best result seen so far, advances the search, and
// caches the best configuration once every candidate has been measured.
// Returns ncclSuccess.
ncclResult_t ncclTunerRecordPerformance(struct ncclComm* comm, float elapsedMs);
#endif
......@@ -34,7 +34,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cfloat>
#include "graph/topo.h"
#include "graph/xml.h"
#include "archinfo.h"
......@@ -1966,19 +1965,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
// update communicator state
comm->initState = ncclSuccess;
// Initialize runtime tuner
comm->tuner.enabled = (getenv("RCCL_TUNER_ENABLE") != NULL);
if (comm->tuner.enabled) {
comm->tuner.workloadCache = new std::map<uint64_t, int>();
comm->tuner.isSearching = false;
comm->tuner.searchStep = 0;
comm->tuner.currentWorkloadHash = 0;
comm->tuner.bestTime = FLT_MAX;
INFO(NCCL_INIT, "Runtime tuner enabled for comm %p", comm);
} else {
comm->tuner.workloadCache = NULL;
}
// Trace this call for replay tool
if (job->parent) {
/* unlink child abort flag. */
......
/*************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Runtime tuner for RCCL - Simple brute force search for algorithm and protocol
************************************************************************/
#include "comm.h"
#include "core.h"
#include <cfloat>
// Builds the 64-bit cache key that identifies a workload.
//
// The three fields are packed into disjoint bit ranges:
//   bits 63..16  element count
//   bits 15..8   collective type
//   bits  7..0   datatype
// so two workloads map to the same key only if all three fields are equal
// (for counts below 2^48 and enum values below 256; higher count bits are
// dropped).
//
// The previous shift/OR/XOR scheme XORed `count` directly over the bits that
// held `coll`, so different workloads could share a key (for example
// coll=0,count=5 and coll=1,count=4). Because the key is used without any
// further verification, such workloads would have shared one tuning result.
static uint64_t hashWorkload(ncclFunc_t coll, size_t count, ncclDataType_t datatype) {
  const uint64_t countBits = static_cast<uint64_t>(count) << 16;
  const uint64_t collBits = (static_cast<uint64_t>(coll) & 0xffULL) << 8;
  const uint64_t typeBits = static_cast<uint64_t>(datatype) & 0xffULL;
  return countBits | collBits | typeBits;
}
// Translates a search step into the (algorithm, protocol) pair to try.
// Steps 0..3 enumerate, in order: Ring+Simple, Ring+LL, Tree+Simple, Tree+LL.
// The caller must keep `step` within [0, 4); no bounds check is done here.
static void getTestConfig(int step, int* algo, int* proto) {
  const int candidateAlgos[] = {NCCL_ALGO_RING, NCCL_ALGO_TREE};
  const int candidateProtos[] = {NCCL_PROTO_SIMPLE, NCCL_PROTO_LL};

  // The protocol changes on every step, the algorithm on every second step.
  const int algoSlot = step / 2;
  const int protoSlot = step % 2;

  *algo = candidateAlgos[algoSlot];
  *proto = candidateProtos[protoSlot];
}
// Selects the algorithm/protocol pair for the collective described by `info`.
//
// - Tuner disabled: only *needsTuning is written (false).
// - Workload already tuned: the cached pair is returned, *needsTuning=false.
// - Otherwise a search is started (or continued) and the next candidate pair
//   is returned with *needsTuning=true, so that the caller times it and
//   reports the result through ncclTunerRecordPerformance().
// Always returns ncclSuccess.
ncclResult_t ncclTunerGetConfig(struct ncclComm* comm, struct ncclInfo* info,
                                int* algo, int* proto, bool* needsTuning) {
  auto& state = comm->tuner;

  if (!state.enabled) {
    *needsTuning = false;
    return ncclSuccess;
  }

  // Key identifying this (collective, count, datatype) combination.
  const uint64_t key = hashWorkload(info->coll, info->count, info->datatype);

  // Fast path: this workload has been tuned before.
  const auto cached = state.workloadCache->find(key);
  if (cached != state.workloadCache->end()) {
    // The cache stores algo * 10 + proto.
    const int packed = cached->second;
    *algo = packed / 10;
    *proto = packed % 10;
    *needsTuning = false;
    INFO(NCCL_TUNER, "Rank %d: Using cached config for workload %llx: algo=%d proto=%d",
         comm->rank, (unsigned long long)key, *algo, *proto);
    return ncclSuccess;
  }

  // Begin a fresh search unless one is already running for this same workload.
  const bool continuing = state.isSearching && state.currentWorkloadHash == key;
  if (!continuing) {
    state.bestTime = FLT_MAX;
    state.searchStep = 0;
    state.currentWorkloadHash = key;
    state.isSearching = true;
    INFO(NCCL_INIT, "Rank %d: New workload %llx detected, starting tuning (coll=%d count=%zu dtype=%d)",
         comm->rank, (unsigned long long)key, info->coll, info->count, info->datatype);
  }

  const int totalConfigs = 4;  // 2 algos x 2 protos

  // Every candidate has been measured: hand back the best one found.
  if (state.searchStep >= totalConfigs) {
    *algo = state.bestAlgo;
    *proto = state.bestProto;
    *needsTuning = false;
    return ncclSuccess;
  }

  // Hand out the next candidate and remember it for the upcoming measurement.
  getTestConfig(state.searchStep, algo, proto);
  state.currentAlgo = *algo;
  state.currentProto = *proto;
  *needsTuning = true;
  INFO(NCCL_TUNER, "Rank %d: Testing config %d/%d: algo=%d proto=%d",
       comm->rank, state.searchStep + 1, totalConfigs, *algo, *proto);
  return ncclSuccess;
}
// Records the measured time of the configuration currently under test.
//
// Does nothing unless the tuner is enabled and a search is in progress.
// Otherwise it keeps the fastest configuration seen so far and moves on to
// the next search step. Once all candidates have been measured, the winner is
// stored in the workload cache (encoded as algo * 10 + proto) and the search
// ends.
// Always returns ncclSuccess.
ncclResult_t ncclTunerRecordPerformance(struct ncclComm* comm, float elapsedMs) {
  auto& state = comm->tuner;

  const bool searchActive = state.enabled && state.isSearching;
  if (!searchActive) return ncclSuccess;

  INFO(NCCL_TUNER, "Rank %d: Config %d (algo=%d proto=%d) time: %.3f ms",
       comm->rank, state.searchStep, state.currentAlgo, state.currentProto, elapsedMs);

  // Strictly faster than anything seen so far: remember this configuration.
  if (elapsedMs < state.bestTime) {
    state.bestProto = state.currentProto;
    state.bestAlgo = state.currentAlgo;
    state.bestTime = elapsedMs;
  }

  const int totalConfigs = 4;
  state.searchStep += 1;

  // More candidates remain to be measured.
  if (state.searchStep < totalConfigs) return ncclSuccess;

  // All candidates measured: publish the winner and close the search.
  INFO(NCCL_INIT, "Rank %d: Tuning complete for workload %llx! Best: algo=%d proto=%d time=%.3f ms",
       comm->rank, (unsigned long long)state.currentWorkloadHash,
       state.bestAlgo, state.bestProto, state.bestTime);

  const int packedBest = state.bestAlgo * 10 + state.bestProto;
  (*state.workloadCache)[state.currentWorkloadHash] = packedBest;
  state.isSearching = false;
  return ncclSuccess;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment