Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wh1225
rccl-auto-tuning
Commits
08b50ba4
Commit
08b50ba4
authored
Apr 02, 2026
by
wanghan
Browse files
Update RCCL source code and apply tuning optimizations
parent
f351fea1
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
63 additions
and
214 deletions
+63
-214
rccl-dtk-25.04/rtest.xml
rccl-dtk-25.04/rtest.xml
+11
-0
rccl-dtk-25.04/src/Makefile
rccl-dtk-25.04/src/Makefile
+1
-1
rccl-dtk-25.04/src/enqueue.cc
rccl-dtk-25.04/src/enqueue.cc
+1
-31
rccl-dtk-25.04/src/graph/tuning.cc
rccl-dtk-25.04/src/graph/tuning.cc
+50
-0
rccl-dtk-25.04/src/include/comm.h
rccl-dtk-25.04/src/include/comm.h
+0
-22
rccl-dtk-25.04/src/include/tuner.h
rccl-dtk-25.04/src/include/tuner.h
+0
-20
rccl-dtk-25.04/src/init.cc
rccl-dtk-25.04/src/init.cc
+0
-14
rccl-dtk-25.04/src/tuner.cc
rccl-dtk-25.04/src/tuner.cc
+0
-126
No files found.
rccl-dtk-25.04/rtest.xml
0 → 100644
View file @
08b50ba4
<?xml version="1.0" encoding="UTF-8"?>
<testset>
<!-- Typically run with environment variables NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 -->
<var
name=
"GTEST_FILTER"
>
rccl-UnitTests --gtest_color=yes --gtest_filter=
</var>
<test
sets=
"psdb"
>
<run
name=
"all-psdb"
>
{GTEST_FILTER}*sum_float32* --gtest_output=xml:output_psdb.xml
</run>
</test>
<test
sets=
"osdb"
>
<run
name=
"all-osdb"
>
{GTEST_FILTER}* --gtest_output=xml:output_osdb.xml
</run>
</test>
</testset>
rccl-dtk-25.04/src/Makefile
View file @
08b50ba4
...
@@ -9,7 +9,7 @@ include ../makefiles/version.mk
...
@@ -9,7 +9,7 @@ include ../makefiles/version.mk
##### src files
##### src files
INCEXPORTS
:=
nccl.h nccl_net.h
INCEXPORTS
:=
nccl.h nccl_net.h
LIBSRCFILES
:=
init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc
tuner.cc
\
LIBSRCFILES
:=
init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc
\
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc
\
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc
\
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc
\
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc
\
misc/ipcsocket.cc
\
misc/ipcsocket.cc
\
...
...
rccl-dtk-25.04/src/enqueue.cc
View file @
08b50ba4
...
@@ -9,7 +9,6 @@
...
@@ -9,7 +9,6 @@
#include "argcheck.h"
#include "argcheck.h"
#include "coll_net.h"
#include "coll_net.h"
#include "graph/topo.h"
#include "graph/topo.h"
#include "tuner.h"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include <hip/hip_ext.h>
#include <dlfcn.h>
#include <dlfcn.h>
...
@@ -22,7 +21,6 @@
...
@@ -22,7 +21,6 @@
#include "hsa_extra.h"
#include "hsa_extra.h"
#include <cstring> // std::memcpy
#include <cstring> // std::memcpy
#include <cinttypes> // PRIx64
#include <cinttypes> // PRIx64
#include <chrono>
#if defined(ENABLE_NPKIT)
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#include "npkit/npkit.h"
#endif
#endif
...
@@ -1753,35 +1751,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
...
@@ -1753,35 +1751,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
info
->
comm
->
localRankToRank
[
info
->
comm
->
localRank
]);
info
->
comm
->
localRankToRank
[
info
->
comm
->
localRank
]);
TRACE_CALL
(
"nccl%s(%"
PRIx64
",%"
PRIx64
",%zi,%d,%d,%d,%p,%p)"
,
info
->
opName
,
reinterpret_cast
<
int64_t
>
(
info
->
sendbuff
),
reinterpret_cast
<
int64_t
>
(
info
->
recvbuff
),
info
->
count
,
info
->
datatype
,
info
->
op
,
info
->
root
,
info
->
comm
,
info
->
stream
);
TRACE_CALL
(
"nccl%s(%"
PRIx64
",%"
PRIx64
",%zi,%d,%d,%d,%p,%p)"
,
info
->
opName
,
reinterpret_cast
<
int64_t
>
(
info
->
sendbuff
),
reinterpret_cast
<
int64_t
>
(
info
->
recvbuff
),
info
->
count
,
info
->
datatype
,
info
->
op
,
info
->
root
,
info
->
comm
,
info
->
stream
);
// Runtime tuner: get configuration for this workload
NCCLCHECKGOTO
(
taskAppend
(
info
->
comm
,
info
),
ret
,
fail
);
if
(
info
->
comm
->
tuner
.
enabled
&&
info
->
coll
!=
ncclFuncSend
&&
info
->
coll
!=
ncclFuncRecv
)
{
int
tunerAlgo
,
tunerProto
;
bool
needsTuning
=
false
;
auto
startTime
=
std
::
chrono
::
high_resolution_clock
::
now
();
NCCLCHECKGOTO
(
ncclTunerGetConfig
(
info
->
comm
,
(
struct
ncclInfo
*
)
info
,
&
tunerAlgo
,
&
tunerProto
,
&
needsTuning
),
ret
,
fail
);
if
(
needsTuning
)
{
// Override algorithm and protocol for testing
// Note: This is a simplified approach - in production you'd need to modify
// the algorithm selection logic in computeColl
INFO
(
NCCL_TUNER
,
"Rank %d: Will test algo=%d proto=%d"
,
info
->
comm
->
rank
,
tunerAlgo
,
tunerProto
);
}
NCCLCHECKGOTO
(
taskAppend
(
info
->
comm
,
info
),
ret
,
fail
);
// If tuning, measure performance
if
(
needsTuning
)
{
// Synchronize to measure time
CUDACHECKGOTO
(
hipStreamSynchronize
(
info
->
stream
),
ret
,
fail
);
auto
endTime
=
std
::
chrono
::
high_resolution_clock
::
now
();
float
elapsed
=
std
::
chrono
::
duration
<
float
,
std
::
milli
>
(
endTime
-
startTime
).
count
();
NCCLCHECKGOTO
(
ncclTunerRecordPerformance
(
info
->
comm
,
elapsed
),
ret
,
fail
);
}
}
else
{
NCCLCHECKGOTO
(
taskAppend
(
info
->
comm
,
info
),
ret
,
fail
);
}
exit:
exit:
if
(
devOld
!=
-
1
)
CUDACHECK
(
cudaSetDevice
(
devOld
));
if
(
devOld
!=
-
1
)
CUDACHECK
(
cudaSetDevice
(
devOld
));
...
...
rccl-dtk-25.04/src/graph/tuning.cc
View file @
08b50ba4
...
@@ -554,6 +554,56 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
...
@@ -554,6 +554,56 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
ncclResult_t
ncclTopoGetAlgoTime
(
struct
ncclInfo
*
info
,
int
algorithm
,
int
protocol
,
int
numPipeOps
,
float
*
time
)
{
ncclResult_t
ncclTopoGetAlgoTime
(
struct
ncclInfo
*
info
,
int
algorithm
,
int
protocol
,
int
numPipeOps
,
float
*
time
)
{
float
bw
=
info
->
comm
->
bandwidths
[
info
->
coll
][
algorithm
][
protocol
];
float
bw
=
info
->
comm
->
bandwidths
[
info
->
coll
][
algorithm
][
protocol
];
float
lat
=
info
->
comm
->
latencies
[
info
->
coll
][
algorithm
][
protocol
];
float
lat
=
info
->
comm
->
latencies
[
info
->
coll
][
algorithm
][
protocol
];
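// Automatically adjust the estimated execution time based on the performance data in qz.txt
// Supports optimizations for multiple nRanks configurations
// Adjustment strategy:
// - diff≥8%: size_range = [original×0.5, original×2.0]
// - diff<8%: size_range = [original×0.75, original×1.5]
// Optimized configuration: nRanks == 8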
if
(
info
->
comm
->
nRanks
==
8
)
{
// ncclFuncAllReduce NCCL_ALGO_TREE NCCL_PROTO_LL: 8~1024 bytes (originally 16~512)
if
(
info
->
coll
==
ncclFuncAllReduce
&&
algorithm
==
NCCL_ALGO_TREE
&&
protocol
==
NCCL_PROTO_LL
&&
info
->
nBytes
>
8
&&
info
->
nBytes
<=
1024
)
{
*
time
=
0
;
return
ncclSuccess
;
}
// ncclFuncAllReduce NCCL_ALGO_TREE NCCL_PROTO_LL: 1536~196608 bytes (originally 2048~131072)
if
(
info
->
coll
==
ncclFuncAllReduce
&&
algorithm
==
NCCL_ALGO_TREE
&&
protocol
==
NCCL_PROTO_LL
&&
info
->
nBytes
>
1536
&&
info
->
nBytes
<=
196608
)
{
*
time
=
0
;
return
ncclSuccess
;
}
// ncclFuncAllReduce NCCL_ALGO_RING NCCL_PROTO_LL: 262144~4194304 bytes (originally 524288~2097152)
if
(
info
->
coll
==
ncclFuncAllReduce
&&
algorithm
==
NCCL_ALGO_RING
&&
protocol
==
NCCL_PROTO_LL
&&
info
->
nBytes
>
262144
&&
info
->
nBytes
<=
4194304
)
{
*
time
=
0
;
return
ncclSuccess
;
}
// ncclFuncAllReduce NCCL_ALGO_RING NCCL_PROTO_SIMPLE: 2097152~2147483648 bytes (originally 4194304~1073741824)
if
(
info
->
coll
==
ncclFuncAllReduce
&&
algorithm
==
NCCL_ALGO_RING
&&
protocol
==
NCCL_PROTO_SIMPLE
&&
info
->
nBytes
>
2097152
&&
info
->
nBytes
<=
2147483648
)
{
*
time
=
0
;
return
ncclSuccess
;
}
// ncclFuncBroadcast NCCL_ALGO_RING NCCL_PROTO_SIMPLE: 6144~262144 bytes (originally 8192~131072)
if
(
info
->
coll
==
ncclFuncBroadcast
&&
algorithm
==
NCCL_ALGO_RING
&&
protocol
==
NCCL_PROTO_SIMPLE
&&
info
->
nBytes
>
6144
&&
info
->
nBytes
<=
262144
)
{
*
time
=
0
;
return
ncclSuccess
;
}
// ncclFuncReduce NCCL_ALGO_RING NCCL_PROTO_SIMPLE: 6144~262144 bytes (originally 8192~131072)
if
(
info
->
coll
==
ncclFuncReduce
&&
algorithm
==
NCCL_ALGO_RING
&&
protocol
==
NCCL_PROTO_SIMPLE
&&
info
->
nBytes
>
6144
&&
info
->
nBytes
<=
262144
)
{
*
time
=
0
;
return
ncclSuccess
;
}
if
(
info
->
coll
==
ncclFuncAllGather
&&
algorithm
==
NCCL_ALGO_RING
&&
protocol
==
NCCL_PROTO_LL
&&
info
->
nBytes
>
524288
&&
info
->
nBytes
<=
2097152
)
{
*
time
=
0
;
return
ncclSuccess
;
}
if
(
info
->
coll
==
ncclFuncReduceScatter
&&
algorithm
==
NCCL_ALGO_RING
&&
protocol
==
NCCL_PROTO_LL
&&
info
->
nBytes
>
524288
&&
info
->
nBytes
<=
2097152
)
{
*
time
=
0
;
return
ncclSuccess
;
}
}
if
(
bw
==
0
)
{
if
(
bw
==
0
)
{
*
time
=
-
1.0
;
return
ncclSuccess
;
*
time
=
-
1.0
;
return
ncclSuccess
;
}
}
...
...
rccl-dtk-25.04/src/include/comm.h
View file @
08b50ba4
...
@@ -13,8 +13,6 @@
...
@@ -13,8 +13,6 @@
#include "collectives.h"
#include "collectives.h"
#include "proxy.h"
#include "proxy.h"
#include "strongstream.h"
#include "strongstream.h"
#include <map>
#include <chrono>
#if defined (ENABLE_TIMELINE)
#if defined (ENABLE_TIMELINE)
#include "timeline/timeline.h"
#include "timeline/timeline.h"
#endif
#endif
...
@@ -404,26 +402,6 @@ struct ncclComm {
...
@@ -404,26 +402,6 @@ struct ncclComm {
// Whether this comm is compatible with MSCCL
// Whether this comm is compatible with MSCCL
bool
mscclCompatible
;
bool
mscclCompatible
;
// Runtime tuner for algorithm and protocol selection
struct
{
bool
enabled
;
// Whether tuning is enabled
std
::
map
<
uint64_t
,
int
>*
workloadCache
;
// workload hash -> best config index
// Current testing configuration
int
currentAlgo
;
// NCCL_ALGO_RING/TREE/COLLNET
int
currentProto
;
// NCCL_PROTO_SIMPLE/LL/LL128
// Performance tracking
float
bestTime
;
int
bestAlgo
;
int
bestProto
;
// Search state
int
searchStep
;
// Current search step
bool
isSearching
;
// Whether currently searching
uint64_t
currentWorkloadHash
;
// Current workload being tuned
}
tuner
;
};
};
enum
ncclLaunchMode
{
enum
ncclLaunchMode
{
...
...
rccl-dtk-25.04/src/include/tuner.h
deleted
100644 → 0
View file @
f351fea1
/*************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Runtime tuner header
************************************************************************/
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_
#include "nccl.h"
#include "comm.h"
// Get configuration for current workload (either cached or for testing).
//
// comm        - communicator whose tuner state is consulted and updated.
// info        - describes the call; its coll, count and datatype identify the workload.
// algo/proto  - out: algorithm and protocol selected for this call. They are
//               not written when the tuner is disabled for comm.
// needsTuning - out: true when the returned pair is a trial configuration;
//               false when the tuner is disabled, the workload is cached, or
//               the search has no configurations left to try.
//
// The implementation in tuner.cc returns ncclSuccess on every path.
ncclResult_t
ncclTunerGetConfig
(
struct
ncclComm
*
comm
,
struct
ncclInfo
*
info
,
int
*
algo
,
int
*
proto
,
bool
*
needsTuning
);
// Record performance of last executed configuration.
//
// comm      - communicator whose tuner search is being advanced.
// elapsedMs - measured time, in milliseconds, for the configuration most
//             recently handed out by ncclTunerGetConfig.
//
// Does nothing when the tuner is disabled or no search is in progress.
// The implementation in tuner.cc returns ncclSuccess on every path.
ncclResult_t
ncclTunerRecordPerformance
(
struct
ncclComm
*
comm
,
float
elapsedMs
);
#endif
rccl-dtk-25.04/src/init.cc
View file @
08b50ba4
...
@@ -34,7 +34,6 @@
...
@@ -34,7 +34,6 @@
#include <sys/types.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/stat.h>
#include <unistd.h>
#include <unistd.h>
#include <cfloat>
#include "graph/topo.h"
#include "graph/topo.h"
#include "graph/xml.h"
#include "graph/xml.h"
#include "archinfo.h"
#include "archinfo.h"
...
@@ -1966,19 +1965,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
...
@@ -1966,19 +1965,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
// update communicator state
// update communicator state
comm
->
initState
=
ncclSuccess
;
comm
->
initState
=
ncclSuccess
;
// Initialize runtime tuner
comm
->
tuner
.
enabled
=
(
getenv
(
"RCCL_TUNER_ENABLE"
)
!=
NULL
);
if
(
comm
->
tuner
.
enabled
)
{
comm
->
tuner
.
workloadCache
=
new
std
::
map
<
uint64_t
,
int
>
();
comm
->
tuner
.
isSearching
=
false
;
comm
->
tuner
.
searchStep
=
0
;
comm
->
tuner
.
currentWorkloadHash
=
0
;
comm
->
tuner
.
bestTime
=
FLT_MAX
;
INFO
(
NCCL_INIT
,
"Runtime tuner enabled for comm %p"
,
comm
);
}
else
{
comm
->
tuner
.
workloadCache
=
NULL
;
}
// Trace this call for replay tool
// Trace this call for replay tool
if
(
job
->
parent
)
{
if
(
job
->
parent
)
{
/* unlink child abort flag. */
/* unlink child abort flag. */
...
...
rccl-dtk-25.04/src/tuner.cc
deleted
100644 → 0
View file @
f351fea1
/*************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Runtime tuner for RCCL - Simple brute force search for algorithm and protocol
************************************************************************/
#include "comm.h"
#include "core.h"
#include <cfloat>
// Hash function for workload identification.
//
// Folds the collective type, element count and datatype into the 64-bit key
// used to index the tuner's workload cache.
//
// Each field is absorbed through a full 64-bit avalanche step (the splitmix64
// finalizer). A plain shift/XOR fold is not sufficient here: XOR-ing `count`
// onto the bits that hold `coll` makes e.g. (coll=0, count=3) and
// (coll=1, count=2) produce the same key, and shifting afterwards drops the
// top bits of `count`. Because the key is the only identity stored in the
// cache, such a collision would apply one workload's tuned configuration to a
// different workload.
//
// The key is only meaningful within the current process; it is never
// persisted, so the exact values carry no compatibility requirement.
static uint64_t hashWorkload(ncclFunc_t coll, size_t count, ncclDataType_t datatype) {
  // splitmix64 finalizer: a bijection on 64-bit values with good avalanche.
  auto mix = [](uint64_t x) -> uint64_t {
    x ^= x >> 30;
    x *= 0xbf58476d1ce4e5b9ULL;
    x ^= x >> 27;
    x *= 0x94d049bb133111ebULL;
    x ^= x >> 31;
    return x;
  };

  uint64_t hash = 0xdeadbeefULL;
  hash = mix(hash ^ static_cast<uint64_t>(coll));
  hash = mix(hash ^ static_cast<uint64_t>(count));
  hash = mix(hash ^ static_cast<uint64_t>(datatype));
  return hash;
}
// Translate a search step into the (algorithm, protocol) pair to try.
//
// Steps walk the candidates in this order:
//   0: Ring+Simple, 1: Ring+LL, 2: Tree+Simple, 3: Tree+LL
// Both tables hold two entries, so `step` must stay within [0, 3]; no bounds
// check is performed here.
static void getTestConfig(int step, int* algo, int* proto) {
  int candidateAlgos[] = {NCCL_ALGO_RING, NCCL_ALGO_TREE};
  int candidateProtos[] = {NCCL_PROTO_SIMPLE, NCCL_PROTO_LL};

  // The protocol changes on every step; the algorithm changes every second step.
  *algo = candidateAlgos[step / 2];
  *proto = candidateProtos[step % 2];
}
// Choose the algorithm/protocol pair the runtime tuner wants for this call.
//
// Cases, in the order they are evaluated:
//  1. Tuner disabled: only *needsTuning is written (false); *algo and *proto
//     are left as the caller passed them.
//  2. Workload found in the cache: *algo and *proto are decoded from the
//     cached value (stored as algo * 10 + proto); *needsTuning is false.
//  3. Otherwise a search is started (or restarted when the workload differs
//     from the one being searched) and, while steps remain, the next trial
//     pair is returned with *needsTuning set to true.
//  4. No steps remain and nothing is cached: the best pair recorded so far is
//     returned with *needsTuning set to false.
//
// Always returns ncclSuccess.
ncclResult_t ncclTunerGetConfig(struct ncclComm* comm, struct ncclInfo* info, int* algo, int* proto, bool* needsTuning) {
  auto& state = comm->tuner;

  if (!state.enabled) {
    *needsTuning = false;
    return ncclSuccess;
  }

  // Identify the workload by collective, element count and datatype.
  const uint64_t key = hashWorkload(info->coll, info->count, info->datatype);

  // Fast path: this workload has already been tuned.
  auto cached = state.workloadCache->find(key);
  if (cached != state.workloadCache->end()) {
    const int packed = cached->second;
    *algo = packed / 10;
    *proto = packed % 10;
    *needsTuning = false;
    INFO(NCCL_TUNER, "Rank %d: Using cached config for workload %llx: algo=%d proto=%d",
         comm->rank, (unsigned long long)key, *algo, *proto);
    return ncclSuccess;
  }

  // Keep going only if a search for this exact workload is already running;
  // anything else resets the search state.
  const bool resuming = state.isSearching && state.currentWorkloadHash == key;
  if (!resuming) {
    state.isSearching = true;
    state.currentWorkloadHash = key;
    state.searchStep = 0;
    state.bestTime = FLT_MAX;
    INFO(NCCL_INIT, "Rank %d: New workload %llx detected, starting tuning (coll=%d count=%zu dtype=%d)",
         comm->rank, (unsigned long long)key, info->coll, info->count, info->datatype);
  }

  const int kNumConfigs = 4;  // 2 algos x 2 protos

  if (state.searchStep >= kNumConfigs) {
    // Every candidate has been tried: hand back the best one seen.
    *algo = state.bestAlgo;
    *proto = state.bestProto;
    *needsTuning = false;
    return ncclSuccess;
  }

  // Hand out the next candidate and remember it for the performance report.
  getTestConfig(state.searchStep, algo, proto);
  state.currentAlgo = *algo;
  state.currentProto = *proto;
  *needsTuning = true;
  INFO(NCCL_TUNER, "Rank %d: Testing config %d/%d: algo=%d proto=%d",
       comm->rank, state.searchStep + 1, kNumConfigs, *algo, *proto);
  return ncclSuccess;
}
// Feed the measured time of the configuration under test back into the search.
//
// Does nothing unless the tuner is enabled and a search is in progress.
// Otherwise it updates the best-so-far record when `elapsedMs` beats it,
// advances the search step, and, once all four configurations have been timed,
// stores the winner in the workload cache (encoded as algo * 10 + proto) and
// ends the search.
//
// Always returns ncclSuccess.
ncclResult_t ncclTunerRecordPerformance(struct ncclComm* comm, float elapsedMs) {
  auto& state = comm->tuner;

  const bool active = state.enabled && state.isSearching;
  if (!active) {
    return ncclSuccess;
  }

  INFO(NCCL_TUNER, "Rank %d: Config %d (algo=%d proto=%d) time: %.3f ms",
       comm->rank, state.searchStep, state.currentAlgo, state.currentProto, elapsedMs);

  // Strictly faster results replace the current best; ties keep the earlier one.
  if (elapsedMs < state.bestTime) {
    state.bestTime = elapsedMs;
    state.bestAlgo = state.currentAlgo;
    state.bestProto = state.currentProto;
  }

  state.searchStep++;

  const int kNumConfigs = 4;
  if (state.searchStep < kNumConfigs) {
    // More candidates remain to be timed.
    return ncclSuccess;
  }

  // Tuning complete
  INFO(NCCL_INIT, "Rank %d: Tuning complete for workload %llx! Best: algo=%d proto=%d time=%.3f ms",
       comm->rank, (unsigned long long)state.currentWorkloadHash,
       state.bestAlgo, state.bestProto, state.bestTime);

  // Cache the winner so later calls for this workload skip the search.
  const int packed = state.bestAlgo * 10 + state.bestProto;
  (*state.workloadCache)[state.currentWorkloadHash] = packed;
  state.isSearching = false;

  return ncclSuccess;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment