one / TransferBench · Commits

Commit 0b7b979e (unverified)
Minor changes to p2p, a2a (#55)

Authored Sep 28, 2023 by gilbertlee-amd; committed via GitHub on Sep 28, 2023
Parent: a9cb3a25

Showing 5 changed files, with 148 additions and 29 deletions (+148 / -29)
CHANGELOG.md              +12   -0
src/Makefile               +6   -6
src/TransferBench.cpp    +102  -18
src/include/EnvVars.hpp   +27   -4
src/include/Kernels.hpp    +1   -1
CHANGELOG.md (all 12 changed lines are additions):

```markdown
# Changelog for TransferBench

## v1.28
### Added
- Added A2A_DIRECT, which restricts the all-to-all benchmark to directly connected GPUs only (now on by default)
- Added average statistics for the p2p and a2a benchmarks
- Added USE_FINE_GRAIN for the p2p benchmark
  - On older devices, p2p transfers to the default coarse-grained device memory stop timing as soon as the request is sent to the data fabric, not when the data actually arrives remotely, which can artificially inflate bandwidth numbers, especially when sending small amounts of data
### Modified
- Modified p2p output to help distinguish between CPU / GPU devices
### Fixed
- Fixed Makefile targets to prevent unnecessary recompilation

## v1.27
### Added
- Added cmdline preset to allow specifying simple tests on the command line
...
```
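For context on the USE_FINE_GRAIN entry: under HIP, fine- versus coarse-grained device memory is chosen at allocation time. The sketch below is illustrative only — this commit does not show TransferBench's allocation path, and the helper name is hypothetical — but `hipExtMallocWithFlags` with `hipDeviceMallocFinegrained` is the standard ROCm way to request fine-grained device memory, whose remote writes stay coherent and therefore time the actual arrival of data rather than the handoff to the data fabric.

```cpp
#include <hip/hip_runtime.h>  // hipExtMallocWithFlags, hipDeviceMallocFinegrained (ROCm)

// Hypothetical helper, not from this commit: allocate device memory with the
// requested coherence granularity.
float* AllocDeviceBuffer(size_t numBytes, bool useFineGrain)
{
  void* ptr = nullptr;
  unsigned int const flags = useFineGrain ? hipDeviceMallocFinegrained
                                          : hipDeviceMallocDefault;
  if (hipExtMallocWithFlags(&ptr, numBytes, flags) != hipSuccess)
    return nullptr;  // allocation failed
  return static_cast<float*>(ptr);
}
```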
src/Makefile

```diff
@@ -7,9 +7,9 @@ NVCC=$(CUDA_PATH)/bin/nvcc
 # Compile TransferBenchCuda if nvcc detected
 ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
-EXE = TransferBenchCuda
+EXE = ../TransferBenchCuda
 else
-EXE = TransferBench
+EXE = ../TransferBench
 endif

 CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
```

```diff
@@ -17,11 +17,11 @@ NVFLAGS = -O3 -g -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gen
 LDFLAGS += -lpthread

 all: $(EXE)

-TransferBench: TransferBench.cpp $(shell find -regex ".*\.hpp")
-	$(HIPCC) $(CXXFLAGS) $< -o ../$@ $(LDFLAGS)
+../TransferBench: TransferBench.cpp $(shell find -regex ".*\.hpp")
+	$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)

-TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.hpp")
-	$(NVCC) $(NVFLAGS) $< -o ../$@ $(LDFLAGS)
+../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.hpp")
+	$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)

 clean:
 	rm -f *.o ../TransferBench ../TransferBenchCuda
```
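Why this fixes the spurious recompilation the changelog mentions: the old rules named their targets `TransferBench`/`TransferBenchCuda` but wrote the output to `../$@`, so make never found an up-to-date file matching the target name and re-ran the recipe on every invocation. Naming the targets `../TransferBench`/`../TransferBenchCuda` (and linking with `-o $@`) lets make's timestamp check compare against the binary that is actually produced.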
src/TransferBench.cpp

```diff
@@ -1457,6 +1457,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       printf("%7s %02d", "CPU", i);
       if (ev.outputToCsv) printf(",");
     }
+    if (numCpus > 0) printf(" ");
     for (int i = 0; i < numGpus; i++)
     {
       printf("%7s %02d", "GPU", i);
```

```diff
@@ -1464,30 +1465,38 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
   }
   printf("\n");

+  double avgBwSum[2][2] = {};
+  int    avgCount[2][2] = {};
+
   ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;

   // Loop over all possible src/dst pairs
   for (int src = 0; src < numDevices; src++)
   {
     MemType const srcType  = (src < numCpus ? MEM_CPU : MEM_GPU);
     int     const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
+    MemType const srcTypeActual = ((ev.useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
+                                   (ev.useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
+                                   srcType);

     std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
     std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
     std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
     std::vector<std::vector<double>> stdDev(isBidirectional + 1);
     if (src == numCpus && src != 0) printf("\n");
     for (int dst = 0; dst < numDevices; dst++)
     {
       MemType const dstType  = (dst < numCpus ? MEM_CPU : MEM_GPU);
       int     const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
+      MemType const dstTypeActual = ((ev.useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
+                                     (ev.useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
+                                     dstType);

       // Prepare Transfers
       std::vector<Transfer> transfers(isBidirectional + 1);

       // SRC -> DST
       transfers[0].numBytes = N * sizeof(float);
-      transfers[0].srcType.push_back(srcType);
-      transfers[0].dstType.push_back(dstType);
+      transfers[0].srcType.push_back(srcTypeActual);
+      transfers[0].dstType.push_back(dstTypeActual);
       transfers[0].srcIndex.push_back(srcIndex);
       transfers[0].dstIndex.push_back(dstIndex);
       transfers[0].numSrcs = transfers[0].numDsts = 1;
```
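The nested ternaries above fold USE_FINE_GRAIN into the memory type before the Transfer is built. The same mapping as a standalone helper — a sketch only; the function name is hypothetical and the MemType enumerators are taken from the diff:

```cpp
// Hypothetical equivalent of the srcTypeActual / dstTypeActual ternaries above.
MemType ApplyFineGrain(MemType type, bool useFineGrain)
{
  if (!useFineGrain) return type;            // coarse-grained: unchanged
  if (type == MEM_CPU) return MEM_CPU_FINE;  // promote to fine-grained CPU memory
  if (type == MEM_GPU) return MEM_GPU_FINE;  // promote to fine-grained GPU memory
  return type;                               // other types pass through
}
```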
```diff
@@ -1500,8 +1509,8 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       {
         transfers[1].numBytes = N * sizeof(float);
         transfers[1].numSrcs  = transfers[1].numDsts = 1;
-        transfers[1].srcType.push_back(dstType);
-        transfers[1].dstType.push_back(srcType);
+        transfers[1].srcType.push_back(dstTypeActual);
+        transfers[1].dstType.push_back(srcTypeActual);
         transfers[1].srcIndex.push_back(dstIndex);
         transfers[1].dstIndex.push_back(srcIndex);
         transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
```

```diff
@@ -1542,6 +1551,12 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
         double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
         avgBandwidth[dir].push_back(avgBw);
+        if (!(srcType == dstType && srcIndex == dstIndex))
+        {
+          avgBwSum[srcType][dstType] += avgBw;
+          avgCount[srcType][dstType]++;
+        }
+
         if (ev.showIterations)
         {
           double minTime = transfers[dir].perIterationTime[0];
```
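The new `avgBwSum`/`avgCount` pair accumulates bandwidth per (source class, destination class) so the summary can report CPU->CPU, CPU->GPU, GPU->CPU, and GPU->GPU averages, skipping self-copies that would dilute the peer-to-peer numbers. A compressed sketch of that bookkeeping, assuming `MEM_CPU` and `MEM_GPU` enumerate to 0 and 1 as the `[2][2]` bounds imply:

```cpp
// Sketch of the running-average bookkeeping added in this hunk.
// Assumes MEM_CPU == 0 and MEM_GPU == 1, as the [2][2] array bounds imply.
double avgBwSum[2][2] = {};
int    avgCount[2][2] = {};

void Record(int srcType, int srcIndex, int dstType, int dstIndex, double avgBw)
{
  // A device copying to itself measures local bandwidth, not peer-to-peer
  // bandwidth, so it is excluded from the averages.
  if (srcType == dstType && srcIndex == dstIndex) return;
  avgBwSum[srcType][dstType] += avgBw;
  avgCount[srcType][dstType]++;
}

double Average(int srcType, int dstType)
{
  int const n = avgCount[srcType][dstType];
  return n ? avgBwSum[srcType][dstType] / n : 0.0;  // no samples => caller prints "N/A"
}
```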
```diff
@@ -1583,6 +1598,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       for (int dst = 0; dst < numDevices; dst++)
       {
+        if (dst == numCpus && dst != 0) printf(" ");
         double const avgBw = avgBandwidth[dir][dst];
         if (avgBw == 0.0)
```

```diff
@@ -1601,6 +1617,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       for (int i = 0; i < numDevices; i++)
       {
         double const minBw = minBandwidth[dir][i];
+        if (i == numCpus && i != 0) printf(" ");
         if (minBw == 0.0) printf("%10s", "N/A");
         else
```

```diff
@@ -1615,6 +1632,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       for (int i = 0; i < numDevices; i++)
       {
         double const maxBw = maxBandwidth[dir][i];
+        if (i == numCpus && i != 0) printf(" ");
         if (maxBw == 0.0) printf("%10s", "N/A");
         else
```

```diff
@@ -1629,6 +1647,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       for (int i = 0; i < numDevices; i++)
       {
         double const sd = stdDev[dir][i];
+        if (i == numCpus && i != 0) printf(" ");
         if (sd == -1.0) printf("%10s", "N/A");
         else
```

```diff
@@ -1647,16 +1666,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
         for (int dst = 0; dst < numDevices; dst++)
         {
           double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
+          if (dst == numCpus && dst != 0) printf(" ");
           if (sumBw == 0.0) printf("%10s", "N/A");
           else              printf("%10.2f", sumBw);
           if (ev.outputToCsv) printf(",");
         }
-        if (src < numDevices - 1) printf("\n\n");
+        printf("\n");
+        if (src < numDevices - 1) printf("\n");
       }
     }

+    printf("\n");
+    if (!ev.outputToCsv)
+    {
+      printf(" ");
+      for (int srcType : {MEM_CPU, MEM_GPU})
+        for (int dstType : {MEM_CPU, MEM_GPU})
+          printf("  %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G',
+                                 dstType == MEM_CPU ? 'C' : 'G');
+      printf("\n");
+      printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
+      for (int srcType : {MEM_CPU, MEM_GPU})
+        for (int dstType : {MEM_CPU, MEM_GPU})
+        {
+          if (avgCount[srcType][dstType])
+            printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
+          else
+            printf("%10s", "N/A");
+        }
+      printf("\n\n");
+    }
   }
 }
```
```diff
@@ -1732,7 +1772,7 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
 void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
 {
-  ev.DisplayEnvVars();
+  ev.DisplayA2AEnvVars();

   // Collect the number of GPU devices to use
   int const numGpus = ev.numGpuDevices;
```

```diff
@@ -1763,14 +1803,27 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
     for (int j = 0; j < numGpus; j++)
     {
       transfer.dstIndex[0] = j;
+      if (ev.a2aDirect)
+      {
+#if !defined(__NVCC__)
+        if (i == j) continue;
+        uint32_t linkType, hopCount;
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false), RemappedIndex(j, false),
+                                              &linkType, &hopCount));
+        if (hopCount != 1) continue;
+#endif
+      }
       transfers.push_back(transfer);
     }
   }

   printf("GPU-GFX All-To-All benchmark:\n");
   printf("==========================\n");
-  printf("- Copying %lu bytes between every pair of GPUs using %d CUs\n", numBytesPerTransfer, numSubExecs);
+  printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
+         numBytesPerTransfer, ev.a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
   printf("- All numbers reported as GB/sec\n\n");
+  if (transfers.size() == 0) return;

   double totalBandwidthCpu = 0;
   ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, true, &totalBandwidthCpu);
```
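`hipExtGetLinkTypeAndHopCount` (AMD HIP only, hence the `!__NVCC__` guard) reports the link type and hop count between two device IDs; a hop count of 1 means the GPUs are directly connected, which is exactly what A2A_DIRECT filters on. A standalone probe built on the same call — error handling simplified, and TransferBench's `RemappedIndex` translation omitted:

```cpp
#include <hip/hip_runtime.h>

// Returns true if two GPUs are one hop apart (directly connected).
bool IsDirectlyConnected(int devA, int devB)
{
  if (devA == devB) return false;  // self-pairs are skipped, not "direct"
  uint32_t linkType = 0, hopCount = 0;
  if (hipExtGetLinkTypeAndHopCount(devA, devB, &linkType, &hopCount) != hipSuccess)
    return false;
  return hopCount == 1;
}
```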
```diff
@@ -1780,21 +1833,52 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
   printf("SRC\\DST");
   for (int dst = 0; dst < numGpus; dst++)
     printf("%cGPU %02d ", separator, dst);
-  printf("\n");
+  printf(" %cSTotal\n", separator);

+  std::map<std::pair<int, int>, int> reIndex;
+  for (int i = 0; i < transfers.size(); i++)
+  {
+    Transfer const& t = transfers[i];
+    reIndex[std::make_pair(t.srcIndex[0], t.dstIndex[0])] = i;
+  }
+
   double totalBandwidthGpu = 0.0;
+  std::vector<double> colTotalBandwidth(numGpus + 1, 0.0);
   for (int src = 0; src < numGpus; src++)
   {
+    double rowTotalBandwidth = 0;
     printf("GPU %02d", src);
     for (int dst = 0; dst < numGpus; dst++)
     {
-      Transfer const& transfer = transfers[src * numGpus + dst];
-      double transferDurationMsec = transfer.transferTime / (1.0 * ev.numIterations);
-      double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
-      printf("%c%7.2f ", separator, transferBandwidthGbs);
+      if (reIndex.count(std::make_pair(src, dst)))
+      {
+        Transfer const& transfer = transfers[reIndex[std::make_pair(src, dst)]];
+        double transferDurationMsec = transfer.transferTime / (1.0 * ev.numIterations);
+        double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
+        colTotalBandwidth[dst] += transferBandwidthGbs;
+        rowTotalBandwidth += transferBandwidthGbs;
+        totalBandwidthGpu += transferBandwidthGbs;
+        printf("%c%7.2f ", separator, transferBandwidthGbs);
+      }
+      else
+      {
+        printf("%c%7s ", separator, "N/A");
+      }
     }
-    printf("\n");
+    printf(" %c%7.2f\n", separator, rowTotalBandwidth);
+    colTotalBandwidth[numGpus] += rowTotalBandwidth;
   }
-  printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu);
+  printf("\nRTotal");
+  for (int dst = 0; dst < numGpus; dst++)
+  {
+    printf("%c%7.2f ", separator, colTotalBandwidth[dst]);
+  }
+  printf(" %c%7.2f\n", separator, colTotalBandwidth[numGpus]);
+  printf("\n");
+  printf("Average bandwidth (GPU Timed):   %7.2f GB/s\n", totalBandwidthGpu / transfers.size());
+  printf("Aggregate bandwidth (GPU Timed): %7.2f GB/s\n", totalBandwidthGpu);
+  printf("Aggregate bandwidth (CPU Timed): %7.2f GB/s\n", totalBandwidthCpu);
 }

 void Transfer::PrepareSubExecParams(EnvVars const& ev)
```
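Because A2A_DIRECT can drop pairs from the transfer list, the old dense indexing `transfers[src * numGpus + dst]` no longer holds; the `reIndex` map recovers each surviving (src, dst) pair's position, and pairs absent from the map print as "N/A". The pattern in miniature — a sketch where `PairList` stands in for TransferBench's `std::vector<Transfer>`:

```cpp
#include <map>
#include <utility>
#include <vector>

using Pair = std::pair<int, int>;    // (srcIndex, dstIndex)
using PairList = std::vector<Pair>;  // stand-in for std::vector<Transfer>

std::map<Pair, int> BuildReIndex(PairList const& transfers)
{
  std::map<Pair, int> reIndex;
  for (int i = 0; i < (int)transfers.size(); i++)
    reIndex[transfers[i]] = i;       // last writer wins on duplicate pairs
  return reIndex;
}

// Usage mirrors the diff above:
//   if (reIndex.count({src, dst})) { /* use transfers[reIndex[{src, dst}]] */ }
//   else                           { /* print "N/A" */ }
```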
src/include/EnvVars.hpp

```diff
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"

-#define TB_VERSION "1.27"
+#define TB_VERSION "1.28"

 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -92,6 +92,7 @@ public:
   int p2pMode;        // Both = 0, Unidirectional = 1, Bidirectional = 2
   int useDmaCopy;     // Use DMA copy instead of GPU copy
   int useRemoteRead;  // Use destination memory type as executor instead of source memory type
+  int useFineGrain;   // Use fine-grained memory

   // Environment variables only for Sweep-preset
   int sweepMin;       // Min number of simultaneous Transfers to be executed per test
@@ -106,6 +107,9 @@ public:
   std::string sweepExe; // Set of executors to be swept
   std::string sweepDst; // Set of dst memory types to be swept

+  // Environment variables only for A2A preset
+  int a2aDirect;        // Only execute on links that are directly connected
+
   // Developer features
   int enableDebug;      // Enable debug output
   int gpuKernel;        // Which GPU kernel to use
@@ -170,11 +174,13 @@ public:
     gpuKernel = GetEnvVar("GPU_KERNEL", defaultGpuKernel);

     // P2P Benchmark related
-    useRemoteRead  = GetEnvVar("USE_REMOTE_READ", 0);
-    useDmaCopy     = GetEnvVar("USE_GPU_DMA", 0);
-    numGpuSubExecs = GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : numDeviceCUs);
+    useDmaCopy     = GetEnvVar("USE_GPU_DMA", 0); // Needed for numGpuSubExecs
     numCpuSubExecs = GetEnvVar("NUM_CPU_SE", DEFAULT_P2P_NUM_CPU_SE);
+    numGpuSubExecs = GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : numDeviceCUs);
     p2pMode        = GetEnvVar("P2P_MODE", 0);
+    useRemoteRead  = GetEnvVar("USE_REMOTE_READ", 0);
+    useFineGrain   = GetEnvVar("USE_FINE_GRAIN", 0);

     // Sweep related
     sweepMin = GetEnvVar("SWEEP_MIN", DEFAULT_SWEEP_MIN);
@@ -188,6 +194,9 @@ public:
     sweepXgmiMax   = GetEnvVar("SWEEP_XGMI_MAX", -1);
     sweepRandBytes = GetEnvVar("SWEEP_RAND_BYTES", 0);

+    // A2A Benchmark related
+    a2aDirect = GetEnvVar("A2A_DIRECT", 1);
+
     // Determine random seed
     char* sweepSeedStr = getenv("SWEEP_SEED");
     sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
@@ -517,6 +526,9 @@ public:
              std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
                                         p2pMode == 2 ? "Bidirectional" :
                                                        "Unidirectional + Bidirectional"));
+    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
+             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
     PRINT_EV("USE_GPU_DMA", useDmaCopy,
              std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
     PRINT_EV("USE_REMOTE_READ", useRemoteRead,
@@ -557,6 +569,17 @@ public:
     printf("\n");
   }

+  void DisplayA2AEnvVars() const
+  {
+    DisplayEnvVars();
+    if (hideEnv) return;
+    if (!outputToCsv)
+      printf("[AllToAll Related]\n");
+    PRINT_EV("A2A_DIRECT", a2aDirect,
+             std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
+    printf("\n");
+  }
+
   // Helper function that parses an environment variable or returns a default value
   static int GetEnvVar(std::string const& varname, int defaultValue)
   {
```
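`GetEnvVar`'s body is collapsed out of this view. A plausible reconstruction consistent with its signature and the call sites above — an assumption, not the file's actual code:

```cpp
#include <cstdlib>  // getenv, atoi
#include <string>

// Plausible sketch only: read an integer environment variable, falling back
// to the given default when the variable is unset.
static int GetEnvVar(std::string const& varname, int defaultValue)
{
  char const* value = getenv(varname.c_str());
  return value ? atoi(value) : defaultValue;
}
```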
src/include/Kernels.hpp

```diff
@@ -219,8 +219,8 @@ GpuReduceKernel(SubExecParam* params)
   __syncthreads();
   if (threadIdx.x == 0)
   {
-    p.startCycle = startCycle;
     p.stopCycle  = wall_clock64();
+    p.startCycle = startCycle;
     __trace_hwreg();
   }
 }
```
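`wall_clock64()` is a HIP device intrinsic that returns a constant-rate wall-clock counter, suitable for timing on-GPU work. A minimal sketch of the bracketing pattern used above — `SubExecParam` and `__trace_hwreg()` are TransferBench-specific, so the sketch writes through plain pointers instead:

```cpp
#include <hip/hip_runtime.h>

// Minimal sketch of device-side timing with wall_clock64(): every thread reads
// the start stamp, and after a barrier thread 0 records the stop stamp.
__global__ void TimedKernel(long long* startCycle, long long* stopCycle)
{
  long long const start = wall_clock64();
  // ... the work being timed goes here ...
  __syncthreads();
  if (threadIdx.x == 0)
  {
    *stopCycle  = wall_clock64();
    *startCycle = start;
  }
}
```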