TransferBench commit 9ab74205 (Unverified)

Adding SHOW_ITERATIONS to provide additional per-iteration timing info (#50)

Authored Aug 14, 2023 by gilbertlee-amd; committed by GitHub on Aug 14, 2023
Parent: b0e6ccaf

Showing 4 changed files with 290 additions and 112 deletions (+290, -112)
CHANGELOG.md                   +9    -0
src/TransferBench.cpp          +269  -101
src/include/EnvVars.hpp        +10   -5
src/include/TransferBench.hpp  +2    -6
CHANGELOG.md @ 9ab74205
 # Changelog for TransferBench
+
+## v1.26
+### Added
+- Setting SHOW_ITERATIONS=1 provides additional information about per-iteration timing for file and p2p configs
+  - For file configs, iterations are sorted from min to max bandwidth and displayed with standard deviation
+  - For p2p, min/max/standard deviation is shown for each direction.
+### Changed
+- P2P benchmark formatting changed. Now reports bidirectional bandwidth in each direction (as well as sum) for clarity
+
 ## v1.25
 ### Fixed
 - Fixed bug in P2P bidirectional benchmark using incorrect number of subExecutors for CPU <-> GPU tests
 ...
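The per-iteration reporting described above amounts to sorting iterations by elapsed time and computing a population standard deviation of the per-iteration bandwidth. The following standalone C++ sketch (not TransferBench code; the transfer size and timings are made-up illustrative values) shows the same arithmetic that the new ExecuteTransfers block in src/TransferBench.cpp below performs:

  // Minimal sketch: sort iterations by time and report per-iteration bandwidth
  // plus its population standard deviation, as SHOW_ITERATIONS=1 does.
  #include <cmath>
  #include <cstdio>
  #include <set>
  #include <utility>
  #include <vector>

  int main()
  {
    double const numBytes = 64.0 * 1024 * 1024;                               // illustrative transfer size (64 MiB)
    std::vector<double> perIterationTimeMs = {2.10, 2.05, 2.40, 2.08, 2.06};  // illustrative timings

    // Average bandwidth over all iterations
    double avgTimeMs = 0;
    for (double t : perIterationTimeMs) avgTimeMs += t;
    avgTimeMs /= perIterationTimeMs.size();
    double const avgBwGbs = (numBytes / 1.0E9) / avgTimeMs * 1000.0;

    // Sort iterations from fastest to slowest; accumulate variance of per-iteration bandwidth
    std::set<std::pair<double, int>> times;   // (time in ms, 1-based iteration index)
    double varSum = 0;
    for (size_t i = 0; i < perIterationTimeMs.size(); ++i)
    {
      times.insert(std::make_pair(perIterationTimeMs[i], (int)i + 1));
      double const bw = (numBytes / 1.0E9) / perIterationTimeMs[i] * 1000.0;
      varSum += (bw - avgBwGbs) * (bw - avgBwGbs);
    }
    double const stdDevBw = std::sqrt(varSum / perIterationTimeMs.size());

    for (auto const& t : times)
      printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n",
             t.second, (numBytes / 1.0E9) / t.first * 1000.0, t.first);
    printf(" StandardDev | %7.3f GB/s |\n", stdDevBw);
    return 0;
  }

Note that the commit itself centers the deviation on the already-reported run averages (transferDurationMsec and transferBandwidthGbs) rather than recomputing a mean here, but the sorted min-to-max listing and the square-root-of-mean-squared-deviation step are the same.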
src/TransferBench.cpp @ 9ab74205
@@ -445,6 +445,33 @@ void ExecuteTransfers(EnvVars const& ev,
                ExeTypeName[transfer->exeType], transfer->exeIndex,
                transfer->numSubExecs,
                transfer->DstToStr().c_str());
+        if (ev.showIterations)
+        {
+          std::set<std::pair<double, int>> times;
+          double stdDevTime = 0;
+          double stdDevBw = 0;
+          for (int i = 0; i < numTimedIterations; i++)
+          {
+            times.insert(std::make_pair(transfer->perIterationTime[i], i + 1));
+            double const varTime = fabs(transferDurationMsec - transfer->perIterationTime[i]);
+            stdDevTime += varTime * varTime;
+
+            double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
+            double const varBw = fabs(iterBandwidthGbs - transferBandwidthGbs);
+            stdDevBw += varBw * varBw;
+          }
+          stdDevTime = sqrt(stdDevTime / numTimedIterations);
+          stdDevBw   = sqrt(stdDevBw / numTimedIterations);
+
+          for (auto t : times)
+          {
+            double iterDurationMsec = t.first;
+            double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
+            printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
+          }
+          printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
+        }
       }
       else
       {
@@ -488,6 +515,33 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -488,6 +515,33 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName
[
transfer
->
exeType
],
transfer
->
exeIndex
,
ExeTypeName
[
transfer
->
exeType
],
transfer
->
exeIndex
,
transfer
->
numSubExecs
,
transfer
->
numSubExecs
,
transfer
->
DstToStr
().
c_str
());
transfer
->
DstToStr
().
c_str
());
if
(
ev
.
showIterations
)
{
std
::
set
<
std
::
pair
<
double
,
int
>>
times
;
double
stdDevTime
=
0
;
double
stdDevBw
=
0
;
for
(
int
i
=
0
;
i
<
numTimedIterations
;
i
++
)
{
times
.
insert
(
std
::
make_pair
(
transfer
->
perIterationTime
[
i
],
i
+
1
));
double
const
varTime
=
fabs
(
transferDurationMsec
-
transfer
->
perIterationTime
[
i
]);
stdDevTime
+=
varTime
*
varTime
;
double
iterBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
perIterationTime
[
i
]
*
1000.0
f
;
double
const
varBw
=
fabs
(
iterBandwidthGbs
-
transferBandwidthGbs
);
stdDevBw
+=
varBw
*
varBw
;
}
stdDevTime
=
sqrt
(
stdDevTime
/
numTimedIterations
);
stdDevBw
=
sqrt
(
stdDevBw
/
numTimedIterations
);
for
(
auto
t
:
times
)
{
double
iterDurationMsec
=
t
.
first
;
double
iterBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
iterDurationMsec
*
1000.0
f
;
printf
(
" Iter %03d | %7.3f GB/s | %8.3f ms |
\n
"
,
t
.
second
,
iterBandwidthGbs
,
iterDurationMsec
);
}
printf
(
" StandardDev | %7.3f GB/s | %8.3f ms |
\n
"
,
stdDevBw
,
stdDevTime
);
}
}
}
else
else
{
{
...
@@ -1184,12 +1238,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
           int const wallClockRate = GetWallClockRate(exeIndex);
           double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
           currTransfer->transferTime += iterationTimeMs;
+          if (ev.showIterations)
+            currTransfer->perIterationTime.push_back(iterationTimeMs);
         }
         exeInfo.totalTime += gpuDeltaMsec;
       }
       else
       {
         transfer->transferTime += gpuDeltaMsec;
+        if (ev.showIterations)
+          transfer->perIterationTime.push_back(gpuDeltaMsec);
       }
     }
   }
@@ -1224,6 +1282,8 @@ void RunTransfer(EnvVars const& ev, int const iteration,
       float gpuDeltaMsec;
       HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
       transfer->transferTime += gpuDeltaMsec;
+      if (ev.showIterations)
+        transfer->perIterationTime.push_back(gpuDeltaMsec);
     }
   }
   else if (transfer->exeType == EXE_CPU) // CPU execution agent
@@ -1252,7 +1312,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
...
@@ -1252,7 +1312,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Record time if not a warmup iteration
// Record time if not a warmup iteration
if
(
iteration
>=
0
)
if
(
iteration
>=
0
)
transfer
->
transferTime
+=
(
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
cpuDelta
).
count
()
*
1000.0
);
{
double
const
delta
=
(
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
cpuDelta
).
count
()
*
1000.0
);
transfer
->
transferTime
+=
delta
;
if
(
ev
.
showIterations
)
transfer
->
perIterationTime
.
push_back
(
delta
);
}
}
}
}
}
...
@@ -1260,6 +1325,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
 {
   ev.DisplayP2PBenchmarkEnvVars();

+  char const separator = ev.outputToCsv ? ',' : ' ';
+  printf("Bytes Per Direction%c%lu\n", separator, N * sizeof(float));
+
   // Collect the number of available CPUs/GPUs on this machine
   int const numCpus = ev.numCpuDevices;
   int const numGpus = ev.numGpuDevices;
@@ -1273,29 +1341,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
   // Perform unidirectional / bidirectional
   for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
   {
-    // Print header
-    if (!ev.outputToCsv)
-    {
-      printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n",
-             isBidirectional ? "Bi" : "Uni",
-             ev.useRemoteRead ? "Remote" : "Local",
-             ev.useRemoteRead ? "Local" : "Remote",
-             ev.useDmaCopy ? "DMA" : "GFX");
-      if (ev.useRemoteRead)
-        printf("%12s", "SRC\\EXE+DST");
-      else
-        printf("%12s", "SRC+EXE\\DST");
-      for (int i = 0; i < numCpus; i++)
-        printf("%7s %02d", "CPU", i);
-      for (int i = 0; i < numGpus; i++)
-        printf("%7s %02d", "GPU", i);
-      printf("\n");
-    }
+    printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n",
+           isBidirectional ? "Bi" : "Uni",
+           ev.useRemoteRead ? "Remote" : "Local",
+           ev.useRemoteRead ? "Local" : "Remote",
+           ev.useDmaCopy ? "DMA" : "GFX");
+
+    // Print header
+    if (isBidirectional)
+    {
+      printf("%12s", "SRC\\DST");
+    }
+    else
+    {
+      if (ev.useRemoteRead)
+        printf("%12s", "SRC\\EXE+DST");
+      else
+        printf("%12s", "SRC+EXE\\DST");
+    }
+    if (ev.outputToCsv) printf(",");
+
+    for (int i = 0; i < numCpus; i++)
+    {
+      printf("%7s %02d", "CPU", i);
+      if (ev.outputToCsv) printf(",");
+    }
+    for (int i = 0; i < numGpus; i++)
+    {
+      printf("%7s %02d", "GPU", i);
+      if (ev.outputToCsv) printf(",");
+    }
+    printf("\n");
+
+    ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;

     // Loop over all possible src/dst pairs
     for (int src = 0; src < numDevices; src++)
@@ -1303,38 +1379,193 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
       MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
       int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
-      if (!ev.outputToCsv)
-        printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
+      std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
+      std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
+      std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
+      std::vector<std::vector<double>> stdDev(isBidirectional + 1);

       for (int dst = 0; dst < numDevices; dst++)
       {
         MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
         int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
-        double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcType, srcIndex, dstType, dstIndex);
-        if (!ev.outputToCsv)
-        {
-          if (bandwidth == 0)
-            printf("%10s", "N/A");
-          else
-            printf("%10.2f", bandwidth);
-        }
-        else
-        {
-          printf("%s %02d,%s %02d,%s,%s,%s,%.2f,%lu\n",
-                 srcType == MEM_CPU ? "CPU" : "GPU", srcIndex,
-                 dstType == MEM_CPU ? "CPU" : "GPU", dstIndex,
-                 isBidirectional ? "bidirectional" : "unidirectional",
-                 ev.useRemoteRead ? "Remote" : "Local",
-                 ev.useDmaCopy ? "DMA" : "GFX",
-                 bandwidth,
-                 N * sizeof(float));
-        }
-        fflush(stdout);
+
+        // Prepare Transfers
+        std::vector<Transfer> transfers(isBidirectional + 1);
+
+        // SRC -> DST
+        transfers[0].numBytes = N * sizeof(float);
+        transfers[0].numSrcs  = transfers[0].numDsts = 1;
+        transfers[0].srcType.push_back(srcType);
+        transfers[0].dstType.push_back(dstType);
+        transfers[0].srcIndex.push_back(srcIndex);
+        transfers[0].dstIndex.push_back(dstIndex);
+        transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
+        transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
+        transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
+
+        // DST -> SRC
+        if (isBidirectional)
+        {
+          transfers[1].numBytes = N * sizeof(float);
+          transfers[1].numSrcs  = transfers[1].numDsts = 1;
+          transfers[1].srcType.push_back(dstType);
+          transfers[1].dstType.push_back(srcType);
+          transfers[1].srcIndex.push_back(dstIndex);
+          transfers[1].dstIndex.push_back(srcIndex);
+          transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
+          transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
+          transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
+        }
+
+        bool skipTest = false;
+        // Abort if executing on NUMA node with no CPUs
+        for (int i = 0; i <= isBidirectional; i++)
+        {
+          if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
+          {
+            skipTest = true;
+            break;
+          }
+#if defined(__NVCC__)
+          // NVIDIA platform cannot access GPU memory directly from CPU executors
+          if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
+          {
+            skipTest = true;
+            break;
+          }
+#endif
+        }
+        if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
+
+        if (!skipTest)
+        {
+          ExecuteTransfers(ev, 0, N, transfers, false);
+          for (int dir = 0; dir <= isBidirectional; dir++)
+          {
+            double const avgTime = transfers[dir].transferTime / ev.numIterations;
+            double const avgBw   = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
+            avgBandwidth[dir].push_back(avgBw);
+
+            if (ev.showIterations)
+            {
+              double minTime = transfers[dir].perIterationTime[0];
+              double maxTime = transfers[dir].perIterationTime[0];
+              double varSum  = 0;
+              for (int i = 0; i < transfers[dir].perIterationTime.size(); i++)
+              {
+                minTime = std::min(minTime, transfers[dir].perIterationTime[i]);
+                maxTime = std::max(maxTime, transfers[dir].perIterationTime[i]);
+                double const bw = (transfers[dir].numBytesActual / 1.0E9) / transfers[dir].perIterationTime[i] * 1000.0f;
+                double const delta = (avgBw - bw);
+                varSum += delta * delta;
+              }
+              double const minBw = (transfers[dir].numBytesActual / 1.0E9) / maxTime * 1000.0f;
+              double const maxBw = (transfers[dir].numBytesActual / 1.0E9) / minTime * 1000.0f;
+              double const stdev = sqrt(varSum / transfers[dir].perIterationTime.size());
+              minBandwidth[dir].push_back(minBw);
+              maxBandwidth[dir].push_back(maxBw);
+              stdDev[dir].push_back(stdev);
+            }
+          }
+        }
+        else
+        {
+          for (int dir = 0; dir <= isBidirectional; dir++)
+          {
+            avgBandwidth[dir].push_back(0);
+            minBandwidth[dir].push_back(0);
+            maxBandwidth[dir].push_back(0);
+            stdDev[dir].push_back(-1.0);
+          }
+        }
       }
-      if (!ev.outputToCsv) printf("\n");
+
+      for (int dir = 0; dir <= isBidirectional; dir++)
+      {
+        printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
+        if (ev.outputToCsv) printf(",");
+        for (int dst = 0; dst < numDevices; dst++)
+        {
+          double const avgBw = avgBandwidth[dir][dst];
+          if (avgBw == 0.0)
+            printf("%10s", "N/A");
+          else
+            printf("%10.2f", avgBw);
+          if (ev.outputToCsv) printf(",");
+        }
+        printf("\n");
+
+        if (ev.showIterations)
+        {
+          // minBw
+          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
+          if (ev.outputToCsv) printf(",");
+          for (int i = 0; i < numDevices; i++)
+          {
+            double const minBw = minBandwidth[dir][i];
+            if (minBw == 0.0) printf("%10s", "N/A");
+            else printf("%10.2f", minBw);
+            if (ev.outputToCsv) printf(",");
+          }
+          printf("\n");
+
+          // maxBw
+          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
+          if (ev.outputToCsv) printf(",");
+          for (int i = 0; i < numDevices; i++)
+          {
+            double const maxBw = maxBandwidth[dir][i];
+            if (maxBw == 0.0) printf("%10s", "N/A");
+            else printf("%10.2f", maxBw);
+            if (ev.outputToCsv) printf(",");
+          }
+          printf("\n");
+
+          // stddev
+          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
+          if (ev.outputToCsv) printf(",");
+          for (int i = 0; i < numDevices; i++)
+          {
+            double const sd = stdDev[dir][i];
+            if (sd == -1.0) printf("%10s", "N/A");
+            else printf("%10.2f", sd);
+            if (ev.outputToCsv) printf(",");
+          }
+          printf("\n");
+        }
+      }
+      fflush(stdout);
+
+      if (isBidirectional)
+      {
+        printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
+        if (ev.outputToCsv) printf(",");
+        for (int dst = 0; dst < numDevices; dst++)
+        {
+          double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
+          if (sumBw == 0.0) printf("%10s", "N/A");
+          else printf("%10.2f", sumBw);
+          if (ev.outputToCsv) printf(",");
+        }
+        if (src < numDevices - 1) printf("\n\n");
+      }
     }
-    if (!ev.outputToCsv)
-      printf("\n");
+    printf("\n");
   }
 }
@@ -1475,70 +1706,6 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
   printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu);
 }

-double GetPeakBandwidth(EnvVars const& ev, size_t const N, int const isBidirectional,
-                        MemType const srcType, int const srcIndex,
-                        MemType const dstType, int const dstIndex)
-{
-  // Skip bidirectional on same device
-  if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f;
-
-  // Prepare Transfers
-  std::vector<Transfer> transfers(2);
-  transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float);
-
-  // SRC -> DST
-  transfers[0].numSrcs = transfers[0].numDsts = 1;
-  transfers[0].srcType.push_back(srcType);
-  transfers[0].dstType.push_back(dstType);
-  transfers[0].srcIndex.push_back(srcIndex);
-  transfers[0].dstIndex.push_back(dstIndex);
-
-  // DST -> SRC
-  transfers[1].numSrcs = transfers[1].numDsts = 1;
-  transfers[1].srcType.push_back(dstType);
-  transfers[1].dstType.push_back(srcType);
-  transfers[1].srcIndex.push_back(dstIndex);
-  transfers[1].dstIndex.push_back(srcIndex);
-
-  // Either perform (local read + remote write), or (remote read + local write)
-  ExeType gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
-  transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
-  transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
-  transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
-  transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
-  transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
-  transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
-
-  // Remove (DST->SRC) if not bidirectional
-  transfers.resize(isBidirectional + 1);
-
-  // Abort if executing on NUMA node with no CPUs
-  for (int i = 0; i <= isBidirectional; i++)
-  {
-    if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
-      return 0;
-#if defined(__NVCC__)
-    // NVIDIA platform cannot access GPU memory directly from CPU executors
-    if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
-      return 0;
-#endif
-  }
-
-  ExecuteTransfers(ev, 0, N, transfers, false);
-
-  // Collect aggregate bandwidth
-  double totalBandwidth = 0;
-  for (int i = 0; i <= isBidirectional; i++)
-  {
-    double transferDurationMsec = transfers[i].transferTime / (1.0 * ev.numIterations);
-    double transferBandwidthGbs = (transfers[i].numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
-    totalBandwidth += transferBandwidthGbs;
-  }
-  return totalBandwidth;
-}
-
 void Transfer::PrepareSubExecParams(EnvVars const& ev)
 {
   // Each subExecutor needs to know src/dst pointers and how many elements to transfer
@@ -1582,6 +1749,7 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
   }

   this->transferTime = 0.0;
+  this->perIterationTime.clear();
 }

 void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx)
 ...
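Throughout the changes above, bandwidth in GB/s is derived from a byte count and a duration in milliseconds as (numBytesActual / 1.0E9) / timeMsec * 1000.0. A quick sanity check of that conversion, using made-up numbers rather than measured values:

  // Bytes / 1e9 gives GB; dividing by milliseconds and multiplying by 1000 yields GB/s.
  #include <cstdio>

  int main()
  {
    double const numBytes = 268435456.0;  // 256 MiB, illustrative only
    double const timeMsec = 8.0;          // illustrative elapsed time
    double const bwGbs = (numBytes / 1.0E9) / timeMsec * 1000.0;
    printf("%.2f GB/s\n", bwGbs);         // prints 33.55 GB/s
    return 0;
  }

The same relationship is why, in the per-direction summary, the minimum bandwidth row is computed from the maximum per-iteration time and the maximum bandwidth from the minimum time.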
src/include/EnvVars.hpp @ 9ab74205
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"

-#define TB_VERSION "1.25"
+#define TB_VERSION "1.26"

 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -75,6 +75,7 @@ public:
   int outputToCsv;      // Output in CSV format
   int samplingFactor;   // Affects how many different values of N are generated (when N set to 0)
   int sharedMemBytes;   // Amount of shared memory to use per threadblock
+  int showIterations;   // Show per-iteration timing info
   int useInteractive;   // Pause for user-input before starting transfer loop
   int usePcieIndexing;  // Base GPU indexing on PCIe address instead of HIP device
   int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
@@ -155,6 +156,7 @@ public:
     outputToCsv      = GetEnvVar("OUTPUT_TO_CSV", 0);
     samplingFactor   = GetEnvVar("SAMPLING_FACTOR", DEFAULT_SAMPLING_FACTOR);
     sharedMemBytes   = GetEnvVar("SHARED_MEM_BYTES", defaultSharedMemBytes);
+    showIterations   = GetEnvVar("SHOW_ITERATIONS", 0);
     useInteractive   = GetEnvVar("USE_INTERACTIVE", 0);
     usePcieIndexing  = GetEnvVar("USE_PCIE_INDEX", 0);
     usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL", 0);
@@ -164,10 +166,10 @@ public:
     gpuKernel        = GetEnvVar("GPU_KERNEL", defaultGpuKernel);

     // P2P Benchmark related
     useRemoteRead    = GetEnvVar("USE_REMOTE_READ", 0);
     useDmaCopy       = GetEnvVar("USE_GPU_DMA", 0);
     numGpuSubExecs   = GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : numDeviceCUs);
     numCpuSubExecs   = GetEnvVar("NUM_CPU_SE", DEFAULT_P2P_NUM_CPU_SE);

     // Sweep related
     sweepMin         = GetEnvVar("SWEEP_MIN", DEFAULT_SWEEP_MIN);
@@ -382,6 +384,7 @@ public:
     printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
     printf(" SAMPLING_FACTOR=F  - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
     printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
+    printf(" SHOW_ITERATIONS    - Show per-iteration timing info\n");
     printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
     printf(" USE_PCIE_INDEX     - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
     printf(" USE_PREP_KERNEL    - Use GPU kernel to initialize source data array pattern\n");
@@ -429,6 +432,8 @@ public:
              std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
     PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
              std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
+    PRINT_EV("SHOW_ITERATIONS", showIterations,
+             std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
     PRINT_EV("USE_INTERACTIVE", useInteractive,
              std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
     PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
 ...
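EnvVars.hpp reads the new variable with GetEnvVar("SHOW_ITERATIONS", 0), that is, an integer read that falls back to 0 when the variable is unset. As a rough standalone sketch of that behavior (an assumption about what the repository's GetEnvVar helper does, not its actual implementation):

  // Hypothetical stand-in for an integer env-var read with a default value.
  #include <cstdio>
  #include <cstdlib>
  #include <string>

  static int GetEnvVarSketch(std::string const& name, int defaultValue)
  {
    char const* value = std::getenv(name.c_str());
    return value ? std::atoi(value) : defaultValue;   // unset -> default
  }

  int main()
  {
    int const showIterations = GetEnvVarSketch("SHOW_ITERATIONS", 0);
    printf("SHOW_ITERATIONS = %d\n", showIterations);
    return 0;
  }

Running the benchmark with SHOW_ITERATIONS=1 in the environment would then enable the extra per-iteration rows added in src/TransferBench.cpp.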
src/include/TransferBench.hpp @ 9ab74205
@@ -119,6 +119,8 @@ struct Transfer
   std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
   SubExecParam* subExecParamGpuPtr;       // Pointer to GPU copy of subExecParam

+  std::vector<double> perIterationTime;   // Per-iteration timing
+
   // Prepares src/dst subarray pointers for each SubExecutor
   void PrepareSubExecParams(EnvVars const& ev);
@@ -187,12 +189,6 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
 void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
 void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);

-// Return the maximum bandwidth measured for given (src/dst) pair
-double GetPeakBandwidth(EnvVars const& ev, size_t const N, int const isBidirectional,
-                        MemType const srcType, int const srcIndex,
-                        MemType const dstType, int const dstIndex);
-
 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
 int RemappedIndex(int const origIdx, bool const isCpuType);
 ...