Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
0bdcd635
Unverified
Commit
0bdcd635
authored
Jul 14, 2023
by
gilbertlee-amd
Committed by
GitHub
Jul 14, 2023
Browse files
Add All2All Benchmark (#46)
* Adding a2a preset benchmark, fixing some gfx941 values
parent
9132801d
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
106 additions
and
8 deletions
+106
-8
CHANGELOG.md
CHANGELOG.md
+5
-0
src/TransferBench.cpp
src/TransferBench.cpp
+95
-5
src/include/EnvVars.hpp
src/include/EnvVars.hpp
+3
-2
src/include/TransferBench.hpp
src/include/TransferBench.hpp
+3
-1
No files found.
CHANGELOG.md
View file @
0bdcd635
# Changelog for TransferBench
# Changelog for TransferBench
## v1.24
### Added
-
New All-To-All GPU benchmark accessed by preset "a2a"
-
Adding gfx941 wall clock frequency
## v1.23
## v1.23
### Added
### Added
-
New GPU subexec scaling benchmark accessed by preset "scaling"
-
New GPU subexec scaling benchmark accessed by preset "scaling"
...
...
src/TransferBench.cpp
View file @
0bdcd635
...
@@ -104,6 +104,17 @@ int main(int argc, char **argv)
...
@@ -104,6 +104,17 @@ int main(int argc, char **argv)
RunScalingBenchmark
(
ev
,
numBytesPerTransfer
/
sizeof
(
float
),
exeIndex
,
maxSubExecs
);
RunScalingBenchmark
(
ev
,
numBytesPerTransfer
/
sizeof
(
float
),
exeIndex
,
maxSubExecs
);
exit
(
0
);
exit
(
0
);
}
}
// - Test all2all benchmark
else
if
(
!
strcmp
(
argv
[
1
],
"a2a"
))
{
int
numSubExecs
=
(
argc
>
3
?
atoi
(
argv
[
3
])
:
4
);
// Force single-stream mode for all-to-all benchmark
ev
.
useSingleStream
=
1
;
ev
.
configMode
=
CFG_A2A
;
RunAllToAllBenchmark
(
ev
,
numBytesPerTransfer
,
numSubExecs
);
exit
(
0
);
}
// Check that Transfer configuration file can be opened
// Check that Transfer configuration file can be opened
ev
.
configMode
=
CFG_FILE
;
ev
.
configMode
=
CFG_FILE
;
...
@@ -163,14 +174,17 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -163,14 +174,17 @@ void ExecuteTransfers(EnvVars const& ev,
int
const
testNum
,
int
const
testNum
,
size_t
const
N
,
size_t
const
N
,
std
::
vector
<
Transfer
>&
transfers
,
std
::
vector
<
Transfer
>&
transfers
,
bool
verbose
)
bool
verbose
,
double
*
totalBandwidthCpu
)
{
{
int
const
initOffset
=
ev
.
byteOffset
/
sizeof
(
float
);
int
const
initOffset
=
ev
.
byteOffset
/
sizeof
(
float
);
// Map transfers by executor
// Map transfers by executor
TransferMap
transferMap
;
TransferMap
transferMap
;
for
(
Transfer
&
transfer
:
transfers
)
for
(
int
i
=
0
;
i
<
transfers
.
size
();
i
++
)
{
{
Transfer
&
transfer
=
transfers
[
i
];
transfer
.
transferIndex
=
i
;
Executor
executor
(
transfer
.
exeType
,
transfer
.
exeIndex
);
Executor
executor
(
transfer
.
exeType
,
transfer
.
exeIndex
);
ExecutorInfo
&
executorInfo
=
transferMap
[
executor
];
ExecutorInfo
&
executorInfo
=
transferMap
[
executor
];
executorInfo
.
transfers
.
push_back
(
&
transfer
);
executorInfo
.
transfers
.
push_back
(
&
transfer
);
...
@@ -370,6 +384,7 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -370,6 +384,7 @@ void ExecuteTransfers(EnvVars const& ev,
// Validate that each transfer has transferred correctly
// Validate that each transfer has transferred correctly
size_t
totalBytesTransferred
=
0
;
size_t
totalBytesTransferred
=
0
;
int
const
numTransfers
=
transferList
.
size
();
int
const
numTransfers
=
transferList
.
size
();
for
(
auto
transferPair
:
transferList
)
for
(
auto
transferPair
:
transferList
)
{
{
Transfer
*
transfer
=
transferPair
.
second
;
Transfer
*
transfer
=
transferPair
.
second
;
...
@@ -380,6 +395,8 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -380,6 +395,8 @@ void ExecuteTransfers(EnvVars const& ev,
// Report timings
// Report timings
totalCpuTime
=
totalCpuTime
/
(
1.0
*
numTimedIterations
)
*
1000
;
totalCpuTime
=
totalCpuTime
/
(
1.0
*
numTimedIterations
)
*
1000
;
double
totalBandwidthGbs
=
(
totalBytesTransferred
/
1.0E6
)
/
totalCpuTime
;
double
totalBandwidthGbs
=
(
totalBytesTransferred
/
1.0E6
)
/
totalCpuTime
;
if
(
totalBandwidthCpu
)
*
totalBandwidthCpu
=
totalBandwidthGbs
;
double
maxGpuTime
=
0
;
double
maxGpuTime
=
0
;
if
(
!
isSrcCorrect
)
goto
cleanup
;
if
(
!
isSrcCorrect
)
goto
cleanup
;
...
@@ -568,6 +585,8 @@ void DisplayUsage(char const* cmdName)
...
@@ -568,6 +585,8 @@ void DisplayUsage(char const* cmdName)
printf
(
" scaling - GPU SubExec scaling copy test
\n
"
);
printf
(
" scaling - GPU SubExec scaling copy test
\n
"
);
printf
(
" - 3th optional arg: Max # of SubExecs to use
\n
"
);
printf
(
" - 3th optional arg: Max # of SubExecs to use
\n
"
);
printf
(
" - 4rd optional arg: GPU index to use as executor
\n
"
);
printf
(
" - 4rd optional arg: GPU index to use as executor
\n
"
);
printf
(
" a2a - GPU All-To-All benchmark
\n
"
);
printf
(
" - 3rd optional arg: # of SubExecs to use
\n
"
);
printf
(
" N : (Optional) Number of bytes to copy per Transfer.
\n
"
);
printf
(
" N : (Optional) Number of bytes to copy per Transfer.
\n
"
);
printf
(
" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes
\n
"
,
printf
(
" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes
\n
"
,
DEFAULT_BYTES_PER_TRANSFER
);
DEFAULT_BYTES_PER_TRANSFER
);
...
@@ -882,7 +901,6 @@ void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>&
...
@@ -882,7 +901,6 @@ void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>&
for
(
int
i
=
0
;
i
<
numTransfers
;
i
++
)
for
(
int
i
=
0
;
i
<
numTransfers
;
i
++
)
{
{
Transfer
transfer
;
Transfer
transfer
;
transfer
.
transferIndex
=
i
;
transfer
.
numBytes
=
0
;
transfer
.
numBytes
=
0
;
transfer
.
numBytesActual
=
0
;
transfer
.
numBytesActual
=
0
;
if
(
!
advancedMode
)
if
(
!
advancedMode
)
...
@@ -1022,8 +1040,13 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
...
@@ -1022,8 +1040,13 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
exit
(
1
);
exit
(
1
);
#else
#else
HIP_CALL
(
hipSetDevice
(
devIndex
));
HIP_CALL
(
hipSetDevice
(
devIndex
));
HIP_CALL
(
hipExtMallocWithFlags
((
void
**
)
memPtr
,
numBytes
,
hipDeviceMallocFinegrained
));
// NOTE: hipDeviceMallocFinegrained will be replaced by hipDeviceMallocUncached eventually
// Until then, this workaround is required
hipDeviceProp_t
prop
;
HIP_CALL
(
hipGetDeviceProperties
(
&
prop
,
0
));
int
flag
=
(
prop
.
gcnArch
/
10
==
94
)
?
0x3
:
hipDeviceMallocFinegrained
;
HIP_CALL
(
hipExtMallocWithFlags
((
void
**
)
memPtr
,
numBytes
,
flag
));
#endif
#endif
}
}
HIP_CALL
(
hipMemset
(
*
memPtr
,
0
,
numBytes
));
HIP_CALL
(
hipMemset
(
*
memPtr
,
0
,
numBytes
));
...
@@ -1385,6 +1408,73 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
...
@@ -1385,6 +1408,73 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
printf
(
"
\n
"
);
printf
(
"
\n
"
);
}
}
// Runs the "a2a" preset: a GPU all-to-all copy benchmark.
//
// One Transfer is created for every ordered (src, dst) pair of GPUs, including
// src == dst. Each Transfer uses the GFX executor on the source GPU, with one
// GPU-memory source and one GPU-memory destination. All Transfers are handed
// to a single ExecuteTransfers() call, after which a SRC x DST bandwidth table
// and the CPU-timed aggregate bandwidth are printed.
//
// ev                  - environment settings; numGpuDevices, outputToCsv and
//                       numIterations are read here
// numBytesPerTransfer - number of bytes assigned to each Transfer
// numSubExecs         - numSubExecs value assigned to each Transfer
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
{
  ev.DisplayEnvVars();

  // Collect the number of GPU devices to use
  int const numGpus = ev.numGpuDevices;

  // Enable peer to peer for each GPU
  for (int i = 0; i < numGpus; i++)
    for (int j = 0; j < numGpus; j++)
      if (i != j) EnablePeerAccess(i, j);

  // Column separator for the summary table
  char const separator = (ev.outputToCsv ? ',' : ' ');

  // Template Transfer; only srcIndex / exeIndex / dstIndex vary per pair
  Transfer transfer;
  transfer.numBytes    = numBytesPerTransfer;
  transfer.numSubExecs = numSubExecs;
  transfer.numSrcs     = 1;
  transfer.numDsts     = 1;
  transfer.exeType     = EXE_GPU_GFX;
  transfer.srcType.resize(1, MEM_GPU);
  transfer.dstType.resize(1, MEM_GPU);
  transfer.srcIndex.resize(1);
  transfer.dstIndex.resize(1);

  // Transfers are stored row-major: entry [src * numGpus + dst]
  std::vector<Transfer> transfers;
  if (numGpus > 0)
    transfers.reserve(static_cast<size_t>(numGpus) * static_cast<size_t>(numGpus));

  for (int i = 0; i < numGpus; i++)
  {
    transfer.srcIndex[0] = i;
    transfer.exeIndex    = i;
    for (int j = 0; j < numGpus; j++)
    {
      transfer.dstIndex[0] = j;
      transfers.push_back(transfer);
    }
  }

  printf("GPU-GFX All-To-All benchmark:\n");
  printf("==========================\n");
  // %zu is the conversion that matches size_t
  printf("- Copying %zu bytes between every pair of GPUs using %d CUs\n", numBytesPerTransfer, numSubExecs);
  printf("- All numbers reported as GB/sec\n\n");

  // ExecuteTransfers takes its size argument in floats, not bytes
  double totalBandwidthCpu = 0;
  ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, true, &totalBandwidthCpu);

  printf("\nSummary:\n");
  printf("==========================================================\n");
  printf("SRC\\DST");
  for (int dst = 0; dst < numGpus; dst++)
    printf("%cGPU %02d ", separator, dst);
  printf("\n");

  for (int src = 0; src < numGpus; src++)
  {
    printf("GPU %02d", src);
    for (int dst = 0; dst < numGpus; dst++)
    {
      // Named differently from the template object above to avoid shadowing it
      Transfer const& measured = transfers[src * numGpus + dst];

      // Average time per iteration (presumably milliseconds, per the variable name),
      // then bytes -> GB and msec -> sec to obtain GB/sec
      double const transferDurationMsec = measured.transferTime / (1.0 * ev.numIterations);
      double const transferBandwidthGbs = (measured.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0;
      printf("%c%7.2f ", separator, transferBandwidthGbs);
    }
    printf("\n");
  }
  printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu);
}
double
GetPeakBandwidth
(
EnvVars
const
&
ev
,
size_t
const
N
,
double
GetPeakBandwidth
(
EnvVars
const
&
ev
,
size_t
const
N
,
int
const
isBidirectional
,
int
const
isBidirectional
,
MemType
const
srcType
,
int
const
srcIndex
,
MemType
const
srcType
,
int
const
srcIndex
,
...
@@ -1715,6 +1805,7 @@ int GetWallClockRate(int deviceId)
...
@@ -1715,6 +1805,7 @@ int GetWallClockRate(int deviceId)
switch
(
prop
.
gcnArch
)
switch
(
prop
.
gcnArch
)
{
{
case
906
:
case
910
:
value
=
25000
;
break
;
case
906
:
case
910
:
value
=
25000
;
break
;
case
940
:
case
941
:
case
942
:
value
=
100000
;
break
;
default:
default:
printf
(
"Unrecognized GCN arch %d
\n
"
,
prop
.
gcnArch
);
printf
(
"Unrecognized GCN arch %d
\n
"
,
prop
.
gcnArch
);
}
}
...
@@ -1943,7 +2034,6 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
...
@@ -1943,7 +2034,6 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
transfer
.
dstType
=
{
possibleTransfers
[
value
].
dstType
};
transfer
.
dstType
=
{
possibleTransfers
[
value
].
dstType
};
transfer
.
dstIndex
=
{
possibleTransfers
[
value
].
dstIndex
};
transfer
.
dstIndex
=
{
possibleTransfers
[
value
].
dstIndex
};
transfer
.
numSubExecs
=
IsGpuType
(
transfer
.
exeType
)
?
numGpuSubExecs
:
numCpuSubExecs
;
transfer
.
numSubExecs
=
IsGpuType
(
transfer
.
exeType
)
?
numGpuSubExecs
:
numCpuSubExecs
;
transfer
.
transferIndex
=
transfers
.
size
();
transfer
.
numBytes
=
ev
.
sweepRandBytes
?
randSize
(
*
ev
.
generator
)
*
sizeof
(
float
)
:
0
;
transfer
.
numBytes
=
ev
.
sweepRandBytes
?
randSize
(
*
ev
.
generator
)
*
sizeof
(
float
)
:
0
;
transfers
.
push_back
(
transfer
);
transfers
.
push_back
(
transfer
);
}
}
...
...
src/include/EnvVars.hpp
View file @
0bdcd635
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Compatibility.hpp"
#include "Kernels.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.2
3
"
#define TB_VERSION "1.2
4
"
extern
char
const
MemTypeStr
[];
extern
char
const
MemTypeStr
[];
extern
char
const
ExeTypeStr
[];
extern
char
const
ExeTypeStr
[];
...
@@ -39,7 +39,8 @@ enum ConfigModeEnum
...
@@ -39,7 +39,8 @@ enum ConfigModeEnum
CFG_FILE
=
0
,
CFG_FILE
=
0
,
CFG_P2P
=
1
,
CFG_P2P
=
1
,
CFG_SWEEP
=
2
,
CFG_SWEEP
=
2
,
CFG_SCALE
=
3
CFG_SCALE
=
3
,
CFG_A2A
=
4
};
};
// This class manages environment variable that affect TransferBench
// This class manages environment variable that affect TransferBench
...
...
src/include/TransferBench.hpp
View file @
0bdcd635
...
@@ -174,7 +174,8 @@ void ParseTransfers(char* line, int numCpus, int numGpus,
...
@@ -174,7 +174,8 @@ void ParseTransfers(char* line, int numCpus, int numGpus,
std
::
vector
<
Transfer
>&
transfers
);
std
::
vector
<
Transfer
>&
transfers
);
void
ExecuteTransfers
(
EnvVars
const
&
ev
,
int
const
testNum
,
size_t
const
N
,
void
ExecuteTransfers
(
EnvVars
const
&
ev
,
int
const
testNum
,
size_t
const
N
,
std
::
vector
<
Transfer
>&
transfers
,
bool
verbose
=
true
);
std
::
vector
<
Transfer
>&
transfers
,
bool
verbose
=
true
,
double
*
totalBandwidthCpu
=
nullptr
);
void
EnablePeerAccess
(
int
const
deviceId
,
int
const
peerDeviceId
);
void
EnablePeerAccess
(
int
const
deviceId
,
int
const
peerDeviceId
);
void
AllocateMemory
(
MemType
memType
,
int
devIndex
,
size_t
numBytes
,
void
**
memPtr
);
void
AllocateMemory
(
MemType
memType
,
int
devIndex
,
size_t
numBytes
,
void
**
memPtr
);
...
@@ -184,6 +185,7 @@ void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo,
...
@@ -184,6 +185,7 @@ void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo,
void
RunPeerToPeerBenchmarks
(
EnvVars
const
&
ev
,
size_t
N
);
void
RunPeerToPeerBenchmarks
(
EnvVars
const
&
ev
,
size_t
N
);
void
RunScalingBenchmark
(
EnvVars
const
&
ev
,
size_t
N
,
int
const
exeIndex
,
int
const
maxSubExecs
);
void
RunScalingBenchmark
(
EnvVars
const
&
ev
,
size_t
N
,
int
const
exeIndex
,
int
const
maxSubExecs
);
void
RunSweepPreset
(
EnvVars
const
&
ev
,
size_t
const
numBytesPerTransfer
,
int
const
numGpuSubExec
,
int
const
numCpuSubExec
,
bool
const
isRandom
);
void
RunSweepPreset
(
EnvVars
const
&
ev
,
size_t
const
numBytesPerTransfer
,
int
const
numGpuSubExec
,
int
const
numCpuSubExec
,
bool
const
isRandom
);
void
RunAllToAllBenchmark
(
EnvVars
const
&
ev
,
size_t
const
numBytesPerTransfer
,
int
const
numSubExecs
);
// Return the maximum bandwidth measured for given (src/dst) pair
// Return the maximum bandwidth measured for given (src/dst) pair
double
GetPeakBandwidth
(
EnvVars
const
&
ev
,
size_t
const
N
,
double
GetPeakBandwidth
(
EnvVars
const
&
ev
,
size_t
const
N
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment