Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
811064b5
Commit
811064b5
authored
Aug 24, 2022
by
Gilbert Lee
Browse files
Adding new sweep environment variables
parent
93430da1
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
278 additions
and
76 deletions
+278
-76
CHANGELOG.md
CHANGELOG.md
+13
-0
EnvVars.hpp
EnvVars.hpp
+105
-15
TransferBench.cpp
TransferBench.cpp
+157
-60
TransferBench.hpp
TransferBench.hpp
+3
-1
No files found.
CHANGELOG.md
View file @
811064b5
# Changelog for TransferBench
## v1.04
### Added
-
New environment variables for sweep based presets
-
SWEEP_XGMI_MIN - Min number of XGMI hops for Transfers
-
SWEEP_XGMI_MAX - Max number of XGMI hops for Transfers
-
SWEEP_SEED - Random seed being used
-
SWEEP_RAND_BYTES - Use random amount of bytes (up to pre-specified N) for each Transfer
### Changed
-
CSV output for sweep includes env vars section followed by output
-
CSV output no longer lists env var parameters in columns
-
Default number of warmup iterations changed from 3 to 1
-
Splitting CSV output of link type to ExeToSrcLinkType and ExeToDstLinkType
## v1.03
### Added
-
New stress-test benchmark preset modes "sweep" and "rsweep" (random sweep)
...
...
EnvVars.hpp
View file @
811064b5
...
...
@@ -24,17 +24,25 @@ THE SOFTWARE.
#define ENVVARS_HPP
#include <algorithm>
#define TB_VERSION "1.03"
#include <random>
#include <time.h>
#define TB_VERSION "1.04"
extern
char
const
MemTypeStr
[];
// Identifies which configuration mode TransferBench is running in.
// main() stores the selected mode in EnvVars::configMode before dispatching.
enum ConfigModeEnum
{
  CFG_FILE  = 0, // Transfers are read from a configuration file
  CFG_P2P   = 1, // Peer-to-peer benchmark preset
  CFG_SWEEP = 2  // Sweep preset ("sweep" / "rsweep")
};
// This class manages environment variable that affect TransferBench
class
EnvVars
{
public:
// Default configuration values
int
const
DEFAULT_NUM_WARMUPS
=
3
;
int
const
DEFAULT_NUM_WARMUPS
=
1
;
int
const
DEFAULT_NUM_ITERATIONS
=
10
;
int
const
DEFAULT_SAMPLING_FACTOR
=
1
;
int
const
DEFAULT_NUM_CPU_PER_TRANSFER
=
4
;
...
...
@@ -73,10 +81,20 @@ public:
int
sweepMax
;
// Max number of simultaneous Transfers to be executed per test
int
sweepTestLimit
;
// Max number of tests to run during sweep (0 = no limit)
int
sweepTimeLimit
;
// Max number of seconds to run sweep for (0 = no limit)
int
sweepXgmiMin
;
// Min number of XGMI hops for Transfers
int
sweepXgmiMax
;
// Max number of XGMI hops for Transfers (-1 = no limit)
int
sweepSeed
;
// Random seed to use
int
sweepRandBytes
;
// Whether or not to use random number of bytes per Transfer
std
::
string
sweepSrc
;
// Set of src memory types to be swept
std
::
string
sweepExe
;
// Set of executors to be swept
std
::
string
sweepDst
;
// Set of dst memory types to be swept
// Used to track current configuration mode
ConfigModeEnum
configMode
;
// Random generator
std
::
default_random_engine
*
generator
;
// Constructor that collects values
EnvVars
()
{
...
...
@@ -104,14 +122,23 @@ public:
usePcieIndexing
=
GetEnvVar
(
"USE_PCIE_INDEX"
,
0
);
useSingleStream
=
GetEnvVar
(
"USE_SINGLE_STREAM"
,
0
);
sweepSrcIsExe
=
GetEnvVar
(
"SWEEP_SRC_IS_EXE"
,
DEFAULT_SWEEP_SRC_IS_EXE
);
sweepMin
=
GetEnvVar
(
"SWEEP_MIN"
,
DEFAULT_SWEEP_MIN
);
sweepMax
=
GetEnvVar
(
"SWEEP_MAX"
,
DEFAULT_SWEEP_MAX
);
sweepSrc
=
GetEnvVar
(
"SWEEP_SRC"
,
DEFAULT_SWEEP_SRC
);
sweepExe
=
GetEnvVar
(
"SWEEP_EXE"
,
DEFAULT_SWEEP_EXE
);
sweepDst
=
GetEnvVar
(
"SWEEP_DST"
,
DEFAULT_SWEEP_DST
);
sweepTestLimit
=
GetEnvVar
(
"SWEEP_TEST_LIMIT"
,
DEFAULT_SWEEP_TEST_LIMIT
);
sweepTimeLimit
=
GetEnvVar
(
"SWEEP_TIME_LIMIT"
,
DEFAULT_SWEEP_TIME_LIMIT
);
sweepSrcIsExe
=
GetEnvVar
(
"SWEEP_SRC_IS_EXE"
,
DEFAULT_SWEEP_SRC_IS_EXE
);
sweepMin
=
GetEnvVar
(
"SWEEP_MIN"
,
DEFAULT_SWEEP_MIN
);
sweepMax
=
GetEnvVar
(
"SWEEP_MAX"
,
DEFAULT_SWEEP_MAX
);
sweepSrc
=
GetEnvVar
(
"SWEEP_SRC"
,
DEFAULT_SWEEP_SRC
);
sweepExe
=
GetEnvVar
(
"SWEEP_EXE"
,
DEFAULT_SWEEP_EXE
);
sweepDst
=
GetEnvVar
(
"SWEEP_DST"
,
DEFAULT_SWEEP_DST
);
sweepTestLimit
=
GetEnvVar
(
"SWEEP_TEST_LIMIT"
,
DEFAULT_SWEEP_TEST_LIMIT
);
sweepTimeLimit
=
GetEnvVar
(
"SWEEP_TIME_LIMIT"
,
DEFAULT_SWEEP_TIME_LIMIT
);
sweepXgmiMin
=
GetEnvVar
(
"SWEEP_XGMI_MIN"
,
0
);
sweepXgmiMax
=
GetEnvVar
(
"SWEEP_XGMI_MAX"
,
-
1
);
sweepRandBytes
=
GetEnvVar
(
"SWEEP_RAND_BYTES"
,
0
);
// Determine random seed
char
*
sweepSeedStr
=
getenv
(
"SWEEP_SEED"
);
sweepSeed
=
(
sweepSeedStr
!=
NULL
?
atoi
(
sweepSeedStr
)
:
time
(
NULL
));
generator
=
new
std
::
default_random_engine
(
sweepSeed
);
// Check for fill pattern
char
*
pattern
=
getenv
(
"FILL_PATTERN"
);
...
...
@@ -303,10 +330,10 @@ public:
printf
(
"%-20s = %12d : Using %d CPU devices
\n
"
,
"NUM_CPU_DEVICES"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer
\n
"
,
"NUM_CPU_PER_TRANSFER"
,
numCpuPerTransfer
,
numCpuPerTransfer
);
printf
(
"%-20s = %12d : Using %d GPU devices
\n
"
,
"NUM_GPU_DEVICES"
,
numGpuDevices
,
numGpuDevices
);
printf
(
"%-20s = %12d : Running %d %s per
t
est
\n
"
,
"NUM_ITERATIONS"
,
numIterations
,
printf
(
"%-20s = %12d : Running %d %s per
T
est
\n
"
,
"NUM_ITERATIONS"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"%-20s = %12d : Running %d warmup iteration(s) per
topology
\n
"
,
"NUM_WARMUPS"
,
numWarmups
,
numWarmups
);
printf
(
"%-20s = %12d : Running %d warmup iteration(s) per
Test
\n
"
,
"NUM_WARMUPS"
,
numWarmups
,
numWarmups
);
printf
(
"%-20s = %12d : Output to %s
\n
"
,
"OUTPUT_TO_CSV"
,
outputToCsv
,
outputToCsv
?
"CSV"
:
"console"
);
printf
(
"%-20s = %12s : Using %d shared mem per threadblock
\n
"
,
"SHARED_MEM_BYTES"
,
...
...
@@ -329,6 +356,30 @@ public:
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
printf
(
"
\n
"
);
}
else
{
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy
\n
"
,
blockBytes
,
blockBytes
);
printf
(
"BYTE_OFFSET,%d,Using byte offset of %d
\n
"
,
byteOffset
,
byteOffset
);
printf
(
"FILL_PATTERN,%s,"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
if
(
fillPattern
.
size
())
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
else
printf
(
"Pseudo-random: (Element i = i modulo 383 + 31)"
);
printf
(
"
\n
"
);
printf
(
"NUM_CPU_DEVICES,%d,Using %d CPU devices
\n
"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"NUM_CPU_PER_TRANSFER,%d,Using %d CPU thread(s) per CPU-executed Transfer
\n
"
,
numCpuPerTransfer
,
numCpuPerTransfer
);
printf
(
"NUM_GPU_DEVICES,%d,Using %d GPU devices
\n
"
,
numGpuDevices
,
numGpuDevices
);
printf
(
"NUM_ITERATIONS,%d,Running %d %s per Test
\n
"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test
\n
"
,
numWarmups
,
numWarmups
);
printf
(
"SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock
\n
"
,
sharedMemBytes
,
sharedMemBytes
);
printf
(
"USE_HIP_CALL,%d,Using %s for GPU-executed copies
\n
"
,
useHipCall
,
useHipCall
?
"HIP functions"
:
"custom kernels"
);
printf
(
"USE_MEMSET,%d,Performing %s
\n
"
,
useMemset
,
useMemset
?
"memset"
:
"memcopy"
);
printf
(
"USE_PCIE_INDEX,%d,Using %s-based GPU indexing
\n
"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
printf
(
"USE_SINGLE_STREAM,%d,Using single stream per %s
\n
"
,
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
}
};
// Display env var settings
...
...
@@ -338,6 +389,7 @@ public:
{
printf
(
"Sweep configuration (TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"=====================================================
\n
"
);
printf
(
"%-20s = %12d : Random seed
\n
"
,
"SWEEP_SEED"
,
sweepSeed
);
printf
(
"%-20s = %12s : Source Memory Types to sweep
\n
"
,
"SWEEP_SRC"
,
sweepSrc
.
c_str
());
printf
(
"%-20s = %12s : Executor Types to sweep
\n
"
,
"SWEEP_EXE"
,
sweepExe
.
c_str
());
printf
(
"%-20s = %12s : Destination Memory Types to sweep
\n
"
,
"SWEEP_DST"
,
sweepDst
.
c_str
());
...
...
@@ -346,6 +398,9 @@ public:
printf
(
"%-20s = %12d : Max simultaneous Transfers (0 = no limit)
\n
"
,
"SWEEP_MAX"
,
sweepMax
);
printf
(
"%-20s = %12d : Max number of tests to run during sweep (0 = no limit)
\n
"
,
"SWEEP_TEST_LIMIT"
,
sweepTestLimit
);
printf
(
"%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)
\n
"
,
"SWEEP_TIME_LIMIT"
,
sweepTimeLimit
);
printf
(
"%-20s = %12d : Min number of XGMI hops for Transfers
\n
"
,
"SWEEP_XGMI_MIN"
,
sweepXgmiMin
);
printf
(
"%-20s = %12d : Max number of XGMI hops for Transfers (-1 = no limit)
\n
"
,
"SWEEP_XGMI_MAX"
,
sweepXgmiMax
);
printf
(
"%-20s = %12d : Using %s number of bytes per Transfer
\n
"
,
"SWEEP_RAND_BYTES"
,
sweepRandBytes
,
sweepRandBytes
?
"random"
:
"constant"
);
printf
(
"%-20s = %12d : Using %d CPU devices
\n
"
,
"NUM_CPU_DEVICES"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer
\n
"
,
"NUM_CPU_PER_TRANSFER"
,
numCpuPerTransfer
,
numCpuPerTransfer
);
printf
(
"%-20s = %12d : Using %d GPU devices
\n
"
,
"NUM_GPU_DEVICES"
,
numGpuDevices
,
numGpuDevices
);
...
...
@@ -357,10 +412,10 @@ public:
else
printf
(
"Pseudo-random: (Element i = i modulo 383 + 31)"
);
printf
(
"
\n
"
);
printf
(
"%-20s = %12d : Running %d %s per
t
est
\n
"
,
"NUM_ITERATIONS"
,
numIterations
,
printf
(
"%-20s = %12d : Running %d %s per
T
est
\n
"
,
"NUM_ITERATIONS"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"%-20s = %12d : Running %d warmup iteration(s) per
topology
\n
"
,
"NUM_WARMUPS"
,
numWarmups
,
numWarmups
);
printf
(
"%-20s = %12d : Running %d warmup iteration(s) per
Test
\n
"
,
"NUM_WARMUPS"
,
numWarmups
,
numWarmups
);
printf
(
"%-20s = %12d : Output to %s
\n
"
,
"OUTPUT_TO_CSV"
,
outputToCsv
,
outputToCsv
?
"CSV"
:
"console"
);
printf
(
"%-20s = %12s : Using %d shared mem per threadblock
\n
"
,
"SHARED_MEM_BYTES"
,
...
...
@@ -379,6 +434,41 @@ public:
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
printf
(
"
\n
"
);
}
else
{
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"SWEEP_SRC,%s,Source Memory Types to sweep
\n
"
,
sweepSrc
.
c_str
());
printf
(
"SWEEP_EXE,%s,Executor Types to sweep
\n
"
,
sweepExe
.
c_str
());
printf
(
"SWEEP_DST,%s,Destination Memory Types to sweep
\n
"
,
sweepDst
.
c_str
());
printf
(
"SWEEP_SRC_IS_EXE,%d, Transfer executor %s Transfer source
\n
"
,
sweepSrcIsExe
,
sweepSrcIsExe
?
"must match"
:
"may have any"
);
printf
(
"SWEEP_SEED,%d,Random seed
\n
"
,
sweepSeed
);
printf
(
"SWEEP_MIN,%d,Min simultaneous Transfers
\n
"
,
sweepMin
);
printf
(
"SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)
\n
"
,
sweepMax
);
printf
(
"SWEEP_TEST_LIMIT,%d,Max number of tests to run during sweep (0 = no limit)
\n
"
,
sweepTestLimit
);
printf
(
"SWEEP_TIME_LIMIT,%d,Max number of seconds to run sweep for (0 = no limit)
\n
"
,
sweepTimeLimit
);
printf
(
"SWEEP_XGMI_MIN,%d,Min number of XGMI hops for Transfers
\n
"
,
sweepXgmiMin
);
printf
(
"SWEEP_XGMI_MAX,%d,Max number of XGMI hops for Transfers (-1 = no limit)
\n
"
,
sweepXgmiMax
);
printf
(
"SWEEP_RAND_BYTES,%d,Using %s number of bytes per Transfer
\n
"
,
sweepRandBytes
,
sweepRandBytes
?
"random"
:
"constant"
);
printf
(
"NUM_CPU_DEVICES,%d,Using %d CPU devices
\n
"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"NUM_CPU_PER_TRANSFER,%d,Using %d CPU thread(s) per CPU-executed Transfer
\n
"
,
numCpuPerTransfer
,
numCpuPerTransfer
);
printf
(
"NUM_GPU_DEVICES,%d,Using %d GPU devices
\n
"
,
numGpuDevices
,
numGpuDevices
);
printf
(
"BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy
\n
"
,
blockBytes
,
blockBytes
);
printf
(
"BYTE_OFFSET,%d,Using byte offset of %d
\n
"
,
byteOffset
,
byteOffset
);
printf
(
"FILL_PATTERN,%s,"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
if
(
fillPattern
.
size
())
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
else
printf
(
"Pseudo-random: (Element i = i modulo 383 + 31)"
);
printf
(
"
\n
"
);
printf
(
"NUM_ITERATIONS,%d,Running %d %s per Test
\n
"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test
\n
"
,
numWarmups
,
numWarmups
);
printf
(
"SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock
\n
"
,
sharedMemBytes
,
sharedMemBytes
);
printf
(
"USE_HIP_CALL,%d,Using %s for GPU-executed copies
\n
"
,
useHipCall
,
useHipCall
?
"HIP functions"
:
"custom kernels"
);
printf
(
"USE_PCIE_INDEX,%d,Using %s-based GPU indexing
\n
"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
printf
(
"USE_SINGLE_STREAM,%d,Using single stream per %s
\n
"
,
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
}
};
// Helper function that parses an environment variable, or falls back to a default value
...
...
TransferBench.cpp
View file @
811064b5
...
...
@@ -75,6 +75,7 @@ int main(int argc, char **argv)
// - Tests that sweep across possible sets of Transfers
if
(
!
strcmp
(
argv
[
1
],
"sweep"
)
||
!
strcmp
(
argv
[
1
],
"rsweep"
))
{
ev
.
configMode
=
CFG_SWEEP
;
RunSweepPreset
(
ev
,
numBytesPerTransfer
,
!
strcmp
(
argv
[
1
],
"rsweep"
));
exit
(
0
);
}
...
...
@@ -94,11 +95,13 @@ int main(int argc, char **argv)
int
skipCpu
=
(
!
strcmp
(
argv
[
1
],
"g2g"
)
||
!
strcmp
(
argv
[
1
],
"g2g_rr"
)
?
1
:
0
);
// Execute peer to peer benchmark mode
ev
.
configMode
=
CFG_P2P
;
RunPeerToPeerBenchmarks
(
ev
,
numBytesPerTransfer
/
sizeof
(
float
),
numBlocksToUse
,
readMode
,
skipCpu
);
exit
(
0
);
}
// Check that Transfer configuration file can be opened
ev
.
configMode
=
CFG_FILE
;
FILE
*
fp
=
fopen
(
argv
[
1
],
"r"
);
if
(
!
fp
)
{
...
...
@@ -110,8 +113,8 @@ int main(int argc, char **argv)
ev
.
DisplayEnvVars
();
if
(
ev
.
outputToCsv
)
{
printf
(
"Test,NumBytes,Src
Mem
,Exe
cutor
,Dst
Mem
,CUs,BW(GB/s),Time(ms),"
"
TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters
\n
"
);
printf
(
"Test
#,Transfer#
,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"
ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr
\n
"
);
}
int
testNum
=
0
;
...
...
@@ -233,19 +236,23 @@ void ExecuteTransfers(EnvVars const& ev,
// Loop over all the different number of bytes to use per Transfer
for
(
auto
N
:
valuesOfN
)
{
if
(
!
ev
.
outputToCsv
)
printf
(
"Test %d: [%lu bytes]
\n
"
,
testNum
,
N
*
sizeof
(
float
));
std
::
uniform_int_distribution
<
int
>
distribution
(
1
,
N
);
if
(
!
ev
.
outputToCsv
)
printf
(
"Test %d:
\n
"
,
testNum
);
// Prepare input memory and block parameters for current N
for
(
auto
&
exeInfoPair
:
transferMap
)
{
ExecutorInfo
&
exeInfo
=
exeInfoPair
.
second
;
exeInfo
.
totalBytes
=
0
;
int
transferOffset
=
0
;
for
(
int
i
=
0
;
i
<
exeInfo
.
transfers
.
size
();
++
i
)
{
// Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer
&
transfer
=
exeInfo
.
transfers
[
i
];
transfer
.
PrepareBlockParams
(
ev
,
N
);
transfer
.
numBytes
=
((
ev
.
configMode
==
CFG_SWEEP
&&
ev
.
sweepRandBytes
)
?
distribution
(
*
ev
.
generator
)
:
N
)
*
sizeof
(
float
);
transfer
.
PrepareBlockParams
(
ev
,
transfer
.
numBytes
/
sizeof
(
float
));
exeInfo
.
totalBytes
+=
transfer
.
numBytes
;
// Copy block parameters to GPU for GPU executors
if
(
transfer
.
exeMemType
==
MEM_GPU
)
...
...
@@ -286,7 +293,7 @@ void ExecuteTransfers(EnvVars const& ev,
int
const
numTransfersToRun
=
(
IsGpuType
(
exeInfoPair
.
first
.
first
)
&&
ev
.
useSingleStream
)
?
1
:
exeInfo
.
transfers
.
size
();
for
(
int
i
=
0
;
i
<
numTransfersToRun
;
++
i
)
threads
.
push
(
std
::
thread
(
RunTransfer
,
std
::
ref
(
ev
),
N
,
iteration
,
std
::
ref
(
exeInfo
),
i
));
threads
.
push
(
std
::
thread
(
RunTransfer
,
std
::
ref
(
ev
),
iteration
,
std
::
ref
(
exeInfo
),
i
));
}
// Wait for all threads to finish
...
...
@@ -317,13 +324,17 @@ void ExecuteTransfers(EnvVars const& ev,
}
// Validate that each transfer has transferred correctly
size_t
totalBytesTransferred
=
0
;
int
const
numTransfers
=
transferList
.
size
();
for
(
auto
transfer
:
transferList
)
CheckOrFill
(
MODE_CHECK
,
N
,
ev
.
useMemset
,
ev
.
useHipCall
,
ev
.
fillPattern
,
transfer
->
dstMem
+
initOffset
);
{
CheckOrFill
(
MODE_CHECK
,
transfer
->
numBytes
/
sizeof
(
float
),
ev
.
useMemset
,
ev
.
useHipCall
,
ev
.
fillPattern
,
transfer
->
dstMem
+
initOffset
);
totalBytesTransferred
+=
transfer
->
numBytes
;
}
// Report timings
totalCpuTime
=
totalCpuTime
/
(
1.0
*
numTimedIterations
)
*
1000
;
double
totalBandwidthGbs
=
(
numTransfers
*
N
*
sizeof
(
float
)
/
1.0E6
)
/
totalCpuTime
;
double
totalBandwidthGbs
=
(
totalBytesTransferred
/
1.0E6
)
/
totalCpuTime
;
double
maxGpuTime
=
0
;
if
(
ev
.
useSingleStream
)
...
...
@@ -343,56 +354,56 @@ void ExecuteTransfers(EnvVars const& ev,
}
double
exeDurationMsec
=
exeInfo
.
totalTime
/
(
1.0
*
numTimedIterations
);
double
exeBandwidthGbs
=
(
exeInfo
.
transfers
.
size
()
*
N
*
sizeof
(
float
)
/
1.0E9
)
/
exeDurationMsec
*
1000.0
f
;
double
exeBandwidthGbs
=
(
exeInfo
.
totalBytes
/
1.0E9
)
/
exeDurationMsec
*
1000.0
f
;
maxGpuTime
=
std
::
max
(
maxGpuTime
,
exeDurationMsec
);
if
(
!
ev
.
outputToCsv
)
{
printf
(
" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |
\n
"
,
MemTypeStr
[
exeMemType
],
exeIndex
,
exeInfo
.
transfers
.
size
(),
exeBandwidthGbs
,
exeDurationMsec
);
printf
(
" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |
%12lu bytes
\n
"
,
MemTypeStr
[
exeMemType
],
exeIndex
,
exeInfo
.
transfers
.
size
(),
exeBandwidthGbs
,
exeDurationMsec
,
exeInfo
.
totalBytes
);
}
int
totalCUs
=
0
;
for
(
auto
const
&
transfer
:
exeInfo
.
transfers
)
{
double
transferDurationMsec
=
transfer
.
transferTime
/
(
1.0
*
numTimedIterations
);
double
transferBandwidthGbs
=
(
N
*
sizeof
(
float
)
/
1.0E9
)
/
transferDurationMsec
*
1000.0
f
;
totalCUs
+=
transfer
.
exeMemType
==
MEM_CPU
?
ev
.
numCpuPerTransfer
:
transfer
.
numBlocksToUse
;
if
(
!
ev
.
outputToCsv
)
{
printf
(
" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d
\n
"
,
printf
(
" Transfer %02d | %9.3f GB/s | %8.3f ms |
%12lu bytes |
%c%02d -> %c%02d:(%03d) -> %c%02d
\n
"
,
transfer
.
transferIndex
,
transferBandwidthGbs
,
transferDurationMsec
,
transfer
.
numBytes
,
MemTypeStr
[
transfer
.
srcMemType
],
transfer
.
srcIndex
,
MemTypeStr
[
transfer
.
exeMemType
],
transfer
.
exeIndex
,
transfer
.
exeMemType
==
MEM_CPU
?
ev
.
numCpuPerTransfer
:
transfer
.
numBlocksToUse
,
MemTypeStr
[
transfer
.
dstMemType
],
transfer
.
dstIndex
);
}
else
{
printf
(
"%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%
p
,%p,%
d,%d,%lu
\n
"
,
testNum
,
N
*
sizeof
(
float
)
,
printf
(
"%d,%
d,%
lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%
s
,%p,%
p
\n
"
,
testNum
,
transfer
.
transferIndex
,
transfer
.
numBytes
,
MemTypeStr
[
transfer
.
srcMemType
],
transfer
.
srcIndex
,
MemTypeStr
[
transfer
.
exeMemType
],
transfer
.
exeIndex
,
MemTypeStr
[
transfer
.
dstMemType
],
transfer
.
dstIndex
,
transfer
.
exeMemType
==
MEM_CPU
?
ev
.
numCpuPerTransfer
:
transfer
.
numBlocksToUse
,
transferBandwidthGbs
,
transferDurationMsec
,
GetTransferDesc
(
transfer
).
c_str
(),
transfer
.
srcMem
+
initOffset
,
transfer
.
dstMem
+
initOffset
,
ev
.
byteOffset
,
ev
.
numWarmups
,
numTimedIterations
);
GetDesc
(
transfer
.
exeMemType
,
transfer
.
exeIndex
,
transfer
.
srcMemType
,
transfer
.
srcIndex
).
c_str
(),
GetDesc
(
transfer
.
exeMemType
,
transfer
.
exeIndex
,
transfer
.
dstMemType
,
transfer
.
dstIndex
).
c_str
(),
transfer
.
srcMem
+
initOffset
,
transfer
.
dstMem
+
initOffset
);
}
}
if
(
ev
.
outputToCsv
)
{
printf
(
"%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu
\n
"
,
testNum
,
N
*
sizeof
(
float
),
MemTypeStr
[
exeMemType
],
exeIndex
,
exeBandwidthGbs
,
exeDurationMsec
,
ev
.
byteOffset
,
ev
.
numWarmups
,
numTimedIterations
);
printf
(
"%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL,ALL,ALL
\n
"
,
testNum
,
totalBytesTransferred
,
MemTypeStr
[
exeMemType
],
exeIndex
,
totalCUs
,
exeBandwidthGbs
,
exeDurationMsec
);
}
}
}
...
...
@@ -401,32 +412,32 @@ void ExecuteTransfers(EnvVars const& ev,
for
(
auto
const
&
transfer
:
transferList
)
{
double
transferDurationMsec
=
transfer
->
transferTime
/
(
1.0
*
numTimedIterations
);
double
transferBandwidthGbs
=
(
N
*
sizeof
(
float
)
/
1.0E9
)
/
transferDurationMsec
*
1000.0
f
;
double
transferBandwidthGbs
=
(
transfer
->
numBytes
/
1.0E9
)
/
transferDurationMsec
*
1000.0
f
;
maxGpuTime
=
std
::
max
(
maxGpuTime
,
transferDurationMsec
);
if
(
!
ev
.
outputToCsv
)
{
printf
(
" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s
\n
"
,
printf
(
" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms |
%12lu bytes |
%-16s
\n
"
,
transfer
->
transferIndex
,
MemTypeStr
[
transfer
->
srcMemType
],
transfer
->
srcIndex
,
MemTypeStr
[
transfer
->
exeMemType
],
transfer
->
exeIndex
,
transfer
->
exeMemType
==
MEM_CPU
?
ev
.
numCpuPerTransfer
:
transfer
->
numBlocksToUse
,
MemTypeStr
[
transfer
->
dstMemType
],
transfer
->
dstIndex
,
transferBandwidthGbs
,
transferDurationMsec
,
transfer
->
numBytes
,
GetTransferDesc
(
*
transfer
).
c_str
());
}
else
{
printf
(
"%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%
p
,%p,%
d,%d,%lu
\n
"
,
testNum
,
N
*
sizeof
(
float
)
,
printf
(
"%d,%
d,%
lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%
s
,%p,%
p
\n
"
,
testNum
,
transfer
->
transferIndex
,
transfer
->
numBytes
,
MemTypeStr
[
transfer
->
srcMemType
],
transfer
->
srcIndex
,
MemTypeStr
[
transfer
->
exeMemType
],
transfer
->
exeIndex
,
MemTypeStr
[
transfer
->
dstMemType
],
transfer
->
dstIndex
,
transfer
->
exeMemType
==
MEM_CPU
?
ev
.
numCpuPerTransfer
:
transfer
->
numBlocksToUse
,
transferBandwidthGbs
,
transferDurationMsec
,
GetTransferDesc
(
*
transfer
).
c_str
(),
transfer
->
srcMem
+
initOffset
,
transfer
->
dstMem
+
initOffset
,
ev
.
byteOffset
,
ev
.
numWarmups
,
numTimedIterations
);
GetDesc
(
transfer
->
exeMemType
,
transfer
->
exeIndex
,
transfer
->
srcMemType
,
transfer
->
srcIndex
).
c_str
(),
GetDesc
(
transfer
->
exeMemType
,
transfer
->
exeIndex
,
transfer
->
dstMemType
,
transfer
->
dstIndex
).
c_str
(),
transfer
->
srcMem
+
initOffset
,
transfer
->
dstMem
+
initOffset
);
}
}
}
...
...
@@ -434,14 +445,13 @@ void ExecuteTransfers(EnvVars const& ev,
// Display aggregate statistics
if
(
!
ev
.
outputToCsv
)
{
printf
(
" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms
\n
"
,
totalBandwidthGbs
,
totalCpuTime
,
totalCpuTime
-
maxGpuTime
);
printf
(
" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms |
%12lu bytes |
Overhead: %.3f ms
\n
"
,
totalBandwidthGbs
,
totalCpuTime
,
totalBytesTransferred
,
totalCpuTime
-
maxGpuTime
);
}
else
{
printf
(
"%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu
\n
"
,
testNum
,
N
*
sizeof
(
float
),
totalBandwidthGbs
,
totalCpuTime
,
ev
.
byteOffset
,
ev
.
numWarmups
,
numTimedIterations
);
printf
(
"%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,ALL
\n
"
,
testNum
,
totalBytesTransferred
,
totalBandwidthGbs
,
totalCpuTime
);
}
}
...
...
@@ -991,7 +1001,7 @@ std::string GetTransferDesc(Transfer const& transfer)
+
GetDesc
(
transfer
.
exeMemType
,
transfer
.
exeIndex
,
transfer
.
dstMemType
,
transfer
.
dstIndex
);
}
void
RunTransfer
(
EnvVars
const
&
ev
,
size_t
const
N
,
int
const
iteration
,
void
RunTransfer
(
EnvVars
const
&
ev
,
int
const
iteration
,
ExecutorInfo
&
exeInfo
,
int
const
transferIdx
)
{
Transfer
&
transfer
=
exeInfo
.
transfers
[
transferIdx
];
...
...
@@ -1016,11 +1026,11 @@ void RunTransfer(EnvVars const& ev, size_t const N, int const iteration,
// Execute hipMemset / hipMemcpy
if
(
ev
.
useMemset
)
HIP_CALL
(
hipMemsetAsync
(
transfer
.
dstMem
+
initOffset
,
42
,
N
*
sizeof
(
float
)
,
stream
));
HIP_CALL
(
hipMemsetAsync
(
transfer
.
dstMem
+
initOffset
,
42
,
transfer
.
numBytes
,
stream
));
else
HIP_CALL
(
hipMemcpyAsync
(
transfer
.
dstMem
+
initOffset
,
transfer
.
srcMem
+
initOffset
,
N
*
sizeof
(
float
)
,
hipMemcpyDefault
,
transfer
.
numBytes
,
hipMemcpyDefault
,
stream
));
// Record stop event
HIP_CALL
(
hipEventRecord
(
stopEvent
,
stream
));
...
...
@@ -1259,7 +1269,7 @@ double GetPeakBandwidth(EnvVars const& ev,
{
// Perform timed iterations
for
(
int
i
=
0
;
i
<=
isBidirectional
;
i
++
)
threads
.
push
(
std
::
thread
(
RunTransfer
,
std
::
ref
(
ev
),
N
,
iteration
,
std
::
ref
(
exeInfo
[
i
]),
0
));
threads
.
push
(
std
::
thread
(
RunTransfer
,
std
::
ref
(
ev
),
iteration
,
std
::
ref
(
exeInfo
[
i
]),
0
));
// Wait for all threads to finish
for
(
int
i
=
0
;
i
<=
isBidirectional
;
i
++
)
...
...
@@ -1412,16 +1422,101 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
}
int
numDsts
=
dstList
.
size
();
int
const
numPossible
=
numSrcs
*
numExes
*
numDsts
;
int
maxParallelTransfers
=
(
ev
.
sweepMax
==
0
?
numPossible
:
ev
.
sweepMax
);
if
(
ev
.
sweepSrcIsExe
)
// Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
// One candidate (SRC -> EXE -> DST) triplet that may be selected for a sweep test.
// Entries are collected in possibleTransfers and later copied field-by-field
// into a Transfer.
struct TransferInfo
{
  MemType srcMemType;  // Source memory type
  int     srcIndex;    // Source device index
  MemType exeMemType;  // Executor type
  int     exeIndex;    // Executor device index
  MemType dstMemType;  // Destination memory type
  int     dstIndex;    // Destination device index
};
// If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI
bool
const
useXgmiOnly
=
(
ev
.
sweepXgmiMin
>
0
||
ev
.
sweepXgmiMax
>
0
);
std
::
vector
<
TransferInfo
>
possibleTransfers
;
TransferInfo
tinfo
;
for
(
int
i
=
0
;
i
<
numExes
;
++
i
)
{
// Skip CPU executors if XGMI link must be used
if
(
useXgmiOnly
&&
!
IsGpuType
(
exeList
[
i
].
first
))
continue
;
tinfo
.
exeMemType
=
exeList
[
i
].
first
;
tinfo
.
exeIndex
=
exeList
[
i
].
second
;
bool
isXgmiSrc
=
false
;
int
numHopsSrc
=
0
;
for
(
int
j
=
0
;
j
<
numSrcs
;
++
j
)
{
printf
(
"Num possible (SRC/DST) triplets: (%d/%d) = %d
\n
"
,
numSrcs
,
numDsts
,
numPossible
);
if
(
IsGpuType
(
exeList
[
i
].
first
)
&&
IsGpuType
(
srcList
[
j
].
first
))
{
if
(
exeList
[
i
].
second
!=
srcList
[
j
].
second
)
{
uint32_t
exeToSrcLinkType
,
exeToSrcHopCount
;
HIP_CALL
(
hipExtGetLinkTypeAndHopCount
(
RemappedIndex
(
exeList
[
i
].
second
,
MEM_GPU
),
RemappedIndex
(
srcList
[
j
].
second
,
MEM_GPU
),
&
exeToSrcLinkType
,
&
exeToSrcHopCount
));
isXgmiSrc
=
(
exeToSrcLinkType
==
HSA_AMD_LINK_INFO_TYPE_XGMI
);
if
(
isXgmiSrc
)
numHopsSrc
=
exeToSrcHopCount
;
}
else
{
printf
(
"Num possible (SRC/EXE/DST) triplets: (%d/%d/%d) = %d
\n
"
,
numSrcs
,
numExes
,
numDsts
,
numPossible
);
isXgmiSrc
=
true
;
numHopsSrc
=
0
;
}
// Skip this SRC if it is not XGMI but only XGMI links may be used
if
(
useXgmiOnly
&&
!
isXgmiSrc
)
continue
;
// Skip this SRC if XGMI distance is already past limit
if
(
ev
.
sweepXgmiMax
>=
0
&&
isXgmiSrc
&&
numHopsSrc
>
ev
.
sweepXgmiMax
)
continue
;
}
else
if
(
useXgmiOnly
)
continue
;
tinfo
.
srcMemType
=
srcList
[
j
].
first
;
tinfo
.
srcIndex
=
srcList
[
j
].
second
;
bool
isXgmiDst
=
false
;
int
numHopsDst
=
0
;
for
(
int
k
=
0
;
k
<
numDsts
;
++
k
)
{
if
(
IsGpuType
(
exeList
[
i
].
first
)
&&
IsGpuType
(
dstList
[
k
].
first
))
{
if
(
exeList
[
i
].
second
!=
dstList
[
k
].
second
)
{
uint32_t
exeToDstLinkType
,
exeToDstHopCount
;
HIP_CALL
(
hipExtGetLinkTypeAndHopCount
(
RemappedIndex
(
exeList
[
i
].
second
,
MEM_GPU
),
RemappedIndex
(
dstList
[
k
].
second
,
MEM_GPU
),
&
exeToDstLinkType
,
&
exeToDstHopCount
));
isXgmiDst
=
(
exeToDstLinkType
==
HSA_AMD_LINK_INFO_TYPE_XGMI
);
if
(
isXgmiDst
)
numHopsDst
=
exeToDstHopCount
;
}
else
{
isXgmiDst
=
true
;
numHopsDst
=
0
;
}
}
// Skip this DST if it is not XGMI but only XGMI links may be used
if
(
useXgmiOnly
&&
!
isXgmiDst
)
continue
;
// Skip this DST if total XGMI distance (SRC + DST) is less than min limit
if
(
ev
.
sweepXgmiMin
>
0
&&
(
numHopsSrc
+
numHopsDst
<
ev
.
sweepXgmiMin
))
continue
;
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if
(
ev
.
sweepXgmiMax
>=
0
&&
(
numHopsSrc
+
numHopsDst
)
>
ev
.
sweepXgmiMax
)
continue
;
tinfo
.
dstMemType
=
dstList
[
k
].
first
;
tinfo
.
dstIndex
=
dstList
[
k
].
second
;
possibleTransfers
.
push_back
(
tinfo
);
}
}
}
int
const
numPossible
=
(
int
)
possibleTransfers
.
size
();
int
maxParallelTransfers
=
(
ev
.
sweepMax
==
0
?
numPossible
:
ev
.
sweepMax
);
if
(
ev
.
sweepMin
>
numPossible
)
{
...
...
@@ -1429,11 +1524,16 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
return
;
}
if
(
ev
.
outputToCsv
)
{
printf
(
"
\n
Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr
\n
"
);
}
int
numTestsRun
=
0
;
int
M
=
ev
.
sweepMin
;
// Create bitmask of numPossible triplets, of which M will be chosen
std
::
string
bitmask
(
M
,
1
);
bitmask
.
resize
(
numPossible
,
0
);
auto
rng
=
std
::
default_random_engine
{};
auto
cpuStart
=
std
::
chrono
::
high_resolution_clock
::
now
();
while
(
1
)
{
...
...
@@ -1447,7 +1547,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
// Generate a random bitmask
for
(
int
i
=
0
;
i
<
numPossible
;
i
++
)
bitmask
[
i
]
=
(
i
<
M
)
?
1
:
0
;
std
::
shuffle
(
bitmask
.
begin
(),
bitmask
.
end
(),
rng
);
std
::
shuffle
(
bitmask
.
begin
(),
bitmask
.
end
(),
*
ev
.
generator
);
}
// Convert bitmask to list of Transfers
...
...
@@ -1458,15 +1558,12 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
{
// Convert integer value to (SRC->EXE->DST) triplet
Transfer
transfer
;
int
srcValue
=
value
/
numDsts
/
numExes
;
int
exeValue
=
value
/
numDsts
%
numExes
;
int
dstValue
=
value
%
numDsts
;
transfer
.
srcMemType
=
srcList
[
srcValue
].
first
;
transfer
.
srcIndex
=
srcList
[
srcValue
].
second
;
transfer
.
exeMemType
=
ev
.
sweepSrcIsExe
?
transfer
.
srcMemType
:
exeList
[
exeValue
].
first
;
transfer
.
exeIndex
=
ev
.
sweepSrcIsExe
?
transfer
.
srcIndex
:
exeList
[
exeValue
].
second
;
transfer
.
dstMemType
=
dstList
[
dstValue
].
first
;
transfer
.
dstIndex
=
dstList
[
dstValue
].
second
;
transfer
.
srcMemType
=
possibleTransfers
[
value
].
srcMemType
;
transfer
.
srcIndex
=
possibleTransfers
[
value
].
srcIndex
;
transfer
.
exeMemType
=
possibleTransfers
[
value
].
exeMemType
;
transfer
.
exeIndex
=
possibleTransfers
[
value
].
exeIndex
;
transfer
.
dstMemType
=
possibleTransfers
[
value
].
dstMemType
;
transfer
.
dstIndex
=
possibleTransfers
[
value
].
dstIndex
;
transfer
.
numBlocksToUse
=
IsGpuType
(
transfer
.
exeMemType
)
?
4
:
ev
.
numCpuPerTransfer
;
transfer
.
transferIndex
=
transfers
.
size
();
transfers
.
push_back
(
transfer
);
...
...
TransferBench.hpp
View file @
811064b5
...
...
@@ -111,6 +111,7 @@ struct Transfer
MemType
dstMemType
;
// Destination memory type
int
dstIndex
;
// Destination device index
int
numBlocksToUse
;
// Number of threadblocks to use for this Transfer
size_t
numBytes
;
// Number of bytes to Transfer
// Memory
float
*
srcMem
;
// Source memory
...
...
@@ -132,6 +133,7 @@ typedef std::pair<MemType, int> Executor;
struct
ExecutorInfo
{
std
::
vector
<
Transfer
>
transfers
;
// Transfers to execute
size_t
totalBytes
;
// Total bytes this executor transfers
// For GPU-Executors
int
totalBlocks
;
// Total number of CUs/CPU threads to use
...
...
@@ -170,7 +172,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
void
DeallocateMemory
(
MemType
memType
,
void
*
memPtr
);
void
CheckPages
(
char
*
byteArray
,
size_t
numBytes
,
int
targetId
);
void
CheckOrFill
(
ModeType
mode
,
int
N
,
bool
isMemset
,
bool
isHipCall
,
std
::
vector
<
float
>
const
&
fillPattern
,
float
*
ptr
);
void
RunTransfer
(
EnvVars
const
&
ev
,
size_t
const
N
,
int
const
iteration
,
ExecutorInfo
&
exeInfo
,
int
const
transferIdx
);
void
RunTransfer
(
EnvVars
const
&
ev
,
int
const
iteration
,
ExecutorInfo
&
exeInfo
,
int
const
transferIdx
);
void
RunPeerToPeerBenchmarks
(
EnvVars
const
&
ev
,
size_t
N
,
int
numBlocksToUse
,
int
readMode
,
int
skipCpu
);
void
RunSweepPreset
(
EnvVars
const
&
ev
,
size_t
const
numBytesPerTransfer
,
bool
const
isRandom
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment