Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
5901ce0e
Unverified
Commit
5901ce0e
authored
Mar 30, 2023
by
gilbertlee-amd
Committed by
GitHub
Mar 30, 2023
Browse files
Adding direct destination mem validation, env var refactor (#19)
parent
e6f64e97
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
109 additions
and
193 deletions
+109
-193
CHANGELOG.md
CHANGELOG.md
+8
-0
src/TransferBench.cpp
src/TransferBench.cpp
+15
-3
src/include/EnvVars.hpp
src/include/EnvVars.hpp
+86
-190
No files found.
CHANGELOG.md
View file @
5901ce0e
# Changelog for TransferBench
# Changelog for TransferBench
## v1.18
### Added
-
Adding ability to validate GPU destination memory directly without going through CPU staging buffer (VALIDATE_DIRECT)
-
NOTE: This will only work on AMD devices with large-BAR access enabled and may slow things down considerably
### Changed
-
Refactored how environment variables are displayed
-
Mismatch checking stops after the first detected error within an array instead of listing all mismatched elements
## v1.17
## v1.17
### Added
### Added
-
Allow switch to GFX kernel for source array initialization (USE_PREP_KERNEL)
-
Allow switch to GFX kernel for source array initialization (USE_PREP_KERNEL)
...
...
src/TransferBench.cpp
View file @
5901ce0e
...
@@ -1215,7 +1215,17 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
...
@@ -1215,7 +1215,17 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
ev
.
useRemoteRead
?
"Local"
:
"Remote"
,
ev
.
useRemoteRead
?
"Local"
:
"Remote"
,
ev
.
useDmaCopy
?
"DMA"
:
"GFX"
);
ev
.
useDmaCopy
?
"DMA"
:
"GFX"
);
printf
(
"%10s"
,
"SRC
\\
DST"
);
if
(
isBidirectional
)
{
printf
(
"%12s"
,
"SRC
\\
DST"
);
}
else
{
if
(
ev
.
useRemoteRead
)
printf
(
"%12s"
,
"SRC
\\
EXE+DST"
);
else
printf
(
"%12s"
,
"SRC+EXE
\\
DST"
);
}
for
(
int
i
=
0
;
i
<
numCpus
;
i
++
)
printf
(
"%7s %02d"
,
"CPU"
,
i
);
for
(
int
i
=
0
;
i
<
numCpus
;
i
++
)
printf
(
"%7s %02d"
,
"CPU"
,
i
);
for
(
int
i
=
0
;
i
<
numGpus
;
i
++
)
printf
(
"%7s %02d"
,
"GPU"
,
i
);
for
(
int
i
=
0
;
i
<
numGpus
;
i
++
)
printf
(
"%7s %02d"
,
"GPU"
,
i
);
printf
(
"
\n
"
);
printf
(
"
\n
"
);
...
@@ -1228,7 +1238,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
...
@@ -1228,7 +1238,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
int
const
srcIndex
=
(
srcType
==
MEM_CPU
?
src
:
src
-
numCpus
);
int
const
srcIndex
=
(
srcType
==
MEM_CPU
?
src
:
src
-
numCpus
);
if
(
!
ev
.
outputToCsv
)
if
(
!
ev
.
outputToCsv
)
printf
(
"%
7
s %02d"
,
(
srcType
==
MEM_CPU
)
?
"CPU"
:
"GPU"
,
srcIndex
);
printf
(
"%
9
s %02d"
,
(
srcType
==
MEM_CPU
)
?
"CPU"
:
"GPU"
,
srcIndex
);
for
(
int
dst
=
0
;
dst
<
numDevices
;
dst
++
)
for
(
int
dst
=
0
;
dst
<
numDevices
;
dst
++
)
{
{
...
@@ -1482,7 +1492,7 @@ void Transfer::ValidateDst(EnvVars const& ev)
...
@@ -1482,7 +1492,7 @@ void Transfer::ValidateDst(EnvVars const& ev)
for
(
int
dstIdx
=
0
;
dstIdx
<
this
->
numDsts
;
++
dstIdx
)
for
(
int
dstIdx
=
0
;
dstIdx
<
this
->
numDsts
;
++
dstIdx
)
{
{
float
*
output
;
float
*
output
;
if
(
IsCpuType
(
this
->
dstType
[
dstIdx
]))
if
(
IsCpuType
(
this
->
dstType
[
dstIdx
])
||
ev
.
validateDirect
)
{
{
output
=
this
->
dstMem
[
dstIdx
]
+
initOffset
;
output
=
this
->
dstMem
[
dstIdx
]
+
initOffset
;
}
}
...
@@ -1525,6 +1535,8 @@ void Transfer::ValidateDst(EnvVars const& ev)
...
@@ -1525,6 +1535,8 @@ void Transfer::ValidateDst(EnvVars const& ev)
this
->
DstToStr
().
c_str
());
this
->
DstToStr
().
c_str
());
if
(
!
ev
.
continueOnError
)
if
(
!
ev
.
continueOnError
)
exit
(
1
);
exit
(
1
);
else
break
;
}
}
}
}
}
}
...
...
src/include/EnvVars.hpp
View file @
5901ce0e
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Compatibility.hpp"
#include "Kernels.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.1
7
"
#define TB_VERSION "1.1
8
"
extern
char
const
MemTypeStr
[];
extern
char
const
MemTypeStr
[];
extern
char
const
ExeTypeStr
[];
extern
char
const
ExeTypeStr
[];
...
@@ -77,6 +77,7 @@ public:
...
@@ -77,6 +77,7 @@ public:
int
usePcieIndexing
;
// Base GPU indexing on PCIe address instead of HIP device
int
usePcieIndexing
;
// Base GPU indexing on PCIe address instead of HIP device
int
usePrepSrcKernel
;
// Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
int
usePrepSrcKernel
;
// Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
int
useSingleStream
;
// Use a single stream per GPU GFX executor instead of stream per Transfer
int
useSingleStream
;
// Use a single stream per GPU GFX executor instead of stream per Transfer
int
validateDirect
;
// Validate GPU destination memory directly instead of staging GPU memory on host
std
::
vector
<
float
>
fillPattern
;
// Pattern of floats used to fill source data
std
::
vector
<
float
>
fillPattern
;
// Pattern of floats used to fill source data
...
@@ -156,6 +157,7 @@ public:
...
@@ -156,6 +157,7 @@ public:
usePcieIndexing
=
GetEnvVar
(
"USE_PCIE_INDEX"
,
0
);
usePcieIndexing
=
GetEnvVar
(
"USE_PCIE_INDEX"
,
0
);
usePrepSrcKernel
=
GetEnvVar
(
"USE_PREP_KERNEL"
,
0
);
usePrepSrcKernel
=
GetEnvVar
(
"USE_PREP_KERNEL"
,
0
);
useSingleStream
=
GetEnvVar
(
"USE_SINGLE_STREAM"
,
0
);
useSingleStream
=
GetEnvVar
(
"USE_SINGLE_STREAM"
,
0
);
validateDirect
=
GetEnvVar
(
"VALIDATE_DIRECT"
,
0
);
enableDebug
=
GetEnvVar
(
"DEBUG"
,
0
);
enableDebug
=
GetEnvVar
(
"DEBUG"
,
0
);
gpuKernel
=
GetEnvVar
(
"GPU_KERNEL"
,
defaultGpuKernel
);
gpuKernel
=
GetEnvVar
(
"GPU_KERNEL"
,
defaultGpuKernel
);
...
@@ -382,218 +384,112 @@ public:
...
@@ -382,218 +384,112 @@ public:
printf
(
" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing
\n
"
);
printf
(
" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing
\n
"
);
printf
(
" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern
\n
"
);
printf
(
" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern
\n
"
);
printf
(
" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer
\n
"
);
printf
(
" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer
\n
"
);
printf
(
" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host
\n
"
);
}
}
// Helper macro to switch between CSV and terminal output
#define PRINT_EV(NAME, VALUE, DESCRIPTION) \
printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
#define PRINT_ES(NAME, VALUE, DESCRIPTION) \
printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
// Display env var settings
// Display env var settings
void
DisplayEnvVars
()
const
void
DisplayEnvVars
()
const
{
{
if
(
!
outputToCsv
)
if
(
!
outputToCsv
)
{
{
printf
(
"
Run configuration (
TransferBench v%s
)
\n
"
,
TB_VERSION
);
printf
(
"TransferBench v%s
\n
"
,
TB_VERSION
);
printf
(
"=====================================================
\n
"
);
printf
(
"=====================================================
\n
"
);
printf
(
"%-20s = %12d : Each CU gets a multiple of %d bytes to copy
\n
"
,
"BLOCK_BYTES"
,
blockBytes
,
blockBytes
);
printf
(
"[Common]
\n
"
);
printf
(
"%-20s = %12d : Using byte offset of %d
\n
"
,
"BYTE_OFFSET"
,
byteOffset
,
byteOffset
);
printf
(
"%-20s = %12d : Continue on error
\n
"
,
"CONTINUE_ON_ERROR"
,
continueOnError
);
printf
(
"%-20s = %12s : "
,
"FILL_PATTERN"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
if
(
fillPattern
.
size
())
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
else
printf
(
"Pseudo-random: %s"
,
PrepSrcValueString
().
c_str
());
printf
(
"
\n
"
);
printf
(
"%-20s = %12d : Using GPU kernel %d [%s]
\n
"
,
"GPU_KERNEL"
,
gpuKernel
,
gpuKernel
,
GpuKernelNames
[
gpuKernel
].
c_str
());
printf
(
"%-20s = %12d : Using %d CPU devices
\n
"
,
"NUM_CPU_DEVICES"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"%-20s = %12d : Using %d GPU devices
\n
"
,
"NUM_GPU_DEVICES"
,
numGpuDevices
,
numGpuDevices
);
printf
(
"%-20s = %12d : Running %d %s per Test
\n
"
,
"NUM_ITERATIONS"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"%-20s = %12d : Running %d warmup iteration(s) per Test
\n
"
,
"NUM_WARMUPS"
,
numWarmups
,
numWarmups
);
printf
(
"%-20s = %12d : Output to %s
\n
"
,
"OUTPUT_TO_CSV"
,
outputToCsv
,
outputToCsv
?
"CSV"
:
"console"
);
printf
(
"%-20s = %12s : Using %d shared mem per threadblock
\n
"
,
"SHARED_MEM_BYTES"
,
getenv
(
"SHARED_MEM_BYTES"
)
?
"(specified)"
:
"(unset)"
,
sharedMemBytes
);
printf
(
"%-20s = %12d : Running in %s mode
\n
"
,
"USE_INTERACTIVE"
,
useInteractive
,
useInteractive
?
"interactive"
:
"non-interactive"
);
printf
(
"%-20s = %12d : Using %s-based GPU indexing
\n
"
,
"USE_PCIE_INDEX"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
printf
(
"%-20s = %12d : Using %s to initialize source data
\n
"
,
"USE_PREP_KERNEL"
,
usePrepSrcKernel
,
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
));
printf
(
"%-20s = %12d : Using single stream per %s
\n
"
,
"USE_SINGLE_STREAM"
,
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
printf
(
"
\n
"
);
}
}
else
else
{
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy
\n
"
,
blockBytes
,
blockBytes
);
printf
(
"BYTE_OFFSET,%d,Using byte offset of %d
\n
"
,
byteOffset
,
byteOffset
);
PRINT_EV
(
"BLOCK_BYTES"
,
blockBytes
,
printf
(
"CONTINUE_ON_ERROR,%d,Continue test on mismatch error
\n
"
,
continueOnError
);
std
::
string
(
"Each CU gets a multiple of "
+
std
::
to_string
(
blockBytes
)
+
" bytes to copy"
));
printf
(
"FILL_PATTERN,%s,"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
PRINT_EV
(
"BYTE_OFFSET"
,
byteOffset
,
if
(
fillPattern
.
size
())
std
::
string
(
"Using byte offset of "
+
std
::
to_string
(
byteOffset
)));
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
PRINT_EV
(
"CONTINUE_ON_ERROR"
,
continueOnError
,
else
std
::
string
(
continueOnError
?
"Continue on mismatch error"
:
"Stop after first error"
));
printf
(
"Pseudo-random: %s"
,
PrepSrcValueString
().
c_str
());
PRINT_EV
(
"FILL_PATTERN"
,
getenv
(
"FILL_PATTERN"
)
?
1
:
0
,
printf
(
"
\n
"
);
(
fillPattern
.
size
()
?
std
::
string
(
getenv
(
"FILL_PATTERN"
))
:
PrepSrcValueString
()));
printf
(
"NUM_CPU_DEVICES,%d,Using %d CPU devices
\n
"
,
numCpuDevices
,
numCpuDevices
);
PRINT_EV
(
"GPU_KERNEL"
,
gpuKernel
,
printf
(
"NUM_GPU_DEVICES,%d,Using %d GPU devices
\n
"
,
numGpuDevices
,
numGpuDevices
);
std
::
string
(
"Using GPU kernel "
)
+
std
::
to_string
(
gpuKernel
)
+
" ["
+
std
::
string
(
GpuKernelNames
[
gpuKernel
])
+
"]"
);
printf
(
"NUM_ITERATIONS,%d,Running %d %s per Test
\n
"
,
numIterations
,
PRINT_EV
(
"NUM_CPU_DEVICES"
,
numCpuDevices
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
std
::
string
(
"Using "
)
+
std
::
to_string
(
numCpuDevices
)
+
" CPU devices"
);
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
PRINT_EV
(
"NUM_GPU_DEVICES"
,
numGpuDevices
,
printf
(
"NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test
\n
"
,
numWarmups
,
numWarmups
);
std
::
string
(
"Using "
)
+
std
::
to_string
(
numGpuDevices
)
+
" GPU devices"
);
printf
(
"SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock
\n
"
,
sharedMemBytes
,
sharedMemBytes
);
PRINT_EV
(
"NUM_ITERATIONS"
,
numIterations
,
printf
(
"USE_PCIE_INDEX,%d,Using %s-based GPU indexing
\n
"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
std
::
string
(
"Running "
)
+
std
::
to_string
(
numIterations
>
0
?
numIterations
:
-
numIterations
)
+
" "
printf
(
"USE_PREP_KERNEL,%d,Using %s to initialize source data
\n
"
,
+
(
numIterations
>
0
?
" timed iteration(s)"
:
"seconds(s) per Test"
));
usePrepSrcKernel
,
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
));
PRINT_EV
(
"NUM_WARMUPS"
,
numWarmups
,
printf
(
"USE_SINGLE_STREAM,%d,Using single stream per %s
\n
"
,
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
std
::
string
(
"Running "
+
std
::
to_string
(
numWarmups
)
+
" warmup iteration(s) per Test"
));
}
PRINT_EV
(
"SHARED_MEM_BYTES"
,
sharedMemBytes
,
std
::
string
(
"Using "
+
std
::
to_string
(
sharedMemBytes
)
+
" shared mem per threadblock"
));
PRINT_EV
(
"USE_INTERACTIVE"
,
useInteractive
,
std
::
string
(
"Running in "
)
+
(
useInteractive
?
"interactive"
:
"non-interactive"
)
+
" mode"
);
PRINT_EV
(
"USE_PCIE_INDEX"
,
usePcieIndexing
,
std
::
string
(
"Use "
)
+
(
usePcieIndexing
?
"PCIe"
:
"HIP"
)
+
" GPU device indexing"
);
PRINT_EV
(
"USE_PREP_KERNEL"
,
usePrepSrcKernel
,
std
::
string
(
"Using "
)
+
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
)
+
" to initialize source data"
);
PRINT_EV
(
"USE_SINGLE_STREAM"
,
useSingleStream
,
std
::
string
(
"Using single stream per "
)
+
(
useSingleStream
?
"device"
:
"Transfer"
));
PRINT_EV
(
"VALIDATE_DIRECT"
,
validateDirect
,
std
::
string
(
"Validate GPU destination memory "
)
+
(
validateDirect
?
"directly"
:
"via CPU staging buffer"
));
printf
(
"
\n
"
);
};
};
// Display env var for P2P Benchmark preset
// Display env var for P2P Benchmark preset
void
DisplayP2PBenchmarkEnvVars
()
const
void
DisplayP2PBenchmarkEnvVars
()
const
{
{
DisplayEnvVars
();
if
(
!
outputToCsv
)
if
(
!
outputToCsv
)
{
printf
(
"[P2P Related]
\n
"
);
printf
(
"Peer-to-peer Benchmark configuration (TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"=====================================================
\n
"
);
PRINT_EV
(
"NUM_CPU_SE"
,
numCpuSubExecs
,
printf
(
"%-20s = %12d : Using %s as executor
\n
"
,
"USE_REMOTE_READ"
,
useRemoteRead
,
useRemoteRead
?
"DST"
:
"SRC"
);
std
::
string
(
"Using "
)
+
std
::
to_string
(
numCpuSubExecs
)
+
" CPU subexecutors"
);
printf
(
"%-20s = %12d : Using GPU-%s as GPU executor
\n
"
,
"USE_GPU_DMA"
,
useDmaCopy
,
useDmaCopy
?
"DMA"
:
"GFX"
);
PRINT_EV
(
"NUM_GPU_SE"
,
numGpuSubExecs
,
printf
(
"%-20s = %12d : Using %d CPU subexecutors
\n
"
,
"NUM_CPU_SE"
,
numCpuSubExecs
,
numCpuSubExecs
);
std
::
string
(
"Using "
)
+
std
::
to_string
(
numGpuSubExecs
)
+
" GPU subexecutors"
);
printf
(
"%-20s = %12d : Using %d GPU subexecutors
\n
"
,
"NUM_GPU_SE"
,
numGpuSubExecs
,
numGpuSubExecs
);
PRINT_EV
(
"USE_GPU_DMA"
,
useDmaCopy
,
std
::
string
(
"Using GPU-"
)
+
(
useDmaCopy
?
"DMA"
:
"GFX"
)
+
" as GPU executor"
);
printf
(
"%-20s = %12d : Each CU gets a multiple of %d bytes to copy
\n
"
,
"BLOCK_BYTES"
,
blockBytes
,
blockBytes
);
PRINT_EV
(
"USE_REMOTE_READ"
,
useRemoteRead
,
printf
(
"%-20s = %12d : Using byte offset of %d
\n
"
,
"BYTE_OFFSET"
,
byteOffset
,
byteOffset
);
std
::
string
(
"Using "
)
+
(
useRemoteRead
?
"DST"
:
"SRC"
)
+
" as executor"
);
printf
(
"%-20s = %12d : Continue on error
\n
"
,
"CONTINUE_ON_ERROR"
,
continueOnError
);
printf
(
"
\n
"
);
printf
(
"%-20s = %12s : "
,
"FILL_PATTERN"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
if
(
fillPattern
.
size
())
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
else
printf
(
"Pseudo-random: %s"
,
PrepSrcValueString
().
c_str
());
printf
(
"
\n
"
);
printf
(
"%-20s = %12d : Using %d CPU devices
\n
"
,
"NUM_CPU_DEVICES"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"%-20s = %12d : Using %d GPU devices
\n
"
,
"NUM_GPU_DEVICES"
,
numGpuDevices
,
numGpuDevices
);
printf
(
"%-20s = %12d : Running %d %s per Test
\n
"
,
"NUM_ITERATIONS"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"%-20s = %12d : Running %d warmup iteration(s) per Test
\n
"
,
"NUM_WARMUPS"
,
numWarmups
,
numWarmups
);
printf
(
"%-20s = %12s : Using %d shared mem per threadblock
\n
"
,
"SHARED_MEM_BYTES"
,
getenv
(
"SHARED_MEM_BYTES"
)
?
"(specified)"
:
"(unset)"
,
sharedMemBytes
);
printf
(
"%-20s = %12d : Running in %s mode
\n
"
,
"USE_INTERACTIVE"
,
useInteractive
,
useInteractive
?
"interactive"
:
"non-interactive"
);
printf
(
"%-20s = %12d : Using %s-based GPU indexing
\n
"
,
"USE_PCIE_INDEX"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
printf
(
"%-20s = %12d : Using %s to initialize source data
\n
"
,
"USE_PREP_KERNEL"
,
usePrepSrcKernel
,
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
));
printf
(
"
\n
"
);
}
else
{
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"USE_REMOTE_READ,%d,Using %s as executor
\n
"
,
useRemoteRead
,
useRemoteRead
?
"DST"
:
"SRC"
);
printf
(
"USE_GPU_DMA,%d,Using GPU-%s as GPU executor
\n
"
,
useDmaCopy
,
useDmaCopy
?
"DMA"
:
"GFX"
);
printf
(
"NUM_CPU_SE,%d,Using %d CPU subexecutors
\n
"
,
numCpuSubExecs
,
numCpuSubExecs
);
printf
(
"NUM_GPU_SE,%d,Using %d GPU subexecutors
\n
"
,
numGpuSubExecs
,
numGpuSubExecs
);
printf
(
"BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy
\n
"
,
blockBytes
,
blockBytes
);
printf
(
"BYTE_OFFSET,%d,Using byte offset of %d
\n
"
,
byteOffset
,
byteOffset
);
printf
(
"FILL_PATTERN,%s,"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
if
(
fillPattern
.
size
())
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
else
printf
(
"Pseudo-random: %s"
,
PrepSrcValueString
().
c_str
());
printf
(
"
\n
"
);
printf
(
"NUM_CPU_DEVICES,%d,Using %d CPU devices
\n
"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"NUM_GPU_DEVICES,%d,Using %d GPU devices
\n
"
,
numGpuDevices
,
numGpuDevices
);
printf
(
"NUM_ITERATIONS,%d,Running %d %s per Test
\n
"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test
\n
"
,
numWarmups
,
numWarmups
);
printf
(
"SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock
\n
"
,
sharedMemBytes
,
sharedMemBytes
);
printf
(
"USE_PCIE_INDEX,%d,Using %s-based GPU indexing
\n
"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
printf
(
"USE_SINGLE_STREAM,%d,Using single stream per %s
\n
"
,
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
printf
(
"USE_PREP_KERNEL,%d,Using %s to initialize source data
\n
"
,
usePrepSrcKernel
,
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
));
printf
(
"
\n
"
);
}
}
}
// Display env var settings
// Display env var settings
void
DisplaySweepEnvVars
()
const
void
DisplaySweepEnvVars
()
const
{
{
DisplayEnvVars
();
if
(
!
outputToCsv
)
if
(
!
outputToCsv
)
{
printf
(
"[Sweep Related]
\n
"
);
printf
(
"Sweep configuration (TransferBench v%s)
\n
"
,
TB_VERSION
);
PRINT_ES
(
"SWEEP_DST"
,
sweepDst
.
c_str
(),
printf
(
"=====================================================
\n
"
);
std
::
string
(
"Destination Memory Types to sweep"
));
printf
(
"%-20s = %12d : Random seed
\n
"
,
"SWEEP_SEED"
,
sweepSeed
);
PRINT_ES
(
"SWEEP_EXE"
,
sweepExe
.
c_str
(),
printf
(
"%-20s = %12s : Source Memory Types to sweep
\n
"
,
"SWEEP_SRC"
,
sweepSrc
.
c_str
());
std
::
string
(
"Executor Types to sweep"
));
printf
(
"%-20s = %12s : Executor Types to sweep
\n
"
,
"SWEEP_EXE"
,
sweepExe
.
c_str
());
PRINT_EV
(
"SWEEP_MAX"
,
sweepMax
,
printf
(
"%-20s = %12s : Destination Memory Types to sweep
\n
"
,
"SWEEP_DST"
,
sweepDst
.
c_str
());
std
::
string
(
"Max simultaneous transfers (0 = no limit)"
));
printf
(
"%-20s = %12d : Min simultaneous Transfers
\n
"
,
"SWEEP_MIN"
,
sweepMin
);
PRINT_EV
(
"SWEEP_MIN"
,
sweepMin
,
printf
(
"%-20s = %12d : Max simultaneous Transfers (0 = no limit)
\n
"
,
"SWEEP_MAX"
,
sweepMax
);
std
::
string
(
"Min simultaenous transfers"
));
printf
(
"%-20s = %12d : Max number of tests to run during sweep (0 = no limit)
\n
"
,
"SWEEP_TEST_LIMIT"
,
sweepTestLimit
);
PRINT_EV
(
"SWEEP_RAND_BYTES"
,
sweepRandBytes
,
printf
(
"%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)
\n
"
,
"SWEEP_TIME_LIMIT"
,
sweepTimeLimit
);
std
::
string
(
"Using "
)
+
(
sweepRandBytes
?
"random"
:
"constant"
)
+
" number of bytes per Transfer"
);
printf
(
"%-20s = %12d : Min number of XGMI hops for Transfers
\n
"
,
"SWEEP_XGMI_MIN"
,
sweepXgmiMin
);
PRINT_EV
(
"SWEEP_SEED"
,
sweepSeed
,
printf
(
"%-20s = %12d : Max number of XGMI hops for Transfers (-1 = no limit)
\n
"
,
"SWEEP_XGMI_MAX"
,
sweepXgmiMax
);
std
::
string
(
"Random seed set to "
)
+
std
::
to_string
(
sweepSeed
));
printf
(
"%-20s = %12d : Using %s number of bytes per Transfer
\n
"
,
"SWEEP_RAND_BYTES"
,
sweepRandBytes
,
sweepRandBytes
?
"random"
:
"constant"
);
PRINT_ES
(
"SWEEP_SRC"
,
sweepSrc
.
c_str
(),
printf
(
"%-20s = %12d : Using %d CPU devices
\n
"
,
"NUM_CPU_DEVICES"
,
numCpuDevices
,
numCpuDevices
);
std
::
string
(
"Source Memory Types to sweep"
));
printf
(
"%-20s = %12d : Using %d GPU devices
\n
"
,
"NUM_GPU_DEVICES"
,
numGpuDevices
,
numGpuDevices
);
PRINT_EV
(
"SWEEP_TEST_LIMIT"
,
sweepTestLimit
,
printf
(
"%-20s = %12d : Each CU gets a multiple of %d bytes to copy
\n
"
,
"BLOCK_BYTES"
,
blockBytes
,
blockBytes
);
std
::
string
(
"Max number of tests to run during sweep (0 = no limit)"
));
printf
(
"%-20s = %12d : Using byte offset of %d
\n
"
,
"BYTE_OFFSET"
,
byteOffset
,
byteOffset
);
PRINT_EV
(
"SWEEP_TIME_LIMIT"
,
sweepTimeLimit
,
printf
(
"%-20s = %12s : "
,
"FILL_PATTERN"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
std
::
string
(
"Max number of seconds to run sweep for (0 = no limit)"
));
if
(
fillPattern
.
size
())
PRINT_EV
(
"SWEEP_XGMI_MAX"
,
sweepXgmiMax
,
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
std
::
string
(
"Max number of XGMI hops for Transfers (-1 = no limit)"
));
else
PRINT_EV
(
"SWEEP_XGMI_MIN"
,
sweepXgmiMin
,
printf
(
"Pseudo-random: %s"
,
PrepSrcValueString
().
c_str
());
std
::
string
(
"Min number of XGMI hops for Transfers"
));
printf
(
"
\n
"
);
printf
(
"
\n
"
);
printf
(
"%-20s = %12d : Running %d %s per Test
\n
"
,
"NUM_ITERATIONS"
,
numIterations
,
}
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"%-20s = %12d : Running %d warmup iteration(s) per Test
\n
"
,
"NUM_WARMUPS"
,
numWarmups
,
numWarmups
);
printf
(
"%-20s = %12d : Output to %s
\n
"
,
"OUTPUT_TO_CSV"
,
outputToCsv
,
outputToCsv
?
"CSV"
:
"console"
);
printf
(
"%-20s = %12s : Using %d shared mem per threadblock
\n
"
,
"SHARED_MEM_BYTES"
,
getenv
(
"SHARED_MEM_BYTES"
)
?
"(specified)"
:
"(unset)"
,
sharedMemBytes
);
printf
(
"%-20s = %12d : Using %s-based GPU indexing
\n
"
,
"USE_PCIE_INDEX"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
printf
(
"USE_PREP_KERNEL,%d,Using %s to initialize source data
\n
"
,
usePrepSrcKernel
,
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
));
printf
(
"%-20s = %12d : Using single stream per %s
\n
"
,
"USE_SINGLE_STREAM"
,
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
printf
(
"%-20s = %12d : Continue on error
\n
"
,
"CONTINUE_ON_ERROR"
,
continueOnError
);
printf
(
"
\n
"
);
}
else
{
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"SWEEP_SRC,%s,Source Memory Types to sweep
\n
"
,
sweepSrc
.
c_str
());
printf
(
"SWEEP_EXE,%s,Executor Types to sweep
\n
"
,
sweepExe
.
c_str
());
printf
(
"SWEEP_DST,%s,Destination Memory Types to sweep
\n
"
,
sweepDst
.
c_str
());
printf
(
"SWEEP_SEED,%d,Random seed
\n
"
,
sweepSeed
);
printf
(
"SWEEP_MIN,%d,Min simultaneous Transfers
\n
"
,
sweepMin
);
printf
(
"SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)
\n
"
,
sweepMax
);
printf
(
"SWEEP_TEST_LIMIT,%d,Max number of tests to run during sweep (0 = no limit)
\n
"
,
sweepTestLimit
);
printf
(
"SWEEP_TIME_LIMIT,%d,Max number of seconds to run sweep for (0 = no limit)
\n
"
,
sweepTimeLimit
);
printf
(
"SWEEP_XGMI_MIN,%d,Min number of XGMI hops for Transfers
\n
"
,
sweepXgmiMin
);
printf
(
"SWEEP_XGMI_MAX,%d,Max number of XGMI hops for Transfers (-1 = no limit)
\n
"
,
sweepXgmiMax
);
printf
(
"SWEEP_RAND_BYTES,%d,Using %s number of bytes per Transfer
\n
"
,
sweepRandBytes
,
sweepRandBytes
?
"random"
:
"constant"
);
printf
(
"NUM_CPU_DEVICES,%d,Using %d CPU devices
\n
"
,
numCpuDevices
,
numCpuDevices
);
printf
(
"NUM_GPU_DEVICES,%d,Using %d GPU devices
\n
"
,
numGpuDevices
,
numGpuDevices
);
printf
(
"BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy
\n
"
,
blockBytes
,
blockBytes
);
printf
(
"BYTE_OFFSET,%d,Using byte offset of %d
\n
"
,
byteOffset
,
byteOffset
);
printf
(
"FILL_PATTERN,%s,"
,
getenv
(
"FILL_PATTERN"
)
?
"(specified)"
:
"(unset)"
);
if
(
fillPattern
.
size
())
printf
(
"Pattern: %s"
,
getenv
(
"FILL_PATTERN"
));
else
printf
(
"Pseudo-random: %s"
,
PrepSrcValueString
().
c_str
());
printf
(
"
\n
"
);
printf
(
"NUM_ITERATIONS,%d,Running %d %s per Test
\n
"
,
numIterations
,
numIterations
>
0
?
numIterations
:
-
numIterations
,
numIterations
>
0
?
"timed iteration(s)"
:
"second(s)"
);
printf
(
"NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test
\n
"
,
numWarmups
,
numWarmups
);
printf
(
"SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock
\n
"
,
sharedMemBytes
,
sharedMemBytes
);
printf
(
"USE_PCIE_INDEX,%d,Using %s-based GPU indexing
\n
"
,
usePcieIndexing
,
(
usePcieIndexing
?
"PCIe"
:
"HIP"
));
printf
(
"USE_PREP_KERNEL,%d,Using %s to initialize source data
\n
"
,
usePrepSrcKernel
,
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
));
printf
(
"USE_SINGLE_STREAM,%d,Using single stream per %s
\n
"
,
useSingleStream
,
(
useSingleStream
?
"device"
:
"Transfer"
));
}
};
// Helper function that parses environment variable or sets to default value
// Helper function that parses environment variable or sets to default value
static
int
GetEnvVar
(
std
::
string
const
&
varname
,
int
defaultValue
)
static
int
GetEnvVar
(
std
::
string
const
&
varname
,
int
defaultValue
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment