Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
5331f980
Unverified
Commit
5331f980
authored
Aug 25, 2022
by
gilbertlee-amd
Committed by
GitHub
Aug 25, 2022
Browse files
Merge pull request #3 from gilbertlee-amd/CXLFix
Adding support for NUMA nodes without CPUs
parents
8f88ce3f
2f047a8e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
98 additions
and
21 deletions
+98
-21
CHANGELOG.md
CHANGELOG.md
+7
-0
EnvVars.hpp
EnvVars.hpp
+10
-6
TransferBench.cpp
TransferBench.cpp
+81
-15
No files found.
CHANGELOG.md
View file @
5331f980
# Changelog for TransferBench
## v1.05
### Added
-
Topology output now includes NUMA node information
-
Support for NUMA nodes with no CPU cores (e.g. CXL memory)
### Removed
-
SWEEP_SRC_IS_EXE environment variable
## v1.04
### Added
-
New environment variables for sweep based presets
...
...
EnvVars.hpp
View file @
5331f980
...
...
@@ -26,7 +26,7 @@ THE SOFTWARE.
#include <algorithm>
#include <random>
#include <time.h>
#define TB_VERSION "1.0
4
"
#define TB_VERSION "1.0
5
"
extern
char
const
MemTypeStr
[];
...
...
@@ -47,7 +47,6 @@ public:
int
const
DEFAULT_SAMPLING_FACTOR
=
1
;
int
const
DEFAULT_NUM_CPU_PER_TRANSFER
=
4
;
int
const
DEFAULT_SWEEP_SRC_IS_EXE
=
0
;
std
::
string
const
DEFAULT_SWEEP_SRC
=
"CG"
;
std
::
string
const
DEFAULT_SWEEP_EXE
=
"CG"
;
std
::
string
const
DEFAULT_SWEEP_DST
=
"CG"
;
...
...
@@ -76,7 +75,6 @@ public:
std
::
vector
<
float
>
fillPattern
;
// Pattern of floats used to fill source data
// Environment variables only for Sweep-preset
int
sweepSrcIsExe
;
// Non-zero if executor should always be the same as source
int
sweepMin
;
// Min number of simultaneous Transfers to be executed per test
int
sweepMax
;
// Max number of simultaneous Transfers to be executed per test
int
sweepTestLimit
;
// Max number of tests to run during sweep (0 = no limit)
...
...
@@ -95,6 +93,9 @@ public:
// Random generator
std
::
default_random_engine
*
generator
;
// Track how many CPUs are available per NUMA node
std
::
vector
<
int
>
numCpusPerNuma
;
// Constructor that collects values
EnvVars
()
{
...
...
@@ -122,7 +123,6 @@ public:
usePcieIndexing
=
GetEnvVar
(
"USE_PCIE_INDEX"
,
0
);
useSingleStream
=
GetEnvVar
(
"USE_SINGLE_STREAM"
,
0
);
sweepSrcIsExe
=
GetEnvVar
(
"SWEEP_SRC_IS_EXE"
,
DEFAULT_SWEEP_SRC_IS_EXE
);
sweepMin
=
GetEnvVar
(
"SWEEP_MIN"
,
DEFAULT_SWEEP_MIN
);
sweepMax
=
GetEnvVar
(
"SWEEP_MAX"
,
DEFAULT_SWEEP_MAX
);
sweepSrc
=
GetEnvVar
(
"SWEEP_SRC"
,
DEFAULT_SWEEP_SRC
);
...
...
@@ -287,6 +287,12 @@ public:
exit
(
1
);
}
}
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA nodes without CPUs)
numCpusPerNuma
.
resize
(
numDetectedCpus
);
int
const
totalCpus
=
numa_num_configured_cpus
();
for
(
int
i
=
0
;
i
<
totalCpus
;
i
++
)
numCpusPerNuma
[
numa_node_of_cpu
(
i
)]
++
;
}
// Display info on the env vars that can be used
...
...
@@ -393,7 +399,6 @@ public:
printf
(
"%-20s = %12s : Source Memory Types to sweep
\n
"
,
"SWEEP_SRC"
,
sweepSrc
.
c_str
());
printf
(
"%-20s = %12s : Executor Types to sweep
\n
"
,
"SWEEP_EXE"
,
sweepExe
.
c_str
());
printf
(
"%-20s = %12s : Destination Memory Types to sweep
\n
"
,
"SWEEP_DST"
,
sweepDst
.
c_str
());
printf
(
"%-20s = %12d : Transfer executor %s Transfer source
\n
"
,
"SWEEP_SRC_IS_EXE"
,
sweepSrcIsExe
,
sweepSrcIsExe
?
"must match"
:
"may have any"
);
printf
(
"%-20s = %12d : Min simultaneous Transfers
\n
"
,
"SWEEP_MIN"
,
sweepMin
);
printf
(
"%-20s = %12d : Max simultaneous Transfers (0 = no limit)
\n
"
,
"SWEEP_MAX"
,
sweepMax
);
printf
(
"%-20s = %12d : Max number of tests to run during sweep (0 = no limit)
\n
"
,
"SWEEP_TEST_LIMIT"
,
sweepTestLimit
);
...
...
@@ -440,7 +445,6 @@ public:
printf
(
"SWEEP_SRC,%s,Source Memory Types to sweep
\n
"
,
sweepSrc
.
c_str
());
printf
(
"SWEEP_EXE,%s,Executor Types to sweep
\n
"
,
sweepExe
.
c_str
());
printf
(
"SWEEP_DST,%s,Destination Memory Types to sweep
\n
"
,
sweepDst
.
c_str
());
printf
(
"SWEEP_SRC_IS_EXE,%d, Transfer executor %s Transfer source
\n
"
,
sweepSrcIsExe
,
sweepSrcIsExe
?
"must match"
:
"may have any"
);
printf
(
"SWEEP_SEED,%d,Random seed
\n
"
,
sweepSeed
);
printf
(
"SWEEP_MIN,%d,Min simultaneous Transfers
\n
"
,
sweepMin
);
printf
(
"SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)
\n
"
,
sweepMax
);
...
...
TransferBench.cpp
View file @
5331f980
...
...
@@ -563,13 +563,76 @@ int RemappedIndex(int const origIdx, MemType const memType)
void
DisplayTopology
(
bool
const
outputToCsv
)
{
int
numCpuDevices
=
numa_num_configured_nodes
();
int
numGpuDevices
;
HIP_CALL
(
hipGetDeviceCount
(
&
numGpuDevices
));
if
(
outputToCsv
)
{
printf
(
"NumCpus,%d
\n
"
,
num
a_num_configured_nodes
()
);
printf
(
"NumCpus,%d
\n
"
,
num
CpuDevices
);
printf
(
"NumGpus,%d
\n
"
,
numGpuDevices
);
}
else
{
printf
(
"
\n
Detected topology: %d CPU NUMA node(s) %d GPU device(s)
\n
"
,
numa_num_configured_nodes
(),
numGpuDevices
);
}
// Print out detected CPU topology
if
(
outputToCsv
)
{
printf
(
"NUMA"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
printf
(
",NUMA%02d"
,
j
);
printf
(
",# CPUs,ClosestGPUs
\n
"
);
}
else
{
printf
(
" |"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
printf
(
"NUMA %02d |"
,
j
);
printf
(
" # Cpus | Closest GPU(s)
\n
"
);
for
(
int
j
=
0
;
j
<=
numCpuDevices
;
j
++
)
printf
(
"--------+"
);
printf
(
"--------+-------------
\n
"
);
}
for
(
int
i
=
0
;
i
<
numCpuDevices
;
i
++
)
{
printf
(
"NUMA %02d%s"
,
i
,
outputToCsv
?
","
:
" |"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
{
int
numaDist
=
numa_distance
(
i
,
j
);
if
(
outputToCsv
)
printf
(
"%d,"
,
numaDist
);
else
printf
(
" %6d |"
,
numaDist
);
}
int
numCpus
=
0
;
for
(
int
j
=
0
;
j
<
numa_num_configured_cpus
();
j
++
)
if
(
numa_node_of_cpu
(
j
)
==
i
)
numCpus
++
;
if
(
outputToCsv
)
printf
(
"%d,"
,
numCpus
);
else
printf
(
" %6d | "
,
numCpus
);
bool
isFirst
=
true
;
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
{
if
(
GetClosestNumaNode
(
RemappedIndex
(
j
,
MEM_GPU
))
==
i
)
{
if
(
isFirst
)
isFirst
=
false
;
else
printf
(
","
);
printf
(
"%d"
,
j
);
}
}
printf
(
"
\n
"
);
}
printf
(
"
\n
"
);
// Print out detected GPU topology
if
(
outputToCsv
)
{
printf
(
"GPU"
);
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
printf
(
",GPU %02d"
,
j
);
...
...
@@ -577,7 +640,6 @@ void DisplayTopology(bool const outputToCsv)
}
else
{
printf
(
"
\n
Detected topology: %d CPU NUMA node(s) %d GPU device(s)
\n
"
,
numa_num_configured_nodes
(),
numGpuDevices
);
printf
(
" |"
);
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
printf
(
" GPU %02d |"
,
j
);
...
...
@@ -1232,6 +1294,13 @@ double GetPeakBandwidth(EnvVars const& ev,
transfers
[
0
]
->
exeIndex
=
RemappedIndex
((
readMode
==
0
?
srcIndex
:
dstIndex
),
transfers
[
0
]
->
exeMemType
);
transfers
[
1
]
->
exeIndex
=
RemappedIndex
((
readMode
==
0
?
dstIndex
:
srcIndex
),
transfers
[
1
]
->
exeMemType
);
// Abort if executing on NUMA node with no CPUs
for
(
int
i
=
0
;
i
<=
isBidirectional
;
i
++
)
{
if
(
transfers
[
i
]
->
exeMemType
==
MEM_CPU
&&
ev
.
numCpusPerNuma
[
transfers
[
i
]
->
exeIndex
]
==
0
)
return
0
;
}
for
(
int
i
=
0
;
i
<=
isBidirectional
;
i
++
)
{
AllocateMemory
(
transfers
[
i
]
->
srcMemType
,
transfers
[
i
]
->
srcIndex
,
...
...
@@ -1375,36 +1444,33 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
std
::
vector
<
size_t
>
valuesOfN
(
1
,
numBytesPerTransfer
/
sizeof
(
float
));
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
bool
hasCpuExecutor
=
false
;
bool
hasGpuExecutor
=
false
;
std
::
vector
<
std
::
pair
<
MemType
,
int
>>
exeList
;
for
(
auto
exe
:
ev
.
sweepExe
)
{
MemType
const
exeMemType
=
CharToMemType
(
exe
);
int
numDevices
;
if
(
IsGpuType
(
exeMemType
))
{
numDevices
=
ev
.
numGpuDevices
;
hasGpuExecutor
=
true
;
for
(
int
exeIndex
=
0
;
exeIndex
<
ev
.
numGpuDevices
;
++
exeIndex
)
exeList
.
push_back
(
std
::
make_pair
(
exeMemType
,
exeIndex
))
;
}
else
{
numDevices
=
ev
.
numCpuDevices
;
hasCpuExecutor
=
true
;
for
(
int
exeIndex
=
0
;
exeIndex
<
ev
.
numCpuDevices
;
++
exeIndex
)
{
// Skip NUMA nodes that have no CPUs (e.g. CXL)
if
(
ev
.
numCpusPerNuma
[
exeIndex
]
==
0
)
continue
;
exeList
.
push_back
(
std
::
make_pair
(
exeMemType
,
exeIndex
));
}
}
for
(
int
exeIndex
=
0
;
exeIndex
<
numDevices
;
++
exeIndex
)
exeList
.
push_back
(
std
::
make_pair
(
exeMemType
,
exeIndex
));
}
int
numExes
=
ev
.
sweepSrcIsExe
?
1
:
exeList
.
size
();
int
numExes
=
exeList
.
size
();
std
::
vector
<
std
::
pair
<
MemType
,
int
>>
srcList
;
for
(
auto
src
:
ev
.
sweepSrc
)
{
MemType
const
srcMemType
=
CharToMemType
(
src
);
int
const
numDevices
=
IsGpuType
(
srcMemType
)
?
ev
.
numGpuDevices
:
ev
.
numCpuDevices
;
// Skip source memory type if executor is supposed to be source but not specified
if
((
IsGpuType
(
srcMemType
)
&&
!
hasGpuExecutor
)
||
(
!
IsGpuType
(
srcMemType
)
&&
!
hasCpuExecutor
))
continue
;
for
(
int
srcIndex
=
0
;
srcIndex
<
numDevices
;
++
srcIndex
)
srcList
.
push_back
(
std
::
make_pair
(
srcMemType
,
srcIndex
));
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment