Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
2f047a8e
Commit
2f047a8e
authored
Aug 25, 2022
by
Gilbert Lee
Browse files
Adding support for NUMA nodes without CPUs
parent
8f88ce3f
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
98 additions
and
21 deletions
+98
-21
CHANGELOG.md
CHANGELOG.md
+7
-0
EnvVars.hpp
EnvVars.hpp
+10
-6
TransferBench.cpp
TransferBench.cpp
+81
-15
No files found.
CHANGELOG.md
View file @
2f047a8e
# Changelog for TransferBench
## v1.05
### Added
-
Topology output now includes NUMA node information
-
Support for NUMA nodes with no CPU cores (e.g. CXL memory)
### Removed
-
SWEEP_SRC_IS_EXE environment variable
## v1.04
### Added
-
New environment variables for sweep based presets
...
...
EnvVars.hpp
View file @
2f047a8e
...
...
@@ -26,7 +26,7 @@ THE SOFTWARE.
#include <algorithm>
#include <random>
#include <time.h>
#define TB_VERSION "1.0
4
"
#define TB_VERSION "1.0
5
"
extern
char
const
MemTypeStr
[];
...
...
@@ -47,7 +47,6 @@ public:
int
const
DEFAULT_SAMPLING_FACTOR
=
1
;
int
const
DEFAULT_NUM_CPU_PER_TRANSFER
=
4
;
int
const
DEFAULT_SWEEP_SRC_IS_EXE
=
0
;
std
::
string
const
DEFAULT_SWEEP_SRC
=
"CG"
;
std
::
string
const
DEFAULT_SWEEP_EXE
=
"CG"
;
std
::
string
const
DEFAULT_SWEEP_DST
=
"CG"
;
...
...
@@ -76,7 +75,6 @@ public:
std
::
vector
<
float
>
fillPattern
;
// Pattern of floats used to fill source data
// Environment variables only for Sweep-preset
int
sweepSrcIsExe
;
// Non-zero if executor should always be the same as source
int
sweepMin
;
// Min number of simultaneous Transfers to be executed per test
int
sweepMax
;
// Max number of simultaneous Transfers to be executed per test
int
sweepTestLimit
;
// Max number of tests to run during sweep (0 = no limit)
...
...
@@ -95,6 +93,9 @@ public:
// Random generator
std
::
default_random_engine
*
generator
;
// Track how many CPUs are available per NUMA node
std
::
vector
<
int
>
numCpusPerNuma
;
// Constructor that collects values
EnvVars
()
{
...
...
@@ -122,7 +123,6 @@ public:
usePcieIndexing
=
GetEnvVar
(
"USE_PCIE_INDEX"
,
0
);
useSingleStream
=
GetEnvVar
(
"USE_SINGLE_STREAM"
,
0
);
sweepSrcIsExe
=
GetEnvVar
(
"SWEEP_SRC_IS_EXE"
,
DEFAULT_SWEEP_SRC_IS_EXE
);
sweepMin
=
GetEnvVar
(
"SWEEP_MIN"
,
DEFAULT_SWEEP_MIN
);
sweepMax
=
GetEnvVar
(
"SWEEP_MAX"
,
DEFAULT_SWEEP_MAX
);
sweepSrc
=
GetEnvVar
(
"SWEEP_SRC"
,
DEFAULT_SWEEP_SRC
);
...
...
@@ -287,6 +287,12 @@ public:
exit
(
1
);
}
}
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA nodes without CPUs)
numCpusPerNuma
.
resize
(
numDetectedCpus
);
int
const
totalCpus
=
numa_num_configured_cpus
();
for
(
int
i
=
0
;
i
<
totalCpus
;
i
++
)
numCpusPerNuma
[
numa_node_of_cpu
(
i
)]
++
;
}
// Display info on the env vars that can be used
...
...
@@ -393,7 +399,6 @@ public:
printf
(
"%-20s = %12s : Source Memory Types to sweep
\n
"
,
"SWEEP_SRC"
,
sweepSrc
.
c_str
());
printf
(
"%-20s = %12s : Executor Types to sweep
\n
"
,
"SWEEP_EXE"
,
sweepExe
.
c_str
());
printf
(
"%-20s = %12s : Destination Memory Types to sweep
\n
"
,
"SWEEP_DST"
,
sweepDst
.
c_str
());
printf
(
"%-20s = %12d : Transfer executor %s Transfer source
\n
"
,
"SWEEP_SRC_IS_EXE"
,
sweepSrcIsExe
,
sweepSrcIsExe
?
"must match"
:
"may have any"
);
printf
(
"%-20s = %12d : Min simultaneous Transfers
\n
"
,
"SWEEP_MIN"
,
sweepMin
);
printf
(
"%-20s = %12d : Max simultaneous Transfers (0 = no limit)
\n
"
,
"SWEEP_MAX"
,
sweepMax
);
printf
(
"%-20s = %12d : Max number of tests to run during sweep (0 = no limit)
\n
"
,
"SWEEP_TEST_LIMIT"
,
sweepTestLimit
);
...
...
@@ -440,7 +445,6 @@ public:
printf
(
"SWEEP_SRC,%s,Source Memory Types to sweep
\n
"
,
sweepSrc
.
c_str
());
printf
(
"SWEEP_EXE,%s,Executor Types to sweep
\n
"
,
sweepExe
.
c_str
());
printf
(
"SWEEP_DST,%s,Destination Memory Types to sweep
\n
"
,
sweepDst
.
c_str
());
printf
(
"SWEEP_SRC_IS_EXE,%d, Transfer executor %s Transfer source
\n
"
,
sweepSrcIsExe
,
sweepSrcIsExe
?
"must match"
:
"may have any"
);
printf
(
"SWEEP_SEED,%d,Random seed
\n
"
,
sweepSeed
);
printf
(
"SWEEP_MIN,%d,Min simultaneous Transfers
\n
"
,
sweepMin
);
printf
(
"SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)
\n
"
,
sweepMax
);
...
...
TransferBench.cpp
View file @
2f047a8e
...
...
@@ -563,13 +563,76 @@ int RemappedIndex(int const origIdx, MemType const memType)
void
DisplayTopology
(
bool
const
outputToCsv
)
{
int
numCpuDevices
=
numa_num_configured_nodes
();
int
numGpuDevices
;
HIP_CALL
(
hipGetDeviceCount
(
&
numGpuDevices
));
if
(
outputToCsv
)
{
printf
(
"NumCpus,%d
\n
"
,
num
a_num_configured_nodes
()
);
printf
(
"NumCpus,%d
\n
"
,
num
CpuDevices
);
printf
(
"NumGpus,%d
\n
"
,
numGpuDevices
);
}
else
{
printf
(
"
\n
Detected topology: %d CPU NUMA node(s) %d GPU device(s)
\n
"
,
numa_num_configured_nodes
(),
numGpuDevices
);
}
// Print out detected CPU topology
if
(
outputToCsv
)
{
printf
(
"NUMA"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
printf
(
",NUMA%02d"
,
j
);
printf
(
",# CPUs,ClosestGPUs
\n
"
);
}
else
{
printf
(
" |"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
printf
(
"NUMA %02d |"
,
j
);
printf
(
" # Cpus | Closest GPU(s)
\n
"
);
for
(
int
j
=
0
;
j
<=
numCpuDevices
;
j
++
)
printf
(
"--------+"
);
printf
(
"--------+-------------
\n
"
);
}
for
(
int
i
=
0
;
i
<
numCpuDevices
;
i
++
)
{
printf
(
"NUMA %02d%s"
,
i
,
outputToCsv
?
","
:
" |"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
{
int
numaDist
=
numa_distance
(
i
,
j
);
if
(
outputToCsv
)
printf
(
"%d,"
,
numaDist
);
else
printf
(
" %6d |"
,
numaDist
);
}
int
numCpus
=
0
;
for
(
int
j
=
0
;
j
<
numa_num_configured_cpus
();
j
++
)
if
(
numa_node_of_cpu
(
j
)
==
i
)
numCpus
++
;
if
(
outputToCsv
)
printf
(
"%d,"
,
numCpus
);
else
printf
(
" %6d | "
,
numCpus
);
bool
isFirst
=
true
;
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
{
if
(
GetClosestNumaNode
(
RemappedIndex
(
j
,
MEM_GPU
))
==
i
)
{
if
(
isFirst
)
isFirst
=
false
;
else
printf
(
","
);
printf
(
"%d"
,
j
);
}
}
printf
(
"
\n
"
);
}
printf
(
"
\n
"
);
// Print out detected GPU topology
if
(
outputToCsv
)
{
printf
(
"GPU"
);
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
printf
(
",GPU %02d"
,
j
);
...
...
@@ -577,7 +640,6 @@ void DisplayTopology(bool const outputToCsv)
}
else
{
printf
(
"
\n
Detected topology: %d CPU NUMA node(s) %d GPU device(s)
\n
"
,
numa_num_configured_nodes
(),
numGpuDevices
);
printf
(
" |"
);
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
printf
(
" GPU %02d |"
,
j
);
...
...
@@ -1232,6 +1294,13 @@ double GetPeakBandwidth(EnvVars const& ev,
transfers
[
0
]
->
exeIndex
=
RemappedIndex
((
readMode
==
0
?
srcIndex
:
dstIndex
),
transfers
[
0
]
->
exeMemType
);
transfers
[
1
]
->
exeIndex
=
RemappedIndex
((
readMode
==
0
?
dstIndex
:
srcIndex
),
transfers
[
1
]
->
exeMemType
);
// Abort if executing on NUMA node with no CPUs
for
(
int
i
=
0
;
i
<=
isBidirectional
;
i
++
)
{
if
(
transfers
[
i
]
->
exeMemType
==
MEM_CPU
&&
ev
.
numCpusPerNuma
[
transfers
[
i
]
->
exeIndex
]
==
0
)
return
0
;
}
for
(
int
i
=
0
;
i
<=
isBidirectional
;
i
++
)
{
AllocateMemory
(
transfers
[
i
]
->
srcMemType
,
transfers
[
i
]
->
srcIndex
,
...
...
@@ -1375,36 +1444,33 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
std
::
vector
<
size_t
>
valuesOfN
(
1
,
numBytesPerTransfer
/
sizeof
(
float
));
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
bool
hasCpuExecutor
=
false
;
bool
hasGpuExecutor
=
false
;
std
::
vector
<
std
::
pair
<
MemType
,
int
>>
exeList
;
for
(
auto
exe
:
ev
.
sweepExe
)
{
MemType
const
exeMemType
=
CharToMemType
(
exe
);
int
numDevices
;
if
(
IsGpuType
(
exeMemType
))
{
numDevices
=
ev
.
numGpuDevices
;
hasGpuExecutor
=
true
;
for
(
int
exeIndex
=
0
;
exeIndex
<
ev
.
numGpuDevices
;
++
exeIndex
)
exeList
.
push_back
(
std
::
make_pair
(
exeMemType
,
exeIndex
))
;
}
else
{
numDevices
=
ev
.
numCpuDevices
;
hasCpuExecutor
=
true
;
}
for
(
int
exeIndex
=
0
;
exeIndex
<
numDevices
;
++
exeIndex
)
for
(
int
exeIndex
=
0
;
exeIndex
<
ev
.
numCpuDevices
;
++
exeIndex
)
{
// Skip NUMA nodes that have no CPUs (e.g. CXL)
if
(
ev
.
numCpusPerNuma
[
exeIndex
]
=
=
0
)
continue
;
exeList
.
push_back
(
std
::
make_pair
(
exeMemType
,
exeIndex
));
}
int
numExes
=
ev
.
sweepSrcIsExe
?
1
:
exeList
.
size
();
}
}
int
numExes
=
exeList
.
size
();
std
::
vector
<
std
::
pair
<
MemType
,
int
>>
srcList
;
for
(
auto
src
:
ev
.
sweepSrc
)
{
MemType
const
srcMemType
=
CharToMemType
(
src
);
int
const
numDevices
=
IsGpuType
(
srcMemType
)
?
ev
.
numGpuDevices
:
ev
.
numCpuDevices
;
// Skip source memory type if executor is supposed to be source but not specified
if
((
IsGpuType
(
srcMemType
)
&&
!
hasGpuExecutor
)
||
(
!
IsGpuType
(
srcMemType
)
&&
!
hasCpuExecutor
))
continue
;
for
(
int
srcIndex
=
0
;
srcIndex
<
numDevices
;
++
srcIndex
)
srcList
.
push_back
(
std
::
make_pair
(
srcMemType
,
srcIndex
));
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment