Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
ff2a96c9
Unverified
Commit
ff2a96c9
authored
Oct 07, 2022
by
gilbertlee-amd
Committed by
GitHub
Oct 07, 2022
Browse files
Configured NUMA node fixes (#6)
parent
6771015c
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
56 additions
and
34 deletions
+56
-34
CHANGELOG.md
CHANGELOG.md
+6
-0
EnvVars.hpp
EnvVars.hpp
+2
-2
TransferBench.cpp
TransferBench.cpp
+48
-32
No files found.
CHANGELOG.md
View file @
ff2a96c9
# Changelog for TransferBench
# Changelog for TransferBench
## v1.08
### Changed
-
Fixing handling of non-configured NUMA nodes
-
Topology detection now shows actual NUMA node indices
-
Fix for issue with NUM_GPU_DEVICES
## v1.07
## v1.07
### Changed
### Changed
-
Fix bug with allocations involving non-default CPU memory types
-
Fix bug with allocations involving non-default CPU memory types
...
...
EnvVars.hpp
View file @
ff2a96c9
...
@@ -26,7 +26,7 @@ THE SOFTWARE.
...
@@ -26,7 +26,7 @@ THE SOFTWARE.
#include <algorithm>
#include <algorithm>
#include <random>
#include <random>
#include <time.h>
#include <time.h>
#define TB_VERSION "1.0
7
"
#define TB_VERSION "1.0
8
"
extern
char
const
MemTypeStr
[];
extern
char
const
MemTypeStr
[];
...
@@ -105,7 +105,7 @@ public:
...
@@ -105,7 +105,7 @@ public:
int
numDetectedCpus
=
numa_num_configured_nodes
();
int
numDetectedCpus
=
numa_num_configured_nodes
();
int
numDetectedGpus
;
int
numDetectedGpus
;
hipGetDeviceCount
(
&
num
GpuDevice
s
);
hipGetDeviceCount
(
&
num
DetectedGpu
s
);
blockBytes
=
GetEnvVar
(
"BLOCK_BYTES"
,
256
);
blockBytes
=
GetEnvVar
(
"BLOCK_BYTES"
,
256
);
byteOffset
=
GetEnvVar
(
"BYTE_OFFSET"
,
0
);
byteOffset
=
GetEnvVar
(
"BYTE_OFFSET"
,
0
);
...
...
TransferBench.cpp
View file @
ff2a96c9
...
@@ -543,24 +543,31 @@ void DisplayUsage(char const* cmdName)
...
@@ -543,24 +543,31 @@ void DisplayUsage(char const* cmdName)
int
RemappedIndex
(
int
const
origIdx
,
MemType
const
memType
)
int
RemappedIndex
(
int
const
origIdx
,
MemType
const
memType
)
{
{
static
std
::
vector
<
int
>
remapping
;
static
std
::
vector
<
int
>
remappingCpu
;
static
std
::
vector
<
int
>
remappingGpu
;
// No need to re-map CPU devices
// Build CPU remapping on first use
if
(
IsCpuType
(
memType
))
return
origIdx
;
// Skip numa nodes that are not configured
if
(
remappingCpu
.
empty
())
{
for
(
int
node
=
0
;
node
<=
numa_max_node
();
node
++
)
if
(
numa_bitmask_isbitset
(
numa_get_mems_allowed
(),
node
))
remappingCpu
.
push_back
(
node
);
}
// Build remapping on first use
// Build remapping
Gpu
on first use
if
(
remapping
.
empty
())
if
(
remapping
Gpu
.
empty
())
{
{
int
numGpuDevices
;
int
numGpuDevices
;
HIP_CALL
(
hipGetDeviceCount
(
&
numGpuDevices
));
HIP_CALL
(
hipGetDeviceCount
(
&
numGpuDevices
));
remapping
.
resize
(
numGpuDevices
);
remapping
Gpu
.
resize
(
numGpuDevices
);
int
const
usePcieIndexing
=
getenv
(
"USE_PCIE_INDEX"
)
?
atoi
(
getenv
(
"USE_PCIE_INDEX"
))
:
0
;
int
const
usePcieIndexing
=
getenv
(
"USE_PCIE_INDEX"
)
?
atoi
(
getenv
(
"USE_PCIE_INDEX"
))
:
0
;
if
(
!
usePcieIndexing
)
if
(
!
usePcieIndexing
)
{
{
// For HIP-based indexing no remapping is necessary
// For HIP-based indexing no remapping
Gpu
is necessary
for
(
int
i
=
0
;
i
<
numGpuDevices
;
++
i
)
for
(
int
i
=
0
;
i
<
numGpuDevices
;
++
i
)
remapping
[
i
]
=
i
;
remapping
Gpu
[
i
]
=
i
;
}
}
else
else
{
{
...
@@ -575,10 +582,10 @@ int RemappedIndex(int const origIdx, MemType const memType)
...
@@ -575,10 +582,10 @@ int RemappedIndex(int const origIdx, MemType const memType)
// Sort GPUs by PCIe address then use that as mapping
// Sort GPUs by PCIe address then use that as mapping
std
::
sort
(
mapping
.
begin
(),
mapping
.
end
());
std
::
sort
(
mapping
.
begin
(),
mapping
.
end
());
for
(
int
i
=
0
;
i
<
numGpuDevices
;
++
i
)
for
(
int
i
=
0
;
i
<
numGpuDevices
;
++
i
)
remapping
[
i
]
=
mapping
[
i
].
second
;
remapping
Gpu
[
i
]
=
mapping
[
i
].
second
;
}
}
}
}
return
remapping
[
origIdx
];
return
IsCpuType
(
memType
)
?
remappingCpu
[
origIdx
]
:
remappingGpu
[
origIdx
];
}
}
void
DisplayTopology
(
bool
const
outputToCsv
)
void
DisplayTopology
(
bool
const
outputToCsv
)
...
@@ -594,7 +601,8 @@ void DisplayTopology(bool const outputToCsv)
...
@@ -594,7 +601,8 @@ void DisplayTopology(bool const outputToCsv)
}
}
else
else
{
{
printf
(
"
\n
Detected topology: %d CPU NUMA node(s) %d GPU device(s)
\n
"
,
numa_num_configured_nodes
(),
numGpuDevices
);
printf
(
"
\n
Detected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)
\n
"
,
numa_num_configured_nodes
(),
numa_max_node
()
+
1
,
numGpuDevices
);
}
}
// Print out detected CPU topology
// Print out detected CPU topology
...
@@ -603,38 +611,42 @@ void DisplayTopology(bool const outputToCsv)
...
@@ -603,38 +611,42 @@ void DisplayTopology(bool const outputToCsv)
printf
(
"NUMA"
);
printf
(
"NUMA"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
printf
(
",NUMA%02d"
,
j
);
printf
(
",NUMA%02d"
,
j
);
printf
(
",# CPUs,ClosestGPUs
\n
"
);
printf
(
",# CPUs,ClosestGPUs
,ActualNode
\n
"
);
}
}
else
else
{
{
printf
(
" |"
);
printf
(
" |"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
printf
(
"NUMA %02d |"
,
j
);
printf
(
"NUMA %02d|"
,
j
);
printf
(
" # Cpus | Closest GPU(s)
\n
"
);
printf
(
" #Cpus | Closest GPU(s)
\n
"
);
printf
(
"------------+"
);
for
(
int
j
=
0
;
j
<=
numCpuDevices
;
j
++
)
for
(
int
j
=
0
;
j
<=
numCpuDevices
;
j
++
)
printf
(
"-------
-
+"
);
printf
(
"-------+"
);
printf
(
"--------
+------
-------
\n
"
);
printf
(
"---------------
\n
"
);
}
}
for
(
int
i
=
0
;
i
<
numCpuDevices
;
i
++
)
for
(
int
i
=
0
;
i
<
numCpuDevices
;
i
++
)
{
{
printf
(
"NUMA %02d%s"
,
i
,
outputToCsv
?
","
:
" |"
);
int
nodeI
=
RemappedIndex
(
i
,
MEM_CPU
);
printf
(
"NUMA %02d (%02d)%s"
,
i
,
nodeI
,
outputToCsv
?
","
:
"|"
);
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
for
(
int
j
=
0
;
j
<
numCpuDevices
;
j
++
)
{
{
int
numaDist
=
numa_distance
(
i
,
j
);
int
nodeJ
=
RemappedIndex
(
j
,
MEM_CPU
);
int
numaDist
=
numa_distance
(
nodeI
,
nodeJ
);
if
(
outputToCsv
)
if
(
outputToCsv
)
printf
(
"%d,"
,
numaDist
);
printf
(
"%d,"
,
numaDist
);
else
else
printf
(
" %
6
d |"
,
numaDist
);
printf
(
" %
5
d |"
,
numaDist
);
}
}
int
numCpus
=
0
;
int
numCpus
=
0
;
for
(
int
j
=
0
;
j
<
numa_num_configured_cpus
();
j
++
)
for
(
int
j
=
0
;
j
<
numa_num_configured_cpus
();
j
++
)
if
(
numa_node_of_cpu
(
j
)
==
i
)
numCpus
++
;
if
(
numa_node_of_cpu
(
j
)
==
nodeI
)
numCpus
++
;
if
(
outputToCsv
)
if
(
outputToCsv
)
printf
(
"%d,"
,
numCpus
);
printf
(
"%d,"
,
numCpus
);
else
else
printf
(
" %
6
d | "
,
numCpus
);
printf
(
" %
5
d | "
,
numCpus
);
bool
isFirst
=
true
;
bool
isFirst
=
true
;
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
...
@@ -869,7 +881,11 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
...
@@ -869,7 +881,11 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
}
}
else
if
(
memType
==
MEM_CPU
)
else
if
(
memType
==
MEM_CPU
)
{
{
HIP_CALL
(
hipHostMalloc
((
void
**
)
memPtr
,
numBytes
,
hipHostMallocNumaUser
|
hipHostMallocNonCoherent
));
if
(
hipHostMalloc
((
void
**
)
memPtr
,
numBytes
,
hipHostMallocNumaUser
|
hipHostMallocNonCoherent
)
!=
hipSuccess
)
{
printf
(
"[ERROR] Unable to allocate non-coherent host memory on NUMA node %d
\n
"
,
devIndex
);
exit
(
1
);
}
}
}
else
if
(
memType
==
MEM_CPU_UNPINNED
)
else
if
(
memType
==
MEM_CPU_UNPINNED
)
{
{
...
@@ -1150,9 +1166,10 @@ void RunTransfer(EnvVars const& ev, int const iteration,
...
@@ -1150,9 +1166,10 @@ void RunTransfer(EnvVars const& ev, int const iteration,
else
if
(
transfer
->
exeMemType
==
MEM_CPU
)
// CPU execution agent
else
if
(
transfer
->
exeMemType
==
MEM_CPU
)
// CPU execution agent
{
{
// Force this thread and all child threads onto correct NUMA node
// Force this thread and all child threads onto correct NUMA node
if
(
numa_run_on_node
(
transfer
->
exeIndex
))
int
const
exeIndex
=
RemappedIndex
(
transfer
->
exeIndex
,
MEM_CPU
);
if
(
numa_run_on_node
(
exeIndex
))
{
{
printf
(
"[ERROR] Unable to set CPU to NUMA node %d
\n
"
,
transfer
->
exeIndex
);
printf
(
"[ERROR] Unable to set CPU to NUMA node %d
\n
"
,
exeIndex
);
exit
(
1
);
exit
(
1
);
}
}
...
@@ -1179,9 +1196,8 @@ void RunTransfer(EnvVars const& ev, int const iteration,
...
@@ -1179,9 +1196,8 @@ void RunTransfer(EnvVars const& ev, int const iteration,
void
RunPeerToPeerBenchmarks
(
EnvVars
const
&
ev
,
size_t
N
,
int
numBlocksToUse
,
int
readMode
,
int
skipCpu
)
void
RunPeerToPeerBenchmarks
(
EnvVars
const
&
ev
,
size_t
N
,
int
numBlocksToUse
,
int
readMode
,
int
skipCpu
)
{
{
// Collect the number of available CPUs/GPUs on this machine
// Collect the number of available CPUs/GPUs on this machine
int
numGpus
;
int
const
numGpus
=
ev
.
numGpuDevices
;
HIP_CALL
(
hipGetDeviceCount
(
&
numGpus
));
int
const
numCpus
=
ev
.
numCpuDevices
;
int
const
numCpus
=
numa_num_configured_nodes
();
int
const
numDevices
=
numCpus
+
numGpus
;
int
const
numDevices
=
numCpus
+
numGpus
;
// Enable peer to peer for each GPU
// Enable peer to peer for each GPU
...
@@ -1281,16 +1297,16 @@ double GetPeakBandwidth(EnvVars const& ev,
...
@@ -1281,16 +1297,16 @@ double GetPeakBandwidth(EnvVars const& ev,
std
::
vector
<
Transfer
>
transfers
(
2
);
std
::
vector
<
Transfer
>
transfers
(
2
);
transfers
[
0
].
srcMemType
=
transfers
[
1
].
dstMemType
=
srcMemType
;
transfers
[
0
].
srcMemType
=
transfers
[
1
].
dstMemType
=
srcMemType
;
transfers
[
0
].
dstMemType
=
transfers
[
1
].
srcMemType
=
dstMemType
;
transfers
[
0
].
dstMemType
=
transfers
[
1
].
srcMemType
=
dstMemType
;
transfers
[
0
].
srcIndex
=
transfers
[
1
].
dstIndex
=
RemappedIndex
(
srcIndex
,
srcMemType
)
;
transfers
[
0
].
srcIndex
=
transfers
[
1
].
dstIndex
=
srcIndex
;
transfers
[
0
].
dstIndex
=
transfers
[
1
].
srcIndex
=
RemappedIndex
(
dstIndex
,
dstMemType
)
;
transfers
[
0
].
dstIndex
=
transfers
[
1
].
srcIndex
=
dstIndex
;
transfers
[
0
].
numBytes
=
transfers
[
1
].
numBytes
=
N
*
sizeof
(
float
);
transfers
[
0
].
numBytes
=
transfers
[
1
].
numBytes
=
N
*
sizeof
(
float
);
transfers
[
0
].
numBlocksToUse
=
transfers
[
1
].
numBlocksToUse
=
numBlocksToUse
;
transfers
[
0
].
numBlocksToUse
=
transfers
[
1
].
numBlocksToUse
=
numBlocksToUse
;
// Either perform (local read + remote write), or (remote read + local write)
// Either perform (local read + remote write), or (remote read + local write)
transfers
[
0
].
exeMemType
=
(
readMode
==
0
?
srcMemType
:
dstMemType
);
transfers
[
0
].
exeMemType
=
(
readMode
==
0
?
srcMemType
:
dstMemType
);
transfers
[
1
].
exeMemType
=
(
readMode
==
0
?
dstMemType
:
srcMemType
);
transfers
[
1
].
exeMemType
=
(
readMode
==
0
?
dstMemType
:
srcMemType
);
transfers
[
0
].
exeIndex
=
RemappedIndex
(
(
readMode
==
0
?
srcIndex
:
dstIndex
)
,
transfers
[
0
].
exeMemType
)
;
transfers
[
0
].
exeIndex
=
(
readMode
==
0
?
srcIndex
:
dstIndex
);
transfers
[
1
].
exeIndex
=
RemappedIndex
(
(
readMode
==
0
?
dstIndex
:
srcIndex
)
,
transfers
[
1
].
exeMemType
)
;
transfers
[
1
].
exeIndex
=
(
readMode
==
0
?
dstIndex
:
srcIndex
);
transfers
.
resize
(
isBidirectional
+
1
);
transfers
.
resize
(
isBidirectional
+
1
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment