Unverified Commit ff2a96c9 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Configured NUMA node fixes (#6)

parent 6771015c
# Changelog for TransferBench
## v1.08
### Changed
- Fixed handling of non-configured NUMA nodes
- Topology detection now shows actual NUMA node indices
- Fix for issue with NUM_GPU_DEVICES
## v1.07
### Changed
- Fix bug with allocations involving non-default CPU memory types
......
...@@ -26,7 +26,7 @@ THE SOFTWARE. ...@@ -26,7 +26,7 @@ THE SOFTWARE.
#include <algorithm> #include <algorithm>
#include <random> #include <random>
#include <time.h> #include <time.h>
#define TB_VERSION "1.07" #define TB_VERSION "1.08"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
...@@ -105,7 +105,7 @@ public: ...@@ -105,7 +105,7 @@ public:
int numDetectedCpus = numa_num_configured_nodes(); int numDetectedCpus = numa_num_configured_nodes();
int numDetectedGpus; int numDetectedGpus;
hipGetDeviceCount(&numGpuDevices); hipGetDeviceCount(&numDetectedGpus);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
......
...@@ -543,24 +543,31 @@ void DisplayUsage(char const* cmdName) ...@@ -543,24 +543,31 @@ void DisplayUsage(char const* cmdName)
int RemappedIndex(int const origIdx, MemType const memType) int RemappedIndex(int const origIdx, MemType const memType)
{ {
static std::vector<int> remapping; static std::vector<int> remappingCpu;
static std::vector<int> remappingGpu;
// No need to re-map CPU devices // Build CPU remapping on first use
if (IsCpuType(memType)) return origIdx; // Skip numa nodes that are not configured
if (remappingCpu.empty())
{
for (int node = 0; node <= numa_max_node(); node++)
if (numa_bitmask_isbitset(numa_get_mems_allowed(), node))
remappingCpu.push_back(node);
}
// Build remapping on first use // Build remappingGpu on first use
if (remapping.empty()) if (remappingGpu.empty())
{ {
int numGpuDevices; int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices)); HIP_CALL(hipGetDeviceCount(&numGpuDevices));
remapping.resize(numGpuDevices); remappingGpu.resize(numGpuDevices);
int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0; int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
if (!usePcieIndexing) if (!usePcieIndexing)
{ {
// For HIP-based indexing no remapping is necessary // For HIP-based indexing no remappingGpu is necessary
for (int i = 0; i < numGpuDevices; ++i) for (int i = 0; i < numGpuDevices; ++i)
remapping[i] = i; remappingGpu[i] = i;
} }
else else
{ {
...@@ -575,10 +582,10 @@ int RemappedIndex(int const origIdx, MemType const memType) ...@@ -575,10 +582,10 @@ int RemappedIndex(int const origIdx, MemType const memType)
// Sort GPUs by PCIe address then use that as mapping // Sort GPUs by PCIe address then use that as mapping
std::sort(mapping.begin(), mapping.end()); std::sort(mapping.begin(), mapping.end());
for (int i = 0; i < numGpuDevices; ++i) for (int i = 0; i < numGpuDevices; ++i)
remapping[i] = mapping[i].second; remappingGpu[i] = mapping[i].second;
} }
} }
return remapping[origIdx]; return IsCpuType(memType) ? remappingCpu[origIdx] : remappingGpu[origIdx];
} }
void DisplayTopology(bool const outputToCsv) void DisplayTopology(bool const outputToCsv)
...@@ -594,7 +601,8 @@ void DisplayTopology(bool const outputToCsv) ...@@ -594,7 +601,8 @@ void DisplayTopology(bool const outputToCsv)
} }
else else
{ {
printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices); printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n",
numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices);
} }
// Print out detected CPU topology // Print out detected CPU topology
...@@ -603,38 +611,42 @@ void DisplayTopology(bool const outputToCsv) ...@@ -603,38 +611,42 @@ void DisplayTopology(bool const outputToCsv)
printf("NUMA"); printf("NUMA");
for (int j = 0; j < numCpuDevices; j++) for (int j = 0; j < numCpuDevices; j++)
printf(",NUMA%02d", j); printf(",NUMA%02d", j);
printf(",# CPUs,ClosestGPUs\n"); printf(",# CPUs,ClosestGPUs,ActualNode\n");
} }
else else
{ {
printf(" |"); printf(" |");
for (int j = 0; j < numCpuDevices; j++) for (int j = 0; j < numCpuDevices; j++)
printf("NUMA %02d |", j); printf("NUMA %02d|", j);
printf(" # Cpus | Closest GPU(s)\n"); printf(" #Cpus | Closest GPU(s)\n");
printf("------------+");
for (int j = 0; j <= numCpuDevices; j++) for (int j = 0; j <= numCpuDevices; j++)
printf("--------+"); printf("-------+");
printf("--------+-------------\n"); printf("---------------\n");
} }
for (int i = 0; i < numCpuDevices; i++) for (int i = 0; i < numCpuDevices; i++)
{ {
printf("NUMA %02d%s", i, outputToCsv ? "," : " |"); int nodeI = RemappedIndex(i, MEM_CPU);
printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? "," : "|");
for (int j = 0; j < numCpuDevices; j++) for (int j = 0; j < numCpuDevices; j++)
{ {
int numaDist = numa_distance(i,j); int nodeJ = RemappedIndex(j, MEM_CPU);
int numaDist = numa_distance(nodeI, nodeJ);
if (outputToCsv) if (outputToCsv)
printf("%d,", numaDist); printf("%d,", numaDist);
else else
printf(" %6d |", numaDist); printf(" %5d |", numaDist);
} }
int numCpus = 0; int numCpus = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++) for (int j = 0; j < numa_num_configured_cpus(); j++)
if (numa_node_of_cpu(j) == i) numCpus++; if (numa_node_of_cpu(j) == nodeI) numCpus++;
if (outputToCsv) if (outputToCsv)
printf("%d,", numCpus); printf("%d,", numCpus);
else else
printf(" %6d | ", numCpus); printf(" %5d | ", numCpus);
bool isFirst = true; bool isFirst = true;
for (int j = 0; j < numGpuDevices; j++) for (int j = 0; j < numGpuDevices; j++)
...@@ -869,7 +881,11 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt ...@@ -869,7 +881,11 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
} }
else if (memType == MEM_CPU) else if (memType == MEM_CPU)
{ {
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent)); if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
{
printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
exit(1);
}
} }
else if (memType == MEM_CPU_UNPINNED) else if (memType == MEM_CPU_UNPINNED)
{ {
...@@ -1150,9 +1166,10 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1150,9 +1166,10 @@ void RunTransfer(EnvVars const& ev, int const iteration,
else if (transfer->exeMemType == MEM_CPU) // CPU execution agent else if (transfer->exeMemType == MEM_CPU) // CPU execution agent
{ {
// Force this thread and all child threads onto correct NUMA node // Force this thread and all child threads onto correct NUMA node
if (numa_run_on_node(transfer->exeIndex)) int const exeIndex = RemappedIndex(transfer->exeIndex, MEM_CPU);
if (numa_run_on_node(exeIndex))
{ {
printf("[ERROR] Unable to set CPU to NUMA node %d\n", transfer->exeIndex); printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex);
exit(1); exit(1);
} }
...@@ -1179,9 +1196,8 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1179,9 +1196,8 @@ void RunTransfer(EnvVars const& ev, int const iteration,
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu) void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu)
{ {
// Collect the number of available CPUs/GPUs on this machine // Collect the number of available CPUs/GPUs on this machine
int numGpus; int const numGpus = ev.numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpus)); int const numCpus = ev.numCpuDevices;
int const numCpus = numa_num_configured_nodes();
int const numDevices = numCpus + numGpus; int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU // Enable peer to peer for each GPU
...@@ -1281,16 +1297,16 @@ double GetPeakBandwidth(EnvVars const& ev, ...@@ -1281,16 +1297,16 @@ double GetPeakBandwidth(EnvVars const& ev,
std::vector<Transfer> transfers(2); std::vector<Transfer> transfers(2);
transfers[0].srcMemType = transfers[1].dstMemType = srcMemType; transfers[0].srcMemType = transfers[1].dstMemType = srcMemType;
transfers[0].dstMemType = transfers[1].srcMemType = dstMemType; transfers[0].dstMemType = transfers[1].srcMemType = dstMemType;
transfers[0].srcIndex = transfers[1].dstIndex = RemappedIndex(srcIndex, srcMemType); transfers[0].srcIndex = transfers[1].dstIndex = srcIndex;
transfers[0].dstIndex = transfers[1].srcIndex = RemappedIndex(dstIndex, dstMemType); transfers[0].dstIndex = transfers[1].srcIndex = dstIndex;
transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float); transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float);
transfers[0].numBlocksToUse = transfers[1].numBlocksToUse = numBlocksToUse; transfers[0].numBlocksToUse = transfers[1].numBlocksToUse = numBlocksToUse;
// Either perform (local read + remote write), or (remote read + local write) // Either perform (local read + remote write), or (remote read + local write)
transfers[0].exeMemType = (readMode == 0 ? srcMemType : dstMemType); transfers[0].exeMemType = (readMode == 0 ? srcMemType : dstMemType);
transfers[1].exeMemType = (readMode == 0 ? dstMemType : srcMemType); transfers[1].exeMemType = (readMode == 0 ? dstMemType : srcMemType);
transfers[0].exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0].exeMemType); transfers[0].exeIndex = (readMode == 0 ? srcIndex : dstIndex);
transfers[1].exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1].exeMemType); transfers[1].exeIndex = (readMode == 0 ? dstIndex : srcIndex);
transfers.resize(isBidirectional + 1); transfers.resize(isBidirectional + 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment