Commit 2f047a8e authored by Gilbert Lee
Browse files

Adding support for NUMA nodes without CPUs

parent 8f88ce3f
# Changelog for TransferBench # Changelog for TransferBench
## v1.05
### Added
- Topology output now includes NUMA node information
- Support for NUMA nodes with no CPU cores (e.g. CXL memory)
### Removed
- SWEEP_SRC_IS_EXE environment variable
## v1.04 ## v1.04
### Added ### Added
- New environment variables for sweep based presets - New environment variables for sweep based presets
......
...@@ -26,7 +26,7 @@ THE SOFTWARE. ...@@ -26,7 +26,7 @@ THE SOFTWARE.
#include <algorithm> #include <algorithm>
#include <random> #include <random>
#include <time.h> #include <time.h>
#define TB_VERSION "1.04" #define TB_VERSION "1.05"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
...@@ -47,7 +47,6 @@ public: ...@@ -47,7 +47,6 @@ public:
int const DEFAULT_SAMPLING_FACTOR = 1; int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_TRANSFER = 4; int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
int const DEFAULT_SWEEP_SRC_IS_EXE = 0;
std::string const DEFAULT_SWEEP_SRC = "CG"; std::string const DEFAULT_SWEEP_SRC = "CG";
std::string const DEFAULT_SWEEP_EXE = "CG"; std::string const DEFAULT_SWEEP_EXE = "CG";
std::string const DEFAULT_SWEEP_DST = "CG"; std::string const DEFAULT_SWEEP_DST = "CG";
...@@ -76,7 +75,6 @@ public: ...@@ -76,7 +75,6 @@ public:
std::vector<float> fillPattern; // Pattern of floats used to fill source data std::vector<float> fillPattern; // Pattern of floats used to fill source data
// Environment variables only for Sweep-preset // Environment variables only for Sweep-preset
int sweepSrcIsExe; // Non-zero if executor should always be the same as source
int sweepMin; // Min number of simultaneous Transfers to be executed per test int sweepMin; // Min number of simultaneous Transfers to be executed per test
int sweepMax; // Max number of simultaneous Transfers to be executed per test int sweepMax; // Max number of simultaneous Transfers to be executed per test
int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit) int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit)
...@@ -95,6 +93,9 @@ public: ...@@ -95,6 +93,9 @@ public:
// Random generator // Random generator
std::default_random_engine *generator; std::default_random_engine *generator;
// Track how many CPUs are available per NUMA node
std::vector<int> numCpusPerNuma;
// Constructor that collects values // Constructor that collects values
EnvVars() EnvVars()
{ {
...@@ -122,7 +123,6 @@ public: ...@@ -122,7 +123,6 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
sweepSrcIsExe = GetEnvVar("SWEEP_SRC_IS_EXE" , DEFAULT_SWEEP_SRC_IS_EXE);
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN); sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
sweepMax = GetEnvVar("SWEEP_MAX" , DEFAULT_SWEEP_MAX); sweepMax = GetEnvVar("SWEEP_MAX" , DEFAULT_SWEEP_MAX);
sweepSrc = GetEnvVar("SWEEP_SRC" , DEFAULT_SWEEP_SRC); sweepSrc = GetEnvVar("SWEEP_SRC" , DEFAULT_SWEEP_SRC);
...@@ -287,6 +287,12 @@ public: ...@@ -287,6 +287,12 @@ public:
exit(1); exit(1);
} }
} }
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA without CPUs)
numCpusPerNuma.resize(numDetectedCpus);
int const totalCpus = numa_num_configured_cpus();
for (int i = 0; i < totalCpus; i++)
numCpusPerNuma[numa_node_of_cpu(i)]++;
} }
// Display info on the env vars that can be used // Display info on the env vars that can be used
...@@ -393,7 +399,6 @@ public: ...@@ -393,7 +399,6 @@ public:
printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str()); printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str()); printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str()); printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
printf("%-20s = %12d : Transfer executor %s Transfer source\n", "SWEEP_SRC_IS_EXE", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin); printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin);
printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax); printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax);
printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit); printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
...@@ -440,7 +445,6 @@ public: ...@@ -440,7 +445,6 @@ public:
printf("SWEEP_SRC,%s,Source Memory Types to sweep\n", sweepSrc.c_str()); printf("SWEEP_SRC,%s,Source Memory Types to sweep\n", sweepSrc.c_str());
printf("SWEEP_EXE,%s,Executor Types to sweep\n", sweepExe.c_str()); printf("SWEEP_EXE,%s,Executor Types to sweep\n", sweepExe.c_str());
printf("SWEEP_DST,%s,Destination Memory Types to sweep\n", sweepDst.c_str()); printf("SWEEP_DST,%s,Destination Memory Types to sweep\n", sweepDst.c_str());
printf("SWEEP_SRC_IS_EXE,%d, Transfer executor %s Transfer source\n", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
printf("SWEEP_SEED,%d,Random seed\n", sweepSeed); printf("SWEEP_SEED,%d,Random seed\n", sweepSeed);
printf("SWEEP_MIN,%d,Min simultaneous Transfers\n", sweepMin); printf("SWEEP_MIN,%d,Min simultaneous Transfers\n", sweepMin);
printf("SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)\n", sweepMax); printf("SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)\n", sweepMax);
......
...@@ -563,13 +563,76 @@ int RemappedIndex(int const origIdx, MemType const memType) ...@@ -563,13 +563,76 @@ int RemappedIndex(int const origIdx, MemType const memType)
void DisplayTopology(bool const outputToCsv) void DisplayTopology(bool const outputToCsv)
{ {
int numCpuDevices = numa_num_configured_nodes();
int numGpuDevices; int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices)); HIP_CALL(hipGetDeviceCount(&numGpuDevices));
if (outputToCsv) if (outputToCsv)
{ {
printf("NumCpus,%d\n", numa_num_configured_nodes()); printf("NumCpus,%d\n", numCpuDevices);
printf("NumGpus,%d\n", numGpuDevices); printf("NumGpus,%d\n", numGpuDevices);
}
else
{
printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
}
// Print out detected CPU topology
if (outputToCsv)
{
printf("NUMA");
for (int j = 0; j < numCpuDevices; j++)
printf(",NUMA%02d", j);
printf(",# CPUs,ClosestGPUs\n");
}
else
{
printf(" |");
for (int j = 0; j < numCpuDevices; j++)
printf("NUMA %02d |", j);
printf(" # Cpus | Closest GPU(s)\n");
for (int j = 0; j <= numCpuDevices; j++)
printf("--------+");
printf("--------+-------------\n");
}
for (int i = 0; i < numCpuDevices; i++)
{
printf("NUMA %02d%s", i, outputToCsv ? "," : " |");
for (int j = 0; j < numCpuDevices; j++)
{
int numaDist = numa_distance(i,j);
if (outputToCsv)
printf("%d,", numaDist);
else
printf(" %6d |", numaDist);
}
int numCpus = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++)
if (numa_node_of_cpu(j) == i) numCpus++;
if (outputToCsv)
printf("%d,", numCpus);
else
printf(" %6d | ", numCpus);
bool isFirst = true;
for (int j = 0; j < numGpuDevices; j++)
{
if (GetClosestNumaNode(RemappedIndex(j, MEM_GPU)) == i)
{
if (isFirst) isFirst = false;
else printf(",");
printf("%d", j);
}
}
printf("\n");
}
printf("\n");
// Print out detected GPU topology
if (outputToCsv)
{
printf("GPU"); printf("GPU");
for (int j = 0; j < numGpuDevices; j++) for (int j = 0; j < numGpuDevices; j++)
printf(",GPU %02d", j); printf(",GPU %02d", j);
...@@ -577,7 +640,6 @@ void DisplayTopology(bool const outputToCsv) ...@@ -577,7 +640,6 @@ void DisplayTopology(bool const outputToCsv)
} }
else else
{ {
printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
printf(" |"); printf(" |");
for (int j = 0; j < numGpuDevices; j++) for (int j = 0; j < numGpuDevices; j++)
printf(" GPU %02d |", j); printf(" GPU %02d |", j);
...@@ -1232,6 +1294,13 @@ double GetPeakBandwidth(EnvVars const& ev, ...@@ -1232,6 +1294,13 @@ double GetPeakBandwidth(EnvVars const& ev,
transfers[0]->exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0]->exeMemType); transfers[0]->exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0]->exeMemType);
transfers[1]->exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1]->exeMemType); transfers[1]->exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1]->exeMemType);
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{
if (transfers[i]->exeMemType == MEM_CPU && ev.numCpusPerNuma[transfers[i]->exeIndex] == 0)
return 0;
}
for (int i = 0; i <= isBidirectional; i++) for (int i = 0; i <= isBidirectional; i++)
{ {
AllocateMemory(transfers[i]->srcMemType, transfers[i]->srcIndex, AllocateMemory(transfers[i]->srcMemType, transfers[i]->srcIndex,
...@@ -1375,36 +1444,33 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co ...@@ -1375,36 +1444,33 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float)); std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets) // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
bool hasCpuExecutor = false;
bool hasGpuExecutor = false;
std::vector<std::pair<MemType, int>> exeList; std::vector<std::pair<MemType, int>> exeList;
for (auto exe : ev.sweepExe) for (auto exe : ev.sweepExe)
{ {
MemType const exeMemType = CharToMemType(exe); MemType const exeMemType = CharToMemType(exe);
int numDevices;
if (IsGpuType(exeMemType)) if (IsGpuType(exeMemType))
{ {
numDevices = ev.numGpuDevices; for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex)
hasGpuExecutor = true; exeList.push_back(std::make_pair(exeMemType, exeIndex));
} }
else else
{ {
numDevices = ev.numCpuDevices; for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex)
hasCpuExecutor = true; {
} // Skip NUMA nodes that have no CPUs (e.g. CXL)
for (int exeIndex = 0; exeIndex < numDevices; ++exeIndex) if (ev.numCpusPerNuma[exeIndex] == 0) continue;
exeList.push_back(std::make_pair(exeMemType, exeIndex)); exeList.push_back(std::make_pair(exeMemType, exeIndex));
} }
int numExes = ev.sweepSrcIsExe ? 1 : exeList.size(); }
}
int numExes = exeList.size();
std::vector<std::pair<MemType, int>> srcList; std::vector<std::pair<MemType, int>> srcList;
for (auto src : ev.sweepSrc) for (auto src : ev.sweepSrc)
{ {
MemType const srcMemType = CharToMemType(src); MemType const srcMemType = CharToMemType(src);
int const numDevices = IsGpuType(srcMemType) ? ev.numGpuDevices : ev.numCpuDevices; int const numDevices = IsGpuType(srcMemType) ? ev.numGpuDevices : ev.numCpuDevices;
// Skip source memory type if executor is supposed to be source but not specified
if ((IsGpuType(srcMemType) && !hasGpuExecutor) ||
(!IsGpuType(srcMemType) && !hasCpuExecutor)) continue;
for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex) for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
srcList.push_back(std::make_pair(srcMemType, srcIndex)); srcList.push_back(std::make_pair(srcMemType, srcIndex));
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment