Commit 2f047a8e authored by Gilbert Lee
Browse files

Adding support for NUMA nodes without CPUs

parent 8f88ce3f
# Changelog for TransferBench
## v1.05
### Added
- Topology output now includes NUMA node information
- Support for NUMA nodes with no CPU cores (e.g. CXL memory)
### Removed
- SWEEP_SRC_IS_EXE environment variable
## v1.04
### Added
- New environment variables for sweep based presets
......
......@@ -26,7 +26,7 @@ THE SOFTWARE.
#include <algorithm>
#include <random>
#include <time.h>
#define TB_VERSION "1.04"
#define TB_VERSION "1.05"
extern char const MemTypeStr[];
......@@ -47,7 +47,6 @@ public:
int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
int const DEFAULT_SWEEP_SRC_IS_EXE = 0;
std::string const DEFAULT_SWEEP_SRC = "CG";
std::string const DEFAULT_SWEEP_EXE = "CG";
std::string const DEFAULT_SWEEP_DST = "CG";
......@@ -76,7 +75,6 @@ public:
std::vector<float> fillPattern; // Pattern of floats used to fill source data
// Environment variables only for Sweep-preset
int sweepSrcIsExe; // Non-zero if executor should always be the same as source
int sweepMin; // Min number of simultaneous Transfers to be executed per test
int sweepMax; // Max number of simultaneous Transfers to be executed per test
int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit)
......@@ -95,6 +93,9 @@ public:
// Random generator
std::default_random_engine *generator;
// Track how many CPUs are available per NUMA node
std::vector<int> numCpusPerNuma;
// Constructor that collects values
EnvVars()
{
......@@ -122,7 +123,6 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
sweepSrcIsExe = GetEnvVar("SWEEP_SRC_IS_EXE" , DEFAULT_SWEEP_SRC_IS_EXE);
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
sweepMax = GetEnvVar("SWEEP_MAX" , DEFAULT_SWEEP_MAX);
sweepSrc = GetEnvVar("SWEEP_SRC" , DEFAULT_SWEEP_SRC);
......@@ -287,6 +287,12 @@ public:
exit(1);
}
}
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA without CPUs)
numCpusPerNuma.resize(numDetectedCpus);
int const totalCpus = numa_num_configured_cpus();
for (int i = 0; i < totalCpus; i++)
numCpusPerNuma[numa_node_of_cpu(i)]++;
}
// Display info on the env vars that can be used
......@@ -393,7 +399,6 @@ public:
printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
printf("%-20s = %12d : Transfer executor %s Transfer source\n", "SWEEP_SRC_IS_EXE", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin);
printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax);
printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
......@@ -440,7 +445,6 @@ public:
printf("SWEEP_SRC,%s,Source Memory Types to sweep\n", sweepSrc.c_str());
printf("SWEEP_EXE,%s,Executor Types to sweep\n", sweepExe.c_str());
printf("SWEEP_DST,%s,Destination Memory Types to sweep\n", sweepDst.c_str());
printf("SWEEP_SRC_IS_EXE,%d, Transfer executor %s Transfer source\n", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
printf("SWEEP_SEED,%d,Random seed\n", sweepSeed);
printf("SWEEP_MIN,%d,Min simultaneous Transfers\n", sweepMin);
printf("SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)\n", sweepMax);
......
......@@ -563,13 +563,76 @@ int RemappedIndex(int const origIdx, MemType const memType)
void DisplayTopology(bool const outputToCsv)
{
int numCpuDevices = numa_num_configured_nodes();
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
if (outputToCsv)
{
printf("NumCpus,%d\n", numa_num_configured_nodes());
printf("NumCpus,%d\n", numCpuDevices);
printf("NumGpus,%d\n", numGpuDevices);
}
else
{
printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
}
// Print out detected CPU topology
if (outputToCsv)
{
printf("NUMA");
for (int j = 0; j < numCpuDevices; j++)
printf(",NUMA%02d", j);
printf(",# CPUs,ClosestGPUs\n");
}
else
{
printf(" |");
for (int j = 0; j < numCpuDevices; j++)
printf("NUMA %02d |", j);
printf(" # Cpus | Closest GPU(s)\n");
for (int j = 0; j <= numCpuDevices; j++)
printf("--------+");
printf("--------+-------------\n");
}
for (int i = 0; i < numCpuDevices; i++)
{
printf("NUMA %02d%s", i, outputToCsv ? "," : " |");
for (int j = 0; j < numCpuDevices; j++)
{
int numaDist = numa_distance(i,j);
if (outputToCsv)
printf("%d,", numaDist);
else
printf(" %6d |", numaDist);
}
int numCpus = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++)
if (numa_node_of_cpu(j) == i) numCpus++;
if (outputToCsv)
printf("%d,", numCpus);
else
printf(" %6d | ", numCpus);
bool isFirst = true;
for (int j = 0; j < numGpuDevices; j++)
{
if (GetClosestNumaNode(RemappedIndex(j, MEM_GPU)) == i)
{
if (isFirst) isFirst = false;
else printf(",");
printf("%d", j);
}
}
printf("\n");
}
printf("\n");
// Print out detected GPU topology
if (outputToCsv)
{
printf("GPU");
for (int j = 0; j < numGpuDevices; j++)
printf(",GPU %02d", j);
......@@ -577,7 +640,6 @@ void DisplayTopology(bool const outputToCsv)
}
else
{
printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
printf(" GPU %02d |", j);
......@@ -1232,6 +1294,13 @@ double GetPeakBandwidth(EnvVars const& ev,
transfers[0]->exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0]->exeMemType);
transfers[1]->exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1]->exeMemType);
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{
if (transfers[i]->exeMemType == MEM_CPU && ev.numCpusPerNuma[transfers[i]->exeIndex] == 0)
return 0;
}
for (int i = 0; i <= isBidirectional; i++)
{
AllocateMemory(transfers[i]->srcMemType, transfers[i]->srcIndex,
......@@ -1375,36 +1444,33 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
bool hasCpuExecutor = false;
bool hasGpuExecutor = false;
std::vector<std::pair<MemType, int>> exeList;
for (auto exe : ev.sweepExe)
{
MemType const exeMemType = CharToMemType(exe);
int numDevices;
if (IsGpuType(exeMemType))
{
numDevices = ev.numGpuDevices;
hasGpuExecutor = true;
for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex)
exeList.push_back(std::make_pair(exeMemType, exeIndex));
}
else
{
numDevices = ev.numCpuDevices;
hasCpuExecutor = true;
for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex)
{
// Skip NUMA nodes that have no CPUs (e.g. CXL)
if (ev.numCpusPerNuma[exeIndex] == 0) continue;
exeList.push_back(std::make_pair(exeMemType, exeIndex));
}
}
for (int exeIndex = 0; exeIndex < numDevices; ++exeIndex)
exeList.push_back(std::make_pair(exeMemType, exeIndex));
}
int numExes = ev.sweepSrcIsExe ? 1 : exeList.size();
int numExes = exeList.size();
std::vector<std::pair<MemType, int>> srcList;
for (auto src : ev.sweepSrc)
{
MemType const srcMemType = CharToMemType(src);
int const numDevices = IsGpuType(srcMemType) ? ev.numGpuDevices : ev.numCpuDevices;
// Skip source memory type if executor is supposed to be source but not specified
if ((IsGpuType(srcMemType) && !hasGpuExecutor) ||
(!IsGpuType(srcMemType) && !hasCpuExecutor)) continue;
for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
srcList.push_back(std::make_pair(srcMemType, srcIndex));
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment