Adding support for NUMA nodes without CPUs

2f047a8e · Gilbert Lee · 8f88ce3f · 2f047a8e · 2f047a8e · 2f047a8e
Commit 2f047a8e authored Aug 25, 2022 by Gilbert Lee
Hide whitespace changes
Inline Side-by-side

Showing with 98 additions and 21 deletions

CHANGELOG.md CHANGELOG.md +7 -0

EnvVars.hpp EnvVars.hpp +10 -6

TransferBench.cpp TransferBench.cpp +81 -15

No files found.
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog for TransferBench

+## v1.05
+### Added
+- Topology output now includes NUMA node information
+- Support for NUMA nodes with no CPU cores (e.g. CXL memory)
+### Removed
+- SWEEP_SRC_IS_EXE environment variable
+
 ## v1.04
 ### Added
 - New environment variables for sweep based presets

--- a/EnvVars.hpp
+++ b/EnvVars.hpp
@@ -26,7 +26,7 @@ THE SOFTWARE.
 #include <algorithm>
 #include <random>
 #include <time.h>
-#define TB_VERSION "1.04"
+#define TB_VERSION "1.05"

 extern char const MemTypeStr[];

@@ -47,7 +47,6 @@ public:
  int const DEFAULT_SAMPLING_FACTOR      =  1;
  int const DEFAULT_NUM_CPU_PER_TRANSFER =  4;

-  int const DEFAULT_SWEEP_SRC_IS_EXE  = 0;
  std::string const DEFAULT_SWEEP_SRC = "CG";
  std::string const DEFAULT_SWEEP_EXE = "CG";
  std::string const DEFAULT_SWEEP_DST = "CG";
@@ -76,7 +75,6 @@ public:
  std::vector<float> fillPattern; // Pattern of floats used to fill source data

  // Environment variables only for Sweep-preset
-  int sweepSrcIsExe;     // Non-zero if executor should always be the same as source
  int sweepMin;          // Min number of simultaneous Transfers to be executed per test
  int sweepMax;          // Max number of simulatneous Transfers to be executed per test
  int sweepTestLimit;    // Max number of tests to run during sweep (0 = no limit)
@@ -95,6 +93,9 @@ public:
  // Random generator
  std::default_random_engine *generator;

+  // Track how many CPUs are available per NUMA node
+  std::vector<int> numCpusPerNuma;
+
  // Constructor that collects values
  EnvVars()
  {
@@ -122,7 +123,6 @@ public:
    usePcieIndexing   = GetEnvVar("USE_PCIE_INDEX"      , 0);
    useSingleStream   = GetEnvVar("USE_SINGLE_STREAM"   , 0);

-    sweepSrcIsExe     = GetEnvVar("SWEEP_SRC_IS_EXE"    , DEFAULT_SWEEP_SRC_IS_EXE);
    sweepMin          = GetEnvVar("SWEEP_MIN"           , DEFAULT_SWEEP_MIN);
    sweepMax          = GetEnvVar("SWEEP_MAX"           , DEFAULT_SWEEP_MAX);
    sweepSrc          = GetEnvVar("SWEEP_SRC"           , DEFAULT_SWEEP_SRC);
@@ -287,6 +287,12 @@ public:
        exit(1);
      }
    }
+
+    // Determine how many CPUs exist per NUMA node (to avoid executing on NUMA nodes without CPUs)
+    numCpusPerNuma.resize(numDetectedCpus);
+    int const totalCpus = numa_num_configured_cpus();
+    for (int i = 0; i < totalCpus; i++)
+      numCpusPerNuma[numa_node_of_cpu(i)]++;
  }

  // Display info on the env vars that can be used
@@ -393,7 +399,6 @@ public:
      printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
      printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
      printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
-      printf("%-20s = %12d : Transfer executor %s Transfer source\n", "SWEEP_SRC_IS_EXE", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
      printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin);
      printf("%-20s = %12d : Max simultaneous Transfers              (0 = no limit)\n", "SWEEP_MAX", sweepMax);
      printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
@@ -440,7 +445,6 @@ public:
      printf("SWEEP_SRC,%s,Source Memory Types to sweep\n", sweepSrc.c_str());
      printf("SWEEP_EXE,%s,Executor Types to sweep\n", sweepExe.c_str());
      printf("SWEEP_DST,%s,Destination Memory Types to sweep\n", sweepDst.c_str());
-      printf("SWEEP_SRC_IS_EXE,%d, Transfer executor %s Transfer source\n", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
      printf("SWEEP_SEED,%d,Random seed\n", sweepSeed);
      printf("SWEEP_MIN,%d,Min simultaneous Transfers\n", sweepMin);
      printf("SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)\n", sweepMax);

--- a/TransferBench.cpp
+++ b/TransferBench.cpp
@@ -563,13 +563,76 @@ int RemappedIndex(int const origIdx, MemType const memType)

 void DisplayTopology(bool const outputToCsv)
 {
+  int numCpuDevices = numa_num_configured_nodes();
  int numGpuDevices;
  HIP_CALL(hipGetDeviceCount(&numGpuDevices));

  if (outputToCsv)
  {
-    printf("NumCpus,%d\n", numa_num_configured_nodes());
+    printf("NumCpus,%d\n", numCpuDevices);
    printf("NumGpus,%d\n", numGpuDevices);
+  }
+  else
+  {
+    printf("\nDetected topology: %d CPU NUMA node(s)   %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
+  }
+
+  // Print out detected CPU topology
+  if (outputToCsv)
+  {
+    printf("NUMA");
+    for (int j = 0; j < numCpuDevices; j++)
+      printf(",NUMA%02d", j);
+    printf(",# CPUs,ClosestGPUs\n");
+  }
+  else
+  {
+    printf("        |");
+    for (int j = 0; j < numCpuDevices; j++)
+      printf("NUMA %02d |", j);
+    printf(" # Cpus | Closest GPU(s)\n");
+    for (int j = 0; j <= numCpuDevices; j++)
+      printf("--------+");
+    printf("--------+-------------\n");
+  }
+
+  for (int i = 0; i < numCpuDevices; i++)
+  {
+    printf("NUMA %02d%s", i, outputToCsv ? "," : " |");
+    for (int j = 0; j < numCpuDevices; j++)
+    {
+      int numaDist = numa_distance(i,j);
+      if (outputToCsv)
+	printf("%d,", numaDist);
+      else
+	printf(" %6d |", numaDist);
+    }
+
+    int numCpus = 0;
+    for (int j = 0; j < numa_num_configured_cpus(); j++)
+      if (numa_node_of_cpu(j) == i) numCpus++;
+    if (outputToCsv)
+      printf("%d,", numCpus);
+    else
+      printf(" %6d | ", numCpus);
+
+    bool isFirst = true;
+    for (int j = 0; j < numGpuDevices; j++)
+    {
+      if (GetClosestNumaNode(RemappedIndex(j, MEM_GPU)) == i)
+      {
+        if (isFirst) isFirst = false;
+	else printf(",");
+	printf("%d", j);
+      }
+    }
+    printf("\n");
+  }
+  printf("\n");
+
+  // Print out detected GPU topology
+  if (outputToCsv)
+  {
    printf("GPU");
    for (int j = 0; j < numGpuDevices; j++)
      printf(",GPU %02d", j);
@@ -577,7 +640,6 @@ void DisplayTopology(bool const outputToCsv)
  }
  else
  {
-    printf("\nDetected topology: %d CPU NUMA node(s)   %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
    printf("        |");
    for (int j = 0; j < numGpuDevices; j++)
      printf(" GPU %02d |", j);
@@ -1232,6 +1294,13 @@ double GetPeakBandwidth(EnvVars const& ev,
  transfers[0]->exeIndex   = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0]->exeMemType);
  transfers[1]->exeIndex   = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1]->exeMemType);

+  // Abort if executing on NUMA node with no CPUs
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    if (transfers[i]->exeMemType == MEM_CPU && ev.numCpusPerNuma[transfers[i]->exeIndex] == 0)
+      return 0;
+  }
+
  for (int i = 0; i <= isBidirectional; i++)
  {
    AllocateMemory(transfers[i]->srcMemType, transfers[i]->srcIndex,
@@ -1375,36 +1444,33 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
  std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));

  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
-  bool hasCpuExecutor = false;
-  bool hasGpuExecutor = false;
  std::vector<std::pair<MemType, int>> exeList;
  for (auto exe : ev.sweepExe)
  {
    MemType const exeMemType = CharToMemType(exe);
-    int numDevices;
    if (IsGpuType(exeMemType))
    {
-      numDevices = ev.numGpuDevices;
-      hasGpuExecutor = true;
+      for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex)
+        exeList.push_back(std::make_pair(exeMemType, exeIndex));
    }
    else
    {
-      numDevices = ev.numCpuDevices;
-      hasCpuExecutor = true;
+      for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex)
+      {
+        // Skip NUMA nodes that have no CPUs (e.g. CXL)
+        if (ev.numCpusPerNuma[exeIndex] == 0) continue;
+        exeList.push_back(std::make_pair(exeMemType, exeIndex));
+      }
    }
-    for (int exeIndex = 0; exeIndex < numDevices; ++exeIndex)
-      exeList.push_back(std::make_pair(exeMemType, exeIndex));
  }
-  int numExes = ev.sweepSrcIsExe ? 1 : exeList.size();
+  int numExes = exeList.size();

  std::vector<std::pair<MemType, int>> srcList;
  for (auto src : ev.sweepSrc)
  {
    MemType const srcMemType = CharToMemType(src);
    int const numDevices = IsGpuType(srcMemType) ? ev.numGpuDevices : ev.numCpuDevices;
-    // Skip source memory type if executor is supposed to be source but not specified
-    if ((IsGpuType(srcMemType) && !hasGpuExecutor) ||
-        (!IsGpuType(srcMemType) && !hasCpuExecutor)) continue;
+
    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
      srcList.push_back(std::make_pair(srcMemType, srcIndex));
  }