Adding pcopy benchmark, fixing CPU kernel on null destination (#101)

* Adding pcopy benchmark, fixing CPU kernel on null destination

Adding pcopy benchmark, fixing CPU kernel on null destination (#101)
* Adding pcopy benchmark, fixing CPU kernel on null destination
eaf32b47 · gilbertlee-amd · GitHub · 4c4fa4a3 · eaf32b47 · eaf32b47
Unverified Commit eaf32b47 authored Apr 03, 2024 by gilbertlee-amd Committed by GitHub Apr 03, 2024
6 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,15 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).

+## v1.50
+
+### Added
+- Adding new parallel copy preset benchmark (pcopy)
+  - Usage: ./TransferBench pcopy <numBytes=64M> <#CUs=8> <srcGpu=0> <minGpus=1> <maxGpus=#GPU-1>
+### Fixed
+- Removed non-copies DMA Transfers (this had previously been using hipMemset)
+- Fixed CPU executor when operating on null destination
+
 ## v1.49

 ### Fixes

--- a/README.md
+++ b/README.md
@@ -70,10 +70,11 @@ make
  * `a2a`    : All-to-all benchmark test
  * `cmdline`: Take in Transfers to run from command-line instead of via file
  * `p2p`    : Peer-to-peer benchmark test
+  * `pcopy`  : Benchmark parallel copies from a single GPU to other GPUs
  * `rsweep` : Random sweep across possible sets of transfers
-  * `rwrite` : Benchmarks parallel remote writes from a single GPU
+  * `rwrite` : Benchmarks parallel remote writes from a single GPU to other GPUs
  * `scaling`: GPU subexecutor scaling tests
-  * 'schmoo` : Local/Remote read/write/copy between two GPUs
+  * `schmoo` : Local/Remote read/write/copy between two GPUs
  * `sweep`  : Sweep across possible sets of transfers

 * When using the same GPU executor in multiple simultaneous transfers on separate streams (USE_SINGLE_STREAM=0),

--- a/src/TransferBench.cpp
+++ b/src/TransferBench.cpp
@@ -175,6 +175,40 @@ int main(int argc, char **argv)
      } while (curr < N * 2);
    }
  }
+  else if (!strcmp(argv[1], "pcopy"))
+  {
+    if (ev.numGpuDevices < 2)
+    {
+      printf("[ERROR] Parallel copy benchmark requires at least 2 GPUs\n");
+      exit(1);
+    }
+    ev.DisplayParallelCopyEnvVars();
+
+    int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
+    int srcIdx      = (argc > 4 ? atoi(argv[4]) : 0);
+    int minGpus     = (argc > 5 ? atoi(argv[5]) : 1);
+    int maxGpus     = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
+
+    if (maxGpus > ev.gpuMaxHwQueues && ev.useDmaCopy)
+    {
+      printf("[ERROR] DMA executor %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
+             srcIdx, maxGpus, ev.gpuMaxHwQueues);
+      printf("[ERROR] Aborting to avoid misleading results due to potential serialization of Transfers\n");
+      exit(1);
+    }
+
+    for (int N = 256; N <= (1<<27); N *= 2)
+    {
+      int delta = std::max(1, N / ev.samplingFactor);
+      int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
+      do
+      {
+        RunParallelCopyBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
+        if (numBytesPerTransfer != 0) exit(0);
+        curr += delta;
+      } while (curr < N * 2);
+    }
+  }
  else if (!strcmp(argv[1], "cmdline"))
  {
    // Print environment variables and CSV header
@@ -969,16 +1003,22 @@ void DisplayUsage(char const* cmdName)
  printf("  config: Either:\n");
  printf("          - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
  printf("          - Name of preset config:\n");
+  printf("              a2a          - GPU All-To-All benchmark\n");
+  printf("                             - 3rd optional arg: # of SubExecs to use\n");
+  printf("              cmdline      - Read Transfers from command line arguments (after N)\n");
  printf("              p2p          - Peer-to-peer benchmark tests\n");
+  printf("              rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
+  printf("                             - 3rd optional arg: # GPU SubExecs per Transfer\n");
+  printf("                             - 4th optional arg: Root GPU index\n");
+  printf("                             - 5th optional arg: Min number of other GPUs to transfer to\n");
+  printf("                             - 6th optional arg: Max number of other GPUs to transfer to\n");
  printf("              sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
  printf("                             - 3rd optional arg: # GPU SubExecs per Transfer\n");
  printf("                             - 4th optional arg: # CPU SubExecs per Transfer\n");
-  printf("              scaling      - GPU SubExec scaling copy test\n");
+  printf("              scaling      - GPU GFX SubExec scaling copy test\n");
  printf("                             - 3th optional arg: Max # of SubExecs to use\n");
  printf("                             - 4rd optional arg: GPU index to use as executor\n");
-  printf("              a2a          - GPU All-To-All benchmark\n");
-  printf("                             - 3rd optional arg: # of SubExecs to use\n");
-  printf("              cmdline      - Read Transfers from command line arguments (after N)\n");
+  printf("              schmoo       - Local/RemoteRead/Write/Copy between two GPUs\n");
  printf("  N     : (Optional) Number of bytes to copy per Transfer.\n");
  printf("          If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
         DEFAULT_BYTES_PER_TRANSFER);
@@ -1413,9 +1453,9 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
      exit(1);
    }

-    if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1))
+    if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs != 1 || transfer.numDsts != 1))
    {
-      printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n");
+      printf("[ERROR] GPU DMA executor can only be used for single source + single dst copies\n");
      exit(1);
    }

@@ -1718,12 +1758,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
      hipEvent_t&  stopEvent  = exeInfo.stopEvents[transferIdx];

      HIP_CALL(hipEventRecord(startEvent, stream));
-      if (transfer->numSrcs == 0 && transfer->numDsts == 1)
-      {
-        HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
-                                MEMSET_CHAR, transfer->numBytesActual, stream));
-      }
-      else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
+      if (transfer->numSrcs == 1 && transfer->numDsts == 1)
      {
        HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
                                transfer->numBytesActual, hipMemcpyDefault,
@@ -2756,7 +2791,91 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
        {
          printf(" (%s %c%d %s)",
                 transfers[i].SrcToStr().c_str(),
-                 MemTypeStr[transfers[i].exeType], transfers[i].exeIndex,
+                 ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
+                 transfers[i].DstToStr().c_str());
+        }
+        printf("\n");
+      }
+    }
+  }
+}
+
+void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
+{
+  if (ev.useDmaCopy)
+    printf("Bytes to copy: %lu from GPU %d using DMA [Sweeping %d to %d parallel writes]\n",
+           numBytesPerTransfer, srcIdx, minGpus, maxGpus);
+  else
+    printf("Bytes to copy: %lu from GPU %d using GFX (%d CUs) [Sweeping %d to %d parallel writes]\n",
+           numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
+
+  char sep = (ev.outputToCsv ? ',' : ' ');
+
+  for (int i = 0; i < ev.numGpuDevices; i++)
+  {
+    if (i == srcIdx) continue;
+    printf("   GPU %-3d  %c", i, sep);
+  }
+  printf("\n");
+  if (!ev.outputToCsv)
+  {
+    for (int i = 0; i < ev.numGpuDevices-1; i++)
+    {
+      printf("-------------");
+    }
+    printf("\n");
+  }
+
+  for (int p = minGpus; p <= maxGpus; p++)
+  {
+    for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
+    {
+      if (bitmask & (1<<srcIdx)) continue;
+      if (__builtin_popcount(bitmask) == p)
+      {
+        std::vector<Transfer> transfers;
+        for (int i = 0; i < ev.numGpuDevices; i++)
+        {
+          if (bitmask & (1<<i))
+          {
+            Transfer t;
+            t.exeType     = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
+            t.exeSubIndex = -1;
+            t.numSubExecs = ev.useDmaCopy ? 1 : numSubExecs;
+            t.numBytes    = numBytesPerTransfer;
+
+            t.numSrcs     = 1;
+            t.numDsts     = 1;
+            t.exeIndex    = srcIdx;
+            t.srcType.resize(1);
+            t.srcType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
+            t.srcIndex.resize(1);
+            t.srcIndex[0] = srcIdx;
+            t.dstType.resize(1);
+            t.dstType[0]  = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
+            t.dstIndex.resize(1);
+            t.dstIndex[0] = i;
+
+            transfers.push_back(t);
+          }
+        }
+        ExecuteTransfers(ev, 0, 0, transfers, false);
+
+        int counter = 0;
+        for (int i = 0; i < ev.numGpuDevices; i++)
+        {
+          if (bitmask & (1<<i))
+            printf("  %8.3f  %c", transfers[counter++].transferBandwidth, sep);
+          else if (i != srcIdx)
+            printf("            %c", sep);
+        }
+
+        printf(" %d %d", p, numSubExecs);
+        for (auto i = 0; i < transfers.size(); i++)
+        {
+          printf(" (%s %c%d %s)",
+                 transfers[i].SrcToStr().c_str(),
+                 ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
                 transfers[i].DstToStr().c_str());
        }
        printf("\n");

--- a/src/include/EnvVars.hpp
+++ b/src/include/EnvVars.hpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"

-#define TB_VERSION "1.49"
+#define TB_VERSION "1.50"

 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -797,6 +797,18 @@ public:
    printf("\n");
  }

+  void DisplayParallelCopyEnvVars() const
+  {
+    DisplayEnvVars();
+    if (hideEnv) return;
+    if (!outputToCsv)
+      printf("[Parallel-copy Related]\n");
+    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
+             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
+    PRINT_EV("USE_GPU_DMA", useDmaCopy,
+             std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
+    printf("\n");
+  }

  // Helper function that gets parses environment variable or sets to default value
  static int GetEnvVar(std::string const& varname, int defaultValue)

--- a/src/include/Kernels.hpp
+++ b/src/include/Kernels.hpp
@@ -95,16 +95,32 @@ void CpuReduceKernel(SubExecParam const& p)
  else if (numSrcs == 1)
  {
    float const* __restrict__ src = p.src[0];
+    if (numDsts == 0)
+    {
+      float sum = 0.0;
+      for (int j = 0; j < p.N; j++)
+        sum += p.src[0][j];
+
+      // Add a dummy check to ensure the read is not optimized out
+      if (sum != sum)
+      {
+        printf("[ERROR] Nan detected\n");
+      }
+    }
+    else
+    {
      for (int i = 0; i < numDsts; ++i)
      {
        memcpy(p.dst[i], src, p.N * sizeof(float));
      }
    }
+  }
  else
  {
+    float sum = 0.0f;
    for (int j = 0; j < p.N; j++)
    {
-      float sum = p.src[0][j];
+      sum = p.src[0][j];
      for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
      for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
    }

--- a/src/include/TransferBench.hpp
+++ b/src/include/TransferBench.hpp
@@ -191,6 +191,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
 void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
 void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
 void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
+void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);

 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);