Commit 1c2484aa authored by Sam Wu

Update doc reqs

parents 6b1aa592 3ee75292
......@@ -3,6 +3,15 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.50
### Added
- Added new parallel copy preset benchmark (pcopy)
- Usage: ./TransferBench pcopy <numBytes=64M> <#CUs=8> <srcGpu=0> <minGpus=1> <maxGpus=#GPU-1>
### Fixed
- Removed non-copy DMA Transfers (these had previously been implemented using hipMemset)
- Fixed CPU executor when operating on null destination
## v1.49
### Fixes
......
......@@ -70,10 +70,11 @@ make
* `a2a` : All-to-all benchmark test
* `cmdline`: Take in Transfers to run from command-line instead of via file
* `p2p` : Peer-to-peer benchmark test
* `pcopy` : Benchmark parallel copies from a single GPU to other GPUs
* `rsweep` : Random sweep across possible sets of transfers
* `rwrite` : Benchmarks parallel remote writes from a single GPU
* `rwrite` : Benchmarks parallel remote writes from a single GPU to other GPUs
* `scaling`: GPU subexecutor scaling tests
* 'schmoo` : Local/Remote read/write/copy between two GPUs
* `schmoo` : Local/Remote read/write/copy between two GPUs
* `sweep` : Sweep across possible sets of transfers
* When using the same GPU executor in multiple simultaneous transfers on separate streams (USE_SINGLE_STREAM=0),
......
rocm-docs-core==0.36.0
rocm-docs-core==0.38.1
......@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --resolver=backtracking requirements.in
# pip-compile --resolver=backtracking docs/sphinx/requirements.in
#
accessible-pygments==0.0.4
# via pydata-sphinx-theme
......@@ -25,54 +25,35 @@ cffi==1.16.0
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# click-log
# doxysphinx
# sphinx-external-toc
click-log==0.4.0
# via doxysphinx
# via sphinx-external-toc
cryptography==42.0.5
# via pyjwt
deprecated==1.2.14
# via pygithub
docutils==0.17.1
docutils==0.19
# via
# breathe
# myst-parser
# pybtex-docutils
# pydata-sphinx-theme
# sphinx
# sphinxcontrib-bibtex
doxysphinx==3.3.7
# via rocm-docs-core
fastjsonschema==2.19.1
# via rocm-docs-core
gitdb==4.0.11
# via gitpython
gitpython==3.1.42
gitpython==3.1.43
# via rocm-docs-core
idna==3.6
idna==3.7
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==7.0.2
# via
# sphinx
# sphinxcontrib-bibtex
importlib-resources==6.3.2
# via
# mpire
# rocm-docs-core
importlib-metadata==7.1.0
# via sphinx
importlib-resources==6.4.0
# via rocm-docs-core
jinja2==3.1.3
# via
# myst-parser
# sphinx
latexcodec==3.0.0
# via pybtex
libsass==0.22.0
# via doxysphinx
lxml==4.9.4
# via doxysphinx
markdown-it-py==2.2.0
# via
# mdit-py-plugins
......@@ -83,60 +64,44 @@ mdit-py-plugins==0.3.5
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
mpire==2.10.1
# via doxysphinx
myst-parser==1.0.0
# via rocm-docs-core
numpy==1.24.4
# via -r requirements.in
packaging==24.0
# via
# pydata-sphinx-theme
# sphinx
pybtex==0.24.0
# via
# pybtex-docutils
# sphinxcontrib-bibtex
pybtex-docutils==1.0.3
# via sphinxcontrib-bibtex
pycparser==2.21
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.14.4
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.2.0
pygithub==2.3.0
# via rocm-docs-core
pygments==2.17.2
# via
# accessible-pygments
# mpire
# pydata-sphinx-theme
# sphinx
pyjson5==1.6.6
# via doxysphinx
pyjwt[crypto]==2.8.0
# via pygithub
# via
# pygithub
# pyjwt
pynacl==1.5.0
# via pygithub
pyparsing==3.1.2
# via doxysphinx
pytz==2024.1
# via babel
pyyaml==6.0.1
# via
# myst-parser
# pybtex
# rocm-docs-core
# sphinx-external-toc
requests==2.31.0
# via
# pygithub
# sphinx
rocm-docs-core[api_reference]==0.37.0
# via -r requirements.in
six==1.16.0
# via pybtex
rocm-docs-core==0.38.1
# via -r docs/sphinx/requirements.in
smmap==5.0.1
# via gitdb
snowballstemmer==2.2.0
......@@ -154,7 +119,6 @@ sphinx==5.3.0
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
# sphinxcontrib-bibtex
sphinx-book-theme==1.0.1
# via rocm-docs-core
sphinx-copybutton==0.5.2
......@@ -167,8 +131,6 @@ sphinx-notfound-page==1.0.0
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.4
# via sphinx
sphinxcontrib-bibtex==2.6.2
# via -r requirements.in
sphinxcontrib-devhelp==1.0.2
# via sphinx
sphinxcontrib-htmlhelp==2.0.1
......@@ -179,9 +141,7 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
tqdm==4.66.2
# via mpire
typing-extensions==4.10.0
typing-extensions==4.11.0
# via
# pydata-sphinx-theme
# pygithub
......
......@@ -175,6 +175,40 @@ int main(int argc, char **argv)
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "pcopy"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Parallel copy benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayParallelCopyEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
if (maxGpus > ev.gpuMaxHwQueues && ev.useDmaCopy)
{
printf("[ERROR] DMA executor %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
srcIdx, maxGpus, ev.gpuMaxHwQueues);
printf("[ERROR] Aborting to avoid misleading results due to potential serialization of Transfers\n");
exit(1);
}
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunParallelCopyBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables and CSV header
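As a note on the pcopy handler above: it sweeps transfer sizes from 256 floats up to 2^27 floats, doubling each time and sampling intermediate sizes every N / samplingFactor elements, unless a fixed byte count was passed on the command line. Below is a minimal standalone sketch of that size-sweep pattern; the samplingFactor value is assumed here purely for illustration (TransferBench takes it from ev.samplingFactor).

```cpp
// Sketch only: mirrors the pcopy size-sweep loop above, as a standalone program.
// samplingFactor is an assumed value here; TransferBench reads it from ev.samplingFactor.
#include <algorithm>
#include <cstdio>

int main()
{
  int const samplingFactor = 8;  // assumed for illustration
  for (int N = 256; N <= (1 << 27); N *= 2)
  {
    int const delta = std::max(1, N / samplingFactor);
    // With no fixed byte count, sizes from N up to (but excluding) 2*N floats are sampled
    for (int curr = N; curr < N * 2; curr += delta)
      printf("Would benchmark %zu bytes\n", curr * sizeof(float));
  }
  return 0;
}
```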
......@@ -969,16 +1003,22 @@ void DisplayUsage(char const* cmdName)
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: Root GPU index\n");
printf(" - 5th optional arg: Min number of other GPUs to transfer to\n");
printf(" - 6th optional arg: Max number of other GPUs to transfer to\n");
printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: # CPU SubExecs per Transfer\n");
printf(" scaling - GPU SubExec scaling copy test\n");
printf(" scaling - GPU GFX SubExec scaling copy test\n");
printf(" - 3th optional arg: Max # of SubExecs to use\n");
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" schmoo - Local/RemoteRead/Write/Copy between two GPUs\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
......@@ -1413,9 +1453,9 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
exit(1);
}
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1))
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs != 1 || transfer.numDsts != 1))
{
printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n");
printf("[ERROR] GPU DMA executor can only be used for single source + single dst copies\n");
exit(1);
}
......@@ -1718,12 +1758,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
HIP_CALL(hipEventRecord(startEvent, stream));
if (transfer->numSrcs == 0 && transfer->numDsts == 1)
{
HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
MEMSET_CHAR, transfer->numBytesActual, stream));
}
else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
if (transfer->numSrcs == 1 && transfer->numDsts == 1)
{
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
......@@ -2756,7 +2791,91 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
MemTypeStr[transfers[i].exeType], transfers[i].exeIndex,
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
if (ev.useDmaCopy)
printf("Bytes to copy: %lu from GPU %d using DMA [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, minGpus, maxGpus);
else
printf("Bytes to copy: %lu from GPU %d using GFX (%d CUs) [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = ev.useDmaCopy ? 1 : numSubExecs;
t.numBytes = numBytesPerTransfer;
t.numSrcs = 1;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
......
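The destination sweep in RunParallelCopyBenchmark above works by enumerating every bitmask over the GPU indices, skipping any mask that contains the source GPU and keeping only those whose population count matches the requested number of parallel copies. Below is a minimal standalone sketch of that subset-enumeration pattern; the GPU count and source index are assumed values chosen for illustration, not taken from the commit.

```cpp
// Sketch only: the destination-subset enumeration used by RunParallelCopyBenchmark,
// extracted into a standalone program. numGpuDevices and srcIdx are assumed values.
#include <cstdio>

int main()
{
  int const numGpuDevices = 4;  // assumed GPU count
  int const srcIdx        = 0;  // assumed source GPU index
  for (int p = 1; p <= numGpuDevices - 1; p++)          // number of parallel copies
  {
    for (int bitmask = 0; bitmask < (1 << numGpuDevices); bitmask++)
    {
      if (bitmask & (1 << srcIdx)) continue;            // source GPU cannot be a destination
      if (__builtin_popcount(bitmask) != p) continue;   // GCC/Clang builtin, as in the commit
      printf("p=%d destinations:", p);
      for (int i = 0; i < numGpuDevices; i++)
        if (bitmask & (1 << i)) printf(" GPU %d", i);
      printf("\n");
    }
  }
  return 0;
}
```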
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.49"
#define TB_VERSION "1.50"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -797,6 +797,18 @@ public:
printf("\n");
}
void DisplayParallelCopyEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Parallel-copy Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
printf("\n");
}
// Helper function that parses an environment variable or sets it to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
......
......@@ -95,16 +95,32 @@ void CpuReduceKernel(SubExecParam const& p)
else if (numSrcs == 1)
{
float const* __restrict__ src = p.src[0];
if (numDsts == 0)
{
float sum = 0.0;
for (int j = 0; j < p.N; j++)
sum += p.src[0][j];
// Add a dummy check to ensure the read is not optimized out
if (sum != sum)
{
printf("[ERROR] Nan detected\n");
}
}
else
{
for (int i = 0; i < numDsts; ++i)
{
memcpy(p.dst[i], src, p.N * sizeof(float));
}
}
}
else
{
float sum = 0.0f;
for (int j = 0; j < p.N; j++)
{
float sum = p.src[0][j];
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
}
......
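The new read-only branch in CpuReduceKernel above accumulates the source values and then applies a self-comparison NaN check so the summation, and therefore the reads being timed, is not optimized away. Below is a minimal standalone sketch of the same idiom; the buffer size and fill value are assumed for illustration.

```cpp
// Sketch only: the read-only benchmark idiom from the CpuReduceKernel change above,
// as a standalone program. Buffer size and fill value are assumed for illustration.
#include <cstdio>
#include <vector>

int main()
{
  std::vector<float> src(1 << 20, 1.0f);  // assumed 1M-element source buffer

  float sum = 0.0f;
  for (size_t j = 0; j < src.size(); j++)
    sum += src[j];                        // the reads being benchmarked

  // Dummy check so the compiler does not discard the reads; (sum != sum) is true only for NaN
  if (sum != sum)
    printf("[ERROR] NaN detected\n");
  return 0;
}
```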
......@@ -191,6 +191,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
......