Unverified Commit 0b7b979e authored by gilbertlee-amd, committed by GitHub

Minor changes to p2p, a2a (#55)

parent a9cb3a25
# Changelog for TransferBench
## v1.28
### Added
- Added A2A_DIRECT, which runs all-to-all only between directly connected GPUs (now on by default)
- Added average statistics for p2p and a2a benchmarks
- Added USE_FINE_GRAIN for the p2p benchmark (see the sketch below)
  - On older devices, p2p timing with the default coarse-grained device memory stops as soon as the request
    is handed to the data fabric, not when the data actually arrives remotely, which may artificially inflate
    bandwidth numbers, especially when sending small amounts of data
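A minimal sketch of how a fine-grained device buffer can be requested through standard HIP APIs; this is not TransferBench's actual allocation path, and `AllocDeviceBuffer` is a hypothetical helper:

```cpp
#include <hip/hip_runtime.h>

// Hypothetical helper: allocate a device buffer with the requested coherence.
float* AllocDeviceBuffer(size_t numBytes, bool useFineGrain)
{
  float* ptr = nullptr;
  if (useFineGrain) {
    // Fine-grained device memory stays coherent with remote agents, so a
    // timed write only counts as complete once the data is visible remotely.
    hipExtMallocWithFlags((void**)&ptr, numBytes, hipDeviceMallocFinegrained);
  } else {
    // Default coarse-grained device memory.
    hipMalloc((void**)&ptr, numBytes);
  }
  return ptr;
}
```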
### Modified
- Modified P2P output to help distinguish between CPU / GPU devices
### Fixed
- Fixed Makefile target to prevent unnecessary re-compilation
## v1.27
### Added
- Added a cmdline preset that allows specifying simple tests on the command line
@@ -7,9 +7,9 @@ NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
EXE=../TransferBenchCuda
else
EXE=TransferBench
EXE=../TransferBench
endif
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
@@ -17,11 +17,11 @@ NVFLAGS = -O3 -g -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gen
LDFLAGS += -lpthread
all: $(EXE)
TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o ../$@ $(LDFLAGS)
../TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $< -o ../$@ $(LDFLAGS)
../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)
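# Using the real output path (../TransferBench*) as the target name lets make
# compare timestamps against the binary it actually produces, so unchanged
# sources no longer force a rebuild.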
clean:
rm -f *.o ../TransferBench ../TransferBenchCuda
@@ -1457,6 +1457,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
printf("%7s %02d", "CPU", i);
if (ev.outputToCsv) printf(",");
}
if (numCpus > 0) printf(" ");
for (int i = 0; i < numGpus; i++)
{
printf("%7s %02d", "GPU", i);
@@ -1464,30 +1465,38 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
}
printf("\n");
double avgBwSum[2][2] = {};
int avgCount[2][2] = {};
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
MemType const srcTypeActual = ((ev.useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
srcType);
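// When USE_FINE_GRAIN is set, promote the nominal memory type to its
// fine-grained variant (MEM_CPU -> MEM_CPU_FINE, MEM_GPU -> MEM_GPU_FINE).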
std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
if (src == numCpus && src != 0) printf("\n");
for (int dst = 0; dst < numDevices; dst++)
{
MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
MemType const dstTypeActual = ((ev.useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
dstType);
// Prepare Transfers
std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = N * sizeof(float);
transfers[0].srcType.push_back(srcType);
transfers[0].dstType.push_back(dstType);
transfers[0].srcType.push_back(srcTypeActual);
transfers[0].dstType.push_back(dstTypeActual);
transfers[0].srcIndex.push_back(srcIndex);
transfers[0].dstIndex.push_back(dstIndex);
transfers[0].numSrcs = transfers[0].numDsts = 1;
@@ -1500,8 +1509,8 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
{
transfers[1].numBytes = N * sizeof(float);
transfers[1].numSrcs = transfers[1].numDsts = 1;
transfers[1].srcType.push_back(dstType);
transfers[1].dstType.push_back(srcType);
transfers[1].srcType.push_back(dstTypeActual);
transfers[1].dstType.push_back(srcTypeActual);
transfers[1].srcIndex.push_back(dstIndex);
transfers[1].dstIndex.push_back(srcIndex);
transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
@@ -1542,6 +1551,12 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
avgBandwidth[dir].push_back(avgBw);
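// Accumulate per-class (srcType x dstType) sums, skipping self-copies,
// for the average summary printed after the table.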
if (!(srcType == dstType && srcIndex == dstIndex))
{
avgBwSum[srcType][dstType] += avgBw;
avgCount[srcType][dstType]++;
}
if (ev.showIterations)
{
double minTime = transfers[dir].perIterationTime[0];
@@ -1583,6 +1598,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
for (int dst = 0; dst < numDevices; dst++)
{
if (dst == numCpus && dst != 0) printf(" ");
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
@@ -1601,6 +1617,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
for (int i = 0; i < numDevices; i++)
{
double const minBw = minBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (minBw == 0.0)
printf("%10s", "N/A");
else
@@ -1615,6 +1632,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
for (int i = 0; i < numDevices; i++)
{
double const maxBw = maxBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (maxBw == 0.0)
printf("%10s", "N/A");
else
@@ -1629,6 +1647,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
for (int i = 0; i < numDevices; i++)
{
double const sd = stdDev[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (sd == -1.0)
printf("%10s", "N/A");
else
@@ -1647,16 +1666,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
for (int dst = 0; dst < numDevices; dst++)
{
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (dst == numCpus && dst != 0) printf(" ");
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
if (src < numDevices - 1) printf("\n\n");
printf("\n");
if (src < numDevices - 1) printf("\n");
}
}
printf("\n");
if (!ev.outputToCsv)
{
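// Summary row: mean bandwidth over all measured device pairs in each of the
// four (srcType, dstType) classes, using the sums accumulated above.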
printf(" ");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
printf("\n");
printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
{
if (avgCount[srcType][dstType])
printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
else
printf("%10s", "N/A");
}
printf("\n\n");
}
}
}
@@ -1732,7 +1772,7 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
{
ev.DisplayEnvVars();
ev.DisplayA2AEnvVars();
// Collect the number of GPU devices to use
int const numGpus = ev.numGpuDevices;
@@ -1763,14 +1803,27 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
for (int j = 0; j < numGpus; j++)
{
transfer.dstIndex[0] = j;
if (ev.a2aDirect)
{
#if !defined(__NVCC__)
if (i == j) continue;
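// Query the link between the two GPUs; a hop count of 1 means they are
// directly connected (e.g. a single XGMI hop), so skip multi-hop pairs.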
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
RemappedIndex(j, false),
&linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between every pair of GPUs using %d CUs\n", numBytesPerTransfer, numSubExecs);
printf("- All numbers reported as GB/sec\n\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, ev.a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
double totalBandwidthCpu = 0;
ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, true, &totalBandwidthCpu);
@@ -1780,21 +1833,52 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
printf("SRC\\DST");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf("\n");
printf(" %cSTotal\n", separator);
std::map<std::pair<int, int>, int> reIndex;
for (int i = 0; i < transfers.size(); i++)
{
Transfer const& t = transfers[i];
reIndex[std::make_pair(t.srcIndex[0], t.dstIndex[0])] = i;
}
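// With A2A_DIRECT, transfers no longer form a dense numGpus x numGpus grid,
// so each (src, dst) pair is looked up through reIndex instead of src * numGpus + dst.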
double totalBandwidthGpu = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++)
{
double rowTotalBandwidth = 0;
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++)
{
Transfer const& transfer = transfers[src * numGpus + dst];
double transferDurationMsec = transfer.transferTime / (1.0 * ev.numIterations);
double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
printf("%c%7.2f ", separator, transferBandwidthGbs);
if (reIndex.count(std::make_pair(src, dst)))
{
Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
double transferDurationMsec = transfer.transferTime / (1.0 * ev.numIterations);
double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
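// transferTime accumulates msec over all iterations, so after dividing by
// numIterations: (bytes / 1e9) GB / (msec / 1000) sec = GB/s.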
colTotalBandwidth[dst] += transferBandwidthGbs;
rowTotalBandwidth += transferBandwidthGbs;
totalBandwidthGpu += transferBandwidthGbs;
printf("%c%7.2f ", separator, transferBandwidthGbs);
}
else
{
printf("%c%7s ", separator, "N/A");
}
}
printf("\n");
printf(" %c%7.2f\n", separator, rowTotalBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
}
printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu);
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++)
{
printf("%c%7.2f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%7.2f\n", separator, colTotalBandwidth[numGpus]);
printf("\n");
printf("Average bandwidth (GPU Timed): %7.2f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %7.2f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %7.2f GB/s\n", totalBandwidthCpu);
}
void Transfer::PrepareSubExecParams(EnvVars const& ev)
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.27"
#define TB_VERSION "1.28"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
@@ -92,6 +92,7 @@ public:
int p2pMode; // Both = 0, Unidirectional = 1, Bidirectional = 2
int useDmaCopy; // Use DMA copy instead of GPU copy
int useRemoteRead; // Use destination memory type as executor instead of source memory type
int useFineGrain; // Use fine-grained memory
// Environment variables only for Sweep-preset
int sweepMin; // Min number of simultaneous Transfers to be executed per test
@@ -106,6 +107,9 @@ public:
std::string sweepExe; // Set of executors to be swept
std::string sweepDst; // Set of dst memory types to be swept
// Environment variables only for A2A preset
int a2aDirect; // Only execute on links that are directly connected
// Developer features
int enableDebug; // Enable debug output
int gpuKernel; // Which GPU kernel to use
@@ -170,11 +174,13 @@ public:
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
// P2P Benchmark related
useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0);
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); // Needed for numGpuSubExecs
numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
p2pMode = GetEnvVar("P2P_MODE" , 0);
useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0);
useFineGrain = GetEnvVar("USE_FINE_GRAIN" , 0);
// Sweep related
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
@@ -188,6 +194,9 @@ public:
sweepXgmiMax = GetEnvVar("SWEEP_XGMI_MAX" , -1);
sweepRandBytes = GetEnvVar("SWEEP_RAND_BYTES" , 0);
// A2A Benchmark related
a2aDirect = GetEnvVar("A2A_DIRECT" , 1);
// Determine random seed
char *sweepSeedStr = getenv("SWEEP_SEED");
sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
@@ -517,6 +526,9 @@ public:
std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
p2pMode == 2 ? "Bidirectional" :
"Unidirectional + Bidirectional"));
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
@@ -557,6 +569,17 @@ public:
printf("\n");
}
void DisplayA2AEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[AllToAll Related]\n");
PRINT_EV("A2A_DIRECT", a2aDirect,
std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
printf("\n");
}
// Helper function that parses an environment variable or falls back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
{
@@ -219,8 +219,8 @@ GpuReduceKernel(SubExecParam* params)
__syncthreads();
if (threadIdx.x == 0)
{
p.startCycle = startCycle;
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
__trace_hwreg();
}
}
......