Unverified Commit 600cf131 authored by gilbertlee-amd, committed by GitHub

V1.52 candidate (#122)

* Adding USE_HSA_DMA to switch to using hsa_amd_memory_async_copy in lieu of hipMemcpyAsync
* Adding USE_GPU_DMA for A2A benchmark
* Adding large BAR check and fix for 0-hop GPU-CPU links
parent 340244a6
@@ -3,6 +3,14 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.52
+### Added
+- Added USE_HSA_DMA env var to switch to using hsa_amd_memory_async_copy instead of hipMemcpyAsync for DMA execution
+- Added ability to set USE_GPU_DMA env var for a2a benchmark
+- Added check for large BAR enablement for GPU devices during topology check
+### Fixed
+- Potential memory leak if HSA reports 0 hops between GPUs and CPUs
+
 ## v1.51
 ## Modified
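For reference, the default (USE_HSA_DMA=0) DMA-executor path described in the changelog reduces to a HIP stream copy timed with events. Below is a minimal standalone sketch of that baseline, not TransferBench code; the HIP_OK macro and buffer size are illustrative only.

#include <cstdio>
#include <hip/hip_runtime.h>

// Illustrative error-check macro (assumption: not part of TransferBench)
#define HIP_OK(cmd) do { hipError_t e = (cmd); if (e != hipSuccess) { \
  printf("HIP error: %s\n", hipGetErrorString(e)); return 1; } } while (0)

int main()
{
  size_t const numBytes = 1 << 26; // 64 MiB, arbitrary example size
  void *src = nullptr, *dst = nullptr;
  HIP_OK(hipSetDevice(0));
  HIP_OK(hipMalloc(&src, numBytes));
  HIP_OK(hipMalloc(&dst, numBytes));

  hipStream_t stream;
  hipEvent_t startEvent, stopEvent;
  HIP_OK(hipStreamCreate(&stream));
  HIP_OK(hipEventCreate(&startEvent));
  HIP_OK(hipEventCreate(&stopEvent));

  // Time the async copy with HIP events, as the benchmark's default path does
  HIP_OK(hipEventRecord(startEvent, stream));
  HIP_OK(hipMemcpyAsync(dst, src, numBytes, hipMemcpyDefault, stream));
  HIP_OK(hipEventRecord(stopEvent, stream));
  HIP_OK(hipStreamSynchronize(stream));

  float msec = 0.0f;
  HIP_OK(hipEventElapsedTime(&msec, startEvent, stopEvent));
  printf("Copied %zu bytes in %.3f ms\n", numBytes, msec);
  return 0;
}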
@@ -508,9 +508,9 @@ TestResults ExecuteTransfersImpl(EnvVars const& ev,
 
   for (Transfer* transfer : exeInfo.transfers)
   {
-    if (transfer->exeSubIndex != -1)
+    if (transfer->exeSubIndex != -1 || ev.useHsaDma)
     {
-      useTargetDma = true;
+      useTargetDma = (transfer->exeSubIndex != -1);
 #if defined(__NVCC__)
       printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
@@ -544,18 +544,20 @@ TestResults ExecuteTransfersImpl(EnvVars const& ev,
       }
 
       // Check that engine Id exists between agents
-      uint32_t engineIdMask = 0;
-      HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
-                                                  transfer->srcAgent,
-                                                  &engineIdMask));
-      transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
-      if (!(transfer->sdmaEngineId & engineIdMask))
-      {
-        printf("[ERROR] DMA executor %d.%d does not exist or cannot copy from source %s to destination %s\n",
-               transfer->exeIndex, transfer->exeSubIndex,
-               transfer->SrcToStr().c_str(),
-               transfer->DstToStr().c_str());
-        exit(1);
-      }
+      if (useTargetDma) {
+        uint32_t engineIdMask = 0;
+        HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
+                                                    transfer->srcAgent,
+                                                    &engineIdMask));
+        transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
+        if (!(transfer->sdmaEngineId & engineIdMask))
+        {
+          printf("[ERROR] DMA executor %d.%d does not exist or cannot copy from source %s to destination %s\n",
+                 transfer->exeIndex, transfer->exeSubIndex,
+                 transfer->SrcToStr().c_str(),
+                 transfer->DstToStr().c_str());
+          exit(1);
+        }
+      }
 #endif
     }
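The guard above treats the mask returned by hsa_amd_memory_copy_engine_status as one bit per SDMA engine, with the requested executor sub-index selecting the bit. A small self-contained sketch of just that bit test (the mask value here is a made-up example):

#include <cstdint>
#include <cstdio>

// Returns true if the SDMA engine selected by exeSubIndex is present in the
// availability mask reported for a given src/dst agent pair.
bool EngineAvailable(uint32_t engineIdMask, int exeSubIndex)
{
  uint32_t const engineBit = 1U << exeSubIndex; // one bit per SDMA engine
  return (engineBit & engineIdMask) != 0;
}

int main()
{
  uint32_t const mask = 0x3; // example: engines 0 and 1 usable
  for (int sub = 0; sub < 4; ++sub)
    printf("engine %d: %s\n", sub, EngineAvailable(mask, sub) ? "ok" : "unavailable");
  return 0;
}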
@@ -822,7 +824,7 @@ cleanup:
     }
     transfer->subExecParam.clear();
 
-    if (exeType == EXE_GPU_DMA && transfer->exeSubIndex != -1)
+    if (exeType == EXE_GPU_DMA && (transfer->exeSubIndex != -1 || ev.useHsaDma))
     {
 #if !defined(__NVCC__)
       HSA_CHECK(hsa_signal_destroy(transfer->signal));
@@ -1161,6 +1163,15 @@ void DisplayTopology(bool const outputToCsv)
       printf("\n");
     }
   }
+
+  // Check that large BAR is enabled on all GPUs
+  for (int i = 0; i < numGpuDevices; i++) {
+    int const deviceIdx = RemappedIndex(i, false);
+    int isLargeBar = 0;
+    HIP_CALL(hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, deviceIdx));
+    if (!isLargeBar)
+      printf("[WARN] Large BAR is not enabled for GPU %d in BIOS. This may result in segfaults\n", i);
+  }
 
 #endif
 }
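The same large-BAR probe works as a standalone check; a minimal sketch using the same hipDeviceAttributeIsLargeBar attribute (an AMD-specific HIP attribute), with abbreviated error handling:

#include <cstdio>
#include <hip/hip_runtime.h>

int main()
{
  int numGpus = 0;
  if (hipGetDeviceCount(&numGpus) != hipSuccess) return 1;
  for (int i = 0; i < numGpus; ++i) {
    // Non-zero when the full device memory aperture is CPU-visible (large BAR)
    int isLargeBar = 0;
    hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, i);
    printf("GPU %d: large BAR %s\n", i, isLargeBar ? "enabled" : "NOT enabled");
  }
  return 0;
}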
@@ -1619,65 +1630,61 @@ void RunTransfer(EnvVars const& ev, int const iteration,
 {
   int const exeIndex = RemappedIndex(transfer->exeIndex, false);
-  if (transfer->exeSubIndex == -1)
-  {
+  int subIterations = 0;
+  if (transfer->exeSubIndex == -1 && !ev.useHsaDma) {
     // Switch to executing GPU
     HIP_CALL(hipSetDevice(exeIndex));
 
     hipStream_t& stream = exeInfo.streams[transferIdx];
     hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
     hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
-    int subIteration = 0;
 
     HIP_CALL(hipEventRecord(startEvent, stream));
     do {
       HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
                               transfer->numBytesActual, hipMemcpyDefault,
                               stream));
-    } while (++subIteration != ev.numSubIterations);
+    } while (++subIterations != ev.numSubIterations);
     HIP_CALL(hipEventRecord(stopEvent, stream));
     HIP_CALL(hipStreamSynchronize(stream));
 
-    if (iteration >= 0)
-    {
-      // Record GPU timing
+    // Record time based on HIP events
+    if (iteration >= 0) {
       float gpuDeltaMsec;
       HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
-      //gpuDeltaMsec /= (1.0 * ev.numSubIterations);
       transfer->transferTime += gpuDeltaMsec;
       if (ev.showIterations)
         transfer->perIterationTime.push_back(gpuDeltaMsec);
     }
-  }
-  else
-  {
+  } else {
 #if defined(__NVCC__)
     printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
     exit(1);
 #else
-    // Target specific DMA engine
+    // Use HSA async memory copy (either targeted or untargeted)
    auto cpuStart = std::chrono::high_resolution_clock::now();
-    int subIterations = 0;
    do {
      // Atomically set signal to 1
      HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
-      HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
-                                                   transfer->srcMem[0], transfer->srcAgent,
-                                                   transfer->numBytesActual, 0, NULL,
-                                                   transfer->signal,
-                                                   transfer->sdmaEngineId, true));
+      if (ev.useHsaDma) {
+        HSA_CALL(hsa_amd_memory_async_copy(transfer->dstMem[0], transfer->dstAgent,
+                                           transfer->srcMem[0], transfer->srcAgent,
+                                           transfer->numBytesActual, 0, NULL,
+                                           transfer->signal));
+      } else {
+        HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
+                                                     transfer->srcMem[0], transfer->srcAgent,
+                                                     transfer->numBytesActual, 0, NULL,
+                                                     transfer->signal,
+                                                     transfer->sdmaEngineId, true));
+      }
 
      // Wait for SDMA transfer to complete
+      // NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
+      //        (for example, due to system or other external factors) even when the
+      //        condition has not been met."
      while(hsa_signal_wait_scacquire(transfer->signal,
                                      HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                      HSA_WAIT_STATE_ACTIVE) >= 1);
    } while (++subIterations < ev.numSubIterations);
 
-    if (iteration >= 0)
-    {
+    if (iteration >= 0) {
      // Record GPU timing
      auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
      double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
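Both HSA branches above follow the same completion-signal pattern: store 1 to the signal, submit the copy with the signal as its completion handle, then wait for the runtime to decrement it below 1, re-checking because HSA waits may resume spuriously. A condensed sketch of one such copy, assuming the agents and HSA-accessible buffers were obtained elsewhere (agent and pool discovery is elided):

#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// One signal-synchronized async copy (setup and error handling elided)
void AsyncCopyOnce(void* dst, hsa_agent_t dstAgent,
                   void const* src, hsa_agent_t srcAgent, size_t numBytes)
{
  hsa_signal_t signal;
  hsa_signal_create(1, 0, NULL, &signal);   // completion signal, starts at 1

  hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
                            numBytes, 0, NULL, signal);

  // Runtime decrements the signal to 0 on completion; loop because a wait
  // can return before the condition is actually met
  while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1);

  hsa_signal_destroy(signal);
}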
@@ -2106,7 +2113,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
       transfer.numSubExecs = numSubExecs;
       transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
       transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
-      transfer.exeType = EXE_GPU_GFX;
+      transfer.exeType = (ev.useDmaCopy && transfer.numSrcs == 1 && transfer.numDsts == 1) ? EXE_GPU_DMA : EXE_GPU_GFX;
       transfer.exeSubIndex = -1;
       transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
       transfer.dstType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"
 
-#define TB_VERSION "1.51"
+#define TB_VERSION "1.52"
 
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -95,6 +95,7 @@ public:
   int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
   int sharedMemBytes; // Amount of shared memory to use per threadblock
   int showIterations; // Show per-iteration timing info
+  int useHsaDma; // Use hsa_amd_memory_async_copy instead of hipMemcpyAsync for non-targeted DMA executions
   int useInteractive; // Pause for user-input before starting transfer loop
   int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
   int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
@@ -202,6 +203,7 @@ public:
     samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
     sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes);
     showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
+    useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
     useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
     usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
     usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
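GetEnvVar reads an environment variable, falling back to the supplied default when it is unset. A minimal sketch of an integer-valued reader in that spirit (the actual TransferBench helper is not shown in this diff and may differ):

#include <cstdlib>
#include <string>

// Hypothetical stand-in for EnvVars::GetEnvVar(name, defaultValue)
static int GetEnvVarSketch(std::string const& name, int defaultValue)
{
  char const* value = getenv(name.c_str());
  return value ? atoi(value) : defaultValue; // unset -> use default
}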
@@ -612,6 +614,7 @@ public:
     printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
     printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
     printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
+    printf(" USE_HSA_DMA - Use hsa_amd_memory_async_copy instead of hipMemcpyAsync for non-targeted DMA execution\n");
     printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
     printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
     printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
@@ -691,6 +694,8 @@ public:
              std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
     PRINT_EV("SHOW_ITERATIONS", showIterations,
              std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
+    PRINT_EV("USE_HSA_DMA", useHsaDma,
+             std::string("Using ") + (useHsaDma ? "hsa_amd_memory_async_copy" : "hipMemcpyAsync") + " for DMA execution");
     PRINT_EV("USE_INTERACTIVE", useInteractive,
              std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
     PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
@@ -797,6 +802,8 @@ public:
              "Perform write-only"));
     PRINT_EV("USE_FINE_GRAIN", useFineGrain,
              std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
+    PRINT_EV("USE_GPU_DMA", useDmaCopy,
+             std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
     PRINT_EV("USE_REMOTE_READ", useRemoteRead,
              std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
...@@ -84,24 +84,22 @@ AgentData& GetAgentData() ...@@ -84,24 +84,22 @@ AgentData& GetAgentData()
{ {
static AgentData agentData = {}; static AgentData agentData = {};
if (!agentData.isInitialized) if (!agentData.isInitialized) {
{
agentData.isInitialized = true; agentData.isInitialized = true;
// Add all detected agents to the list // Add all detected agents to the list
HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData)); HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
// Loop over each GPU // Loop over each GPU
for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++) for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++) {
{
// Collect memory pool // Collect memory pool
hsa_amd_memory_pool_t pool; hsa_amd_memory_pool_t pool;
HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool)); HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
// Loop over each CPU agent and check distance // Loop over each CPU agent and check distance
agentData.closestNumaNode[i] = 0;
int bestDistance = -1; int bestDistance = -1;
for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) {
{
// Determine number of hops from GPU memory pool to CPU agent // Determine number of hops from GPU memory pool to CPU agent
uint32_t hops = 0; uint32_t hops = 0;
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
...@@ -109,23 +107,23 @@ AgentData& GetAgentData() ...@@ -109,23 +107,23 @@ AgentData& GetAgentData()
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
&hops)); &hops));
// Gather link info // Gather link info
hsa_amd_memory_pool_link_info_t* link_info = if (hops) {
(hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t)); hsa_amd_memory_pool_link_info_t* link_info =
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], (hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t));
pool, HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, pool,
link_info)); HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
int numaDist = 0; link_info));
for (int k = 0; k < hops; k++) int numaDist = 0;
{ for (int k = 0; k < hops; k++)
numaDist += link_info[k].numa_distance; numaDist += link_info[k].numa_distance;
}
if (bestDistance == -1 || numaDist < bestDistance) if (bestDistance == -1 || numaDist < bestDistance) {
{ agentData.closestNumaNode[i] = j;
agentData.closestNumaNode[i] = j; bestDistance = numaDist;
bestDistance = numaDist; }
free(link_info);
} }
free(link_info);
} }
} }
} }
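In isolation, the closest-NUMA-node selection above amounts to summing the per-hop numa_distance values for each CPU agent and keeping the minimum, with node 0 as the fallback the new initialization provides when every link reports 0 hops. A sketch of that selection, with a hypothetical container standing in for the HSA link-info queries:

#include <cstdint>
#include <vector>

// hopDistances[j] holds the per-hop NUMA distances from the GPU pool to CPU
// agent j; an empty vector models the 0-hop case the fix guards against.
int ClosestNumaNode(std::vector<std::vector<uint32_t>> const& hopDistances)
{
  int best = 0;           // fallback: node 0
  int bestDistance = -1;
  for (size_t j = 0; j < hopDistances.size(); ++j) {
    if (hopDistances[j].empty()) continue;  // 0 hops: nothing to inspect
    int numaDist = 0;
    for (uint32_t d : hopDistances[j]) numaDist += (int)d;
    if (bestDistance == -1 || numaDist < bestDistance) {
      best = (int)j;
      bestDistance = numaDist;
    }
  }
  return best;
}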