Unverified Commit aa801b97 authored by gilbertlee-amd, committed by GitHub

V1.48 - Adding targeting specific DMA engine support (#87)

* Adding targeted DMA engine support
* Fixing CUDA compilation for H100
parent ef4252bb
......@@ -3,6 +3,18 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.48
### Fixes
* Various fixes for TransferBenchCuda
### Additions
* Support for targeting specific DMA engines via executor subindex (e.g. D0.1)
* Printing warnings when executors are overcommitted
### Modifications
* USE_REMOTE_READ supported for rwrite preset benchmark
## v1.47
### Fixes
......
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else()
......
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
......@@ -10,6 +10,7 @@ Documentation for TransferBench is available at
* You must have a ROCm stack installed on your system (HIP runtime)
* You must have `libnuma` installed on your system
* AMD IOMMU must be enabled and set to passthrough for AMD Instinct cards
## Documentation
......@@ -66,11 +67,15 @@ make
* Running TransferBench with no arguments displays usage instructions and detected topology
information
* You can use several preset configurations instead of a configuration file:
* `p2p`: Peer-to-peer benchmark test
* `sweep`: Sweep across possible sets of transfers
* `rsweep`: Random sweep across possible sets of transfers
* When using the same GPU executor in multiple simultaneous transfers, performance may be
serialized due to the maximum number of hardware queues available
* `a2a` : All-to-all benchmark test
* `cmdline`: Takes Transfers to run from the command line instead of from a file
* `p2p` : Peer-to-peer benchmark test
* `rsweep` : Random sweep across possible sets of transfers
* `rwrite` : Benchmarks parallel remote writes from a single GPU
* `scaling`: GPU subexecutor scaling tests
* `schmoo` : Local/Remote read/write/copy between two GPUs
* `sweep` : Sweep across possible sets of transfers
* When using the same GPU executor in multiple simultaneous transfers on separate streams (USE_SINGLE_STREAM=0),
performance may be serialized due to the maximum number of hardware queues available
* The number of maximum hardware queues can be adjusted via `GPU_MAX_HW_QUEUES`
* Alternatively, running in single-stream mode (`USE_SINGLE_STREAM=1`) may avoid this issue
by launching all transfers on a single stream, rather than on individual streams
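* For example, a hypothetical invocation combining these options: `GPU_MAX_HW_QUEUES=8 USE_SINGLE_STREAM=0 ./TransferBench p2p` (run TransferBench with no arguments to see the exact usage for each preset)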
......@@ -13,7 +13,7 @@ else
endif
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -O3 -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
NVFLAGS = -O3 -Iinclude -x cu -lnuma -arch=native
LDFLAGS += -lpthread
all: $(EXE)
......
/*
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -161,7 +161,7 @@ int main(int argc, char **argv)
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : std::min(ev.numGpuDevices - 1, 3));
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
for (int N = 256; N <= (1<<27); N *= 2)
{
......@@ -358,6 +358,9 @@ void ExecuteTransfers(EnvVars const& ev,
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#else
printf("[ERROR] CU Masking in not supported on NVIDIA hardware\n");
exit(-1);
#endif
}
else
......@@ -376,9 +379,98 @@ void ExecuteTransfers(EnvVars const& ev,
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_CPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
AllocateMemory(MEM_MANAGED, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
// Check for sufficient subExecutors
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex));
if (exeInfo.totalSubExecs > numDeviceCUs)
{
printf("[WARN] GFX executor %d requesting %d total subexecutors, however only has %d. Some Transfers may be serialized\n",
exeIndex, exeInfo.totalSubExecs, numDeviceCUs);
}
}
// Check for targeted DMA
if (exeType == EXE_GPU_DMA)
{
bool useRandomDma = false;
bool useTargetDma = false;
// Check for sufficient hardware queues
#if !defined(__NVCC__)
if (exeInfo.transfers.size() > ev.gpuMaxHwQueues)
{
printf("[WARN] DMA executor %d attempting %lu parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
exeIndex, exeInfo.transfers.size(), ev.gpuMaxHwQueues);
}
#endif
for (Transfer* transfer : exeInfo.transfers)
{
if (transfer->exeSubIndex != -1)
{
useTargetDma = true;
#if defined(__NVCC__)
printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
exit(-1);
#else
if (transfer->numSrcs != 1 || transfer->numDsts != 1)
{
printf("[ERROR] DMA Transfer must have at exactly one source and one destination");
exit(1);
}
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
HSA_CHECK(hsa_amd_pointer_info(transfer->dstMem[0], &info, NULL, NULL, NULL));
transfer->dstAgent = info.agentOwner;
HSA_CHECK(hsa_amd_pointer_info(transfer->srcMem[0], &info, NULL, NULL, NULL));
transfer->srcAgent = info.agentOwner;
// Create HSA completion signal
HSA_CHECK(hsa_signal_create(1, 0, NULL, &transfer->signal));
// Check for valid engine Id
if (transfer->exeSubIndex < -1 || transfer->exeSubIndex >= 32)
{
printf("[ERROR] DMA executor subindex must be between 0 and 31\n");
exit(1);
}
// Check that engine Id exists between agents
uint32_t engineIdMask = 0;
HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
transfer->srcAgent,
&engineIdMask));
transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
if (!(transfer->sdmaEngineId & engineIdMask))
{
printf("[ERROR] DMA executor %d.%d does not exist or cannot copy between source %s to destination %s\n",
transfer->exeIndex, transfer->exeSubIndex,
transfer->SrcToStr().c_str(),
transfer->DstToStr().c_str());
exit(1);
}
#endif
}
else
{
useRandomDma = true;
}
}
if (useRandomDma && useTargetDma)
{
printf("[WARN] Mixing targeted and untargetted DMA execution on GPU %d may result in resource conflicts\n",
exeIndex);
}
}
}
}
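For reference, the targeted-copy plumbing added above can be read as the following standalone sketch. This is a minimal illustration, assuming ROCm's `hsa/hsa.h` and `hsa/hsa_ext_amd.h`, an already-initialized HSA runtime, and the same `HSA_CHECK` helper this file uses; `TargetedSdmaCopy` and `engineIndex` are illustrative names, not part of this change:

```cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// Minimal sketch: copy numBytes from src to dst on a specific SDMA engine.
// Assumes hsa_init() has run and that dst/src were allocated via HIP/HSA so
// hsa_amd_pointer_info can resolve their owning agents.
void TargetedSdmaCopy(void* dst, void const* src, size_t numBytes, int engineIndex)
{
  // Resolve the agents that own each buffer
  hsa_amd_pointer_info_t info;
  info.size = sizeof(info);
  HSA_CHECK(hsa_amd_pointer_info(dst, &info, NULL, NULL, NULL));
  hsa_agent_t dstAgent = info.agentOwner;
  HSA_CHECK(hsa_amd_pointer_info(const_cast<void*>(src), &info, NULL, NULL, NULL));
  hsa_agent_t srcAgent = info.agentOwner;

  // Verify the requested engine can service copies between these agents
  uint32_t engineIdMask = 0;
  HSA_CHECK(hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask));
  hsa_amd_sdma_engine_id_t engineId = (hsa_amd_sdma_engine_id_t)(1U << engineIndex);
  if (!(engineId & engineIdMask)) return; // engine not available for this pair

  // Launch the copy, then block until the completion signal drops below 1
  hsa_signal_t signal;
  HSA_CHECK(hsa_signal_create(1, 0, NULL, &signal));
  HSA_CHECK(hsa_amd_memory_async_copy_on_engine(dst, dstAgent, src, srcAgent,
                                                numBytes, 0, NULL, signal,
                                                engineId, true));
  while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1);
  HSA_CHECK(hsa_signal_destroy(signal));
}
```

The final `true` argument corresponds to the engine-forcing flag used in the change above, which asks the runtime to perform the copy on the requested SDMA engine rather than choosing one itself.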
......@@ -618,7 +710,7 @@ void ExecuteTransfers(EnvVars const& ev,
totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
if (ev.useXccFilter || transfer->exeType == EXE_GPU_DMA)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
......@@ -719,7 +811,7 @@ void ExecuteTransfers(EnvVars const& ev,
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1)
if (transfer->exeSubIndex == -1 || transfer->exeType == EXE_GPU_DMA)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
......@@ -828,6 +920,13 @@ cleanup:
DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset);
}
transfer->subExecParam.clear();
if (exeType == EXE_GPU_DMA && transfer->exeSubIndex != -1)
{
#if !defined(__NVCC__)
HSA_CHECK(hsa_signal_destroy(transfer->signal));
#endif
}
}
if (IsGpuType(exeType))
......@@ -845,7 +944,7 @@ cleanup:
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_CPU, exeInfo.subExecParamGpu);
DeallocateMemory(MEM_MANAGED, exeInfo.subExecParamGpu);
#endif
}
}
......@@ -1015,9 +1114,52 @@ void DisplayTopology(bool const outputToCsv)
printf("\n");
#if defined(__NVCC__)
for (int i = 0; i < numGpuDevices; i++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#endif
#else
// Figure out DMA engines per GPU
std::vector<std::set<int>> dmaEngineIdsPerDevice(numGpuDevices);
{
std::vector<hsa_agent_t> agentList;
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
for (int deviceId = 0; deviceId < numGpuDevices; deviceId++)
{
HIP_CALL(hipSetDevice(deviceId));
int32_t* tempBuffer;
HIP_CALL(hipMalloc((void**)&tempBuffer, 1024));
HSA_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
agentList.push_back(info.agentOwner);
HIP_CALL(hipFree(tempBuffer));
}
for (int srcDevice = 0; srcDevice < numGpuDevices; srcDevice++)
{
dmaEngineIdsPerDevice[srcDevice].clear();
for (int dstDevice = 0; dstDevice < numGpuDevices; dstDevice++)
{
if (srcDevice == dstDevice) continue;
uint32_t engineIdMask = 0;
if (hsa_amd_memory_copy_engine_status(agentList[dstDevice],
agentList[srcDevice],
&engineIdMask) != HSA_STATUS_SUCCESS)
continue;
for (int engineId = 0; engineId < 32; engineId++)
{
if (engineIdMask & (1U << engineId))
dmaEngineIdsPerDevice[srcDevice].insert(engineId);
}
}
}
}
// Print out detected GPU topology
if (outputToCsv)
......@@ -1025,7 +1167,7 @@ void DisplayTopology(bool const outputToCsv)
printf("GPU");
for (int j = 0; j < numGpuDevices; j++)
printf(",GPU %02d", j);
printf(",PCIe Bus ID,ClosestNUMA\n");
printf(",PCIe Bus ID,ClosestNUMA,DMA engines\n");
}
else
{
......@@ -1042,13 +1184,12 @@ void DisplayTopology(bool const outputToCsv)
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
printf(" GPU %02d |", j);
printf(" PCIe Bus ID | #CUs | Closest NUMA\n");
printf(" PCIe Bus ID | #CUs | Closest NUMA | DMA engines\n");
for (int j = 0; j <= numGpuDevices; j++)
printf("--------+");
printf("--------------+------+-------------\n");
printf("--------------+------+-------------+------------\n");
}
#if !defined(__NVCC__)
char pciBusId[20];
for (int i = 0; i < numGpuDevices; i++)
{
......@@ -1085,9 +1226,19 @@ void DisplayTopology(bool const outputToCsv)
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx));
if (outputToCsv)
printf("%s,%d,%d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
printf("%s,%d,%d,", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
else
printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
{
printf(" %11s | %4d | %-12d |", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
bool isFirst = true;
for (auto x : dmaEngineIdsPerDevice[deviceIdx])
{
if (isFirst) isFirst = false; else printf(",");
printf("%d", x);
}
printf("\n");
}
}
#endif
}
......@@ -1324,6 +1475,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
CheckPages((char*)*memPtr, numBytes, devIndex);
// Reset to default numa mem policy
......@@ -1344,13 +1496,15 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
exit(1);
#else
HIP_CALL(hipSetDevice(devIndex));
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, devIndex));
int flag = hipDeviceMallocUncached;
HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
}
else if (memType == MEM_MANAGED)
{
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMallocManaged((void**)memPtr, numBytes));
}
HIP_CALL(hipMemset(*memPtr, 0, numBytes));
HIP_CALL(hipDeviceSynchronize());
}
......@@ -1390,6 +1544,15 @@ void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
}
HIP_CALL(hipFree(memPtr));
}
else if (memType == MEM_MANAGED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null managed pointer for %lu bytes. Skipping hipMFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
}
void CheckPages(char* array, size_t numBytes, int targetId)
......@@ -1432,11 +1595,15 @@ void CheckPages(char* array, size_t numBytes, int targetId)
uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
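For reference, a worked decode of the mapping above, using an illustrative register value and following the bit fields exactly as coded:

```cpp
// Illustrative decode of GetId() for hwId = 0x2300:
//   shId = (0x2300 >> 12) & 1  = 0
//   cuId = (0x2300 >>  8) & 15 = 3
//   seId = (0x2300 >> 13) & 3  = 1
//   GetId(0x2300) = (0 << 5) + (3 << 2) + 1 = 13
```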
void RunTransfer(EnvVars const& ev, int const iteration,
......@@ -1529,37 +1696,75 @@ void RunTransfer(EnvVars const& ev, int const iteration,
}
else if (transfer->exeType == EXE_GPU_DMA)
{
// Switch to executing GPU
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
HIP_CALL(hipEventRecord(startEvent, stream));
if (transfer->numSrcs == 0 && transfer->numDsts == 1)
if (transfer->exeSubIndex == -1)
{
HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
MEMSET_CHAR, transfer->numBytesActual, stream));
}
else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
{
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
}
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
// Switch to executing GPU
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
if (iteration >= 0)
HIP_CALL(hipEventRecord(startEvent, stream));
if (transfer->numSrcs == 0 && transfer->numDsts == 1)
{
HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
MEMSET_CHAR, transfer->numBytesActual, stream));
}
else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
{
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
}
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
if (iteration >= 0)
{
// Record GPU timing
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
}
}
else
{
// Record GPU timing
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
#if defined(__NVCC__)
printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
exit(1);
#else
// Target specific DMA engine
// Atomically set signal to 1
HSA_CHECK(hsa_signal_store_screlease(transfer->signal, 1));
auto cpuStart = std::chrono::high_resolution_clock::now();
HSA_CHECK(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
// Wait for SDMA transfer to complete
// NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
// (for example, due to system or other external factors) even when the
// condition has not been met."
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
if (iteration >= 0)
{
// Record GPU timing
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
transfer->transferTime += deltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(deltaMsec);
}
#endif
}
}
else if (transfer->exeType == EXE_CPU) // CPU execution agent
......@@ -2260,6 +2465,8 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
printf("[ERROR] Possible cause is misconfigured IOMMU (AMD Instinct cards require amd_iommu=on and iommu=pt)\n");
printf("[ERROR] Please see https://community.amd.com/t5/knowledge-base/iommu-advisory-for-amd-instinct/ta-p/484601 for more details\n");
if (!ev.continueOnError)
exit(1);
return false;
......@@ -2461,7 +2668,8 @@ void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
printf("Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
printf("Bytes to %s: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n",
ev.useRemoteRead ? "read" : "write", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
......@@ -2493,17 +2701,31 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
if (bitmask & (1<<i))
{
Transfer t;
t.dstType.resize(1);
t.dstIndex.resize(1);
t.exeType = EXE_GPU_GFX;
t.exeIndex = srcIdx;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes = numBytesPerTransfer;
t.numSrcs = 0;
t.numDsts = 1;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = i;
if (ev.useRemoteRead)
{
t.numSrcs = 1;
t.numDsts = 0;
t.exeIndex = i;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
}
else
{
t.numSrcs = 0;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
}
transfers.push_back(t);
}
}
......@@ -2521,7 +2743,10 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (N0 G%d %c%d)", srcIdx, MemTypeStr[transfers[i].dstType[0]], transfers[i].dstIndex[0]);
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
......
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -22,6 +22,18 @@ THE SOFTWARE.
#pragma once
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
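A typical call site simply wraps any HIP runtime call; the device index here is arbitrary:

```cpp
// Exits with file/line context if the wrapped call fails
HIP_CALL(hipSetDevice(0));
```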
#if defined(__NVCC__)
#include <cuda_runtime.h>
......@@ -65,6 +77,7 @@ THE SOFTWARE.
#define hipHostFree cudaFreeHost
#define hipHostMalloc cudaMallocHost
#define hipMalloc cudaMalloc
#define hipMallocManaged cudaMallocManaged
#define hipMemcpy cudaMemcpy
#define hipMemcpyAsync cudaMemcpyAsync
#define hipMemset cudaMemset
......
/*
Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.47"
#define TB_VERSION "1.48"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -130,6 +130,7 @@ public:
// Developer features
int enableDebug; // Enable debug output
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Used to track current configuration mode
ConfigModeEnum configMode;
......@@ -202,6 +203,7 @@ public:
useXccFilter = GetEnvVar("USE_XCC_FILTER" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug = GetEnvVar("DEBUG" , 0);
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// P2P Benchmark related
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); // Needed for numGpuSubExec
......@@ -790,6 +792,9 @@ public:
printf("[Remote-Write Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
std::string("Performing remote ") + (useRemoteRead ? "reads" : "writes"));
printf("\n");
}
......
/*
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Helper macro for checking HSA calls
#define HSA_CHECK(cmd) \
do { \
......
/*
Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -62,12 +62,15 @@ struct SubExecParam
};
// Macro for collecting HW_REG_HW_ID
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
#define __trace_hwreg() \
p.hwId = 0
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
#define GetHwId(hwId) \
hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) \
asm("mov.u32 %0, %smid;" : "=r"(hwId) )
#else
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (p.hwId));
#define GetHwId(hwId) \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId));
#endif
// Macro for collecting HW_REG_XCC_ID
......@@ -284,10 +287,8 @@ __global__ void __launch_bounds__(BLOCKSIZE)
__threadfence_system();
p.stopCycle = GetTimestamp();
p.startCycle = startCycle;
#if !defined(__NVCC__)
p.xccId = xccId;
#endif
__trace_hwreg();
GetHwId(p.hwId);
GetXccId(p.xccId);
}
}
......
/*
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <vector>
#include <sstream>
......@@ -31,21 +32,7 @@ THE SOFTWARE.
#include <map>
#include <iostream>
#include <sstream>
#include "Compatibility.hpp"
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#include "EnvVars.hpp"
// Simple configuration parameters
......@@ -62,6 +49,7 @@ typedef enum
MEM_GPU_FINE = 3, // Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
MEM_NULL = 5, // NULL memory - used for empty
MEM_MANAGED = 6 // Managed (unified) memory
} MemType;
typedef enum
......@@ -71,12 +59,12 @@ typedef enum
EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams)
} ExeType;
bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE); }
bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
bool IsCpuType(ExeType e) { return (e == EXE_CPU); };
char const MemTypeStr[7] = "CGBFUN";
char const MemTypeStr[8] = "CGBFUNM";
char const ExeTypeStr[4] = "CGD";
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
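For readability, the character-to-enum mappings visible in this hunk (the character index equals the enum value; the first three MemType enumerators are elided from the diff, so only the visible ones are listed):

```cpp
// MemTypeStr "CGBFUNM": index 3 = 'F' (MEM_GPU_FINE), 4 = 'U' (MEM_CPU_UNPINNED),
//                       5 = 'N' (MEM_NULL), 6 = 'M' (MEM_MANAGED, new in v1.48)
// ExeTypeStr "CGD"    : index 2 = 'D' (EXE_GPU_DMA); 'C' and 'G' map to the
//                       CPU and GFX executors (EXE_CPU, EXE_GPU_GFX)
```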
......@@ -129,6 +117,14 @@ struct Transfer
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<int> subExecIdx; // Indicies into subExecParamGpu
#if !defined(__NVCC__)
// For targeted-SDMA
hsa_agent_t dstAgent; // DMA destination memory agent
hsa_agent_t srcAgent; // DMA source memory agent
hsa_signal_t signal; // HSA signal for completion
hsa_amd_sdma_engine_id_t sdmaEngineId; // DMA engine ID
#endif
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
......