Commit 93430da1 authored by Gilbert Lee's avatar Gilbert Lee
Browse files

Implementing sweep preset

parent ddb6508f
# Changelog for TransferBench # Changelog for TransferBench
## v1.03
### Added
- New preset modes stress-test benchmarks "sweep" and "randomsweep"
- sweep iterates over all possible sets of Transfers to test
- randomsweep iterates over random sets of Transfers
- New sweep-only environment variables can modify sweep
- SWEEP_SRC - String containing only "B","C","F", or "G", defining possible source memory types
- SWEEP_EXE - String containing only "C", or "G", defining possible executors
- SWEEP_DST - String containing only "B","C","F", or "G", defining possible destination memory types
- SWEEP_SRC_IS_EXE - Restrict executor to be the same as the source if non-zero
- SWEEP_MIN - Minimum number of parallel transfers to test
- SWEEP_MAX - Maximum number of parallel transfers to test
- SWEEP_TEST_LIMIT - Maximum number of tests to run
- SWEEP_TIME_LIMIT - Maximum number of seconds to run tests for
- New environment variable to restrict number of available GPUs to test on (primarily for sweep runs)
- NUM_CPU_DEVICES - Number of CPU devices
- NUM_GPU_DEVICES - Number of GPU devices
### Changed
- Fixed timing display for CPU-executors when using single stream mode
## v1.02 ## v1.02
### Added ### Added
- Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test - Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test
......
...@@ -25,7 +25,9 @@ THE SOFTWARE. ...@@ -25,7 +25,9 @@ THE SOFTWARE.
#include <algorithm> #include <algorithm>
#define TB_VERSION "1.02" #define TB_VERSION "1.03"
extern char const MemTypeStr[];
// This class manages environment variable that affect TransferBench // This class manages environment variable that affect TransferBench
class EnvVars class EnvVars
...@@ -37,10 +39,21 @@ public: ...@@ -37,10 +39,21 @@ public:
int const DEFAULT_SAMPLING_FACTOR = 1; int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_TRANSFER = 4; int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
int const DEFAULT_SWEEP_SRC_IS_EXE = 0;
std::string const DEFAULT_SWEEP_SRC = "CG";
std::string const DEFAULT_SWEEP_EXE = "CG";
std::string const DEFAULT_SWEEP_DST = "CG";
int const DEFAULT_SWEEP_MIN = 1;
int const DEFAULT_SWEEP_MAX = 24;
int const DEFAULT_SWEEP_TEST_LIMIT = 0;
int const DEFAULT_SWEEP_TIME_LIMIT = 0;
// Environment variables // Environment variables
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations int byteOffset; // Byte-offset for memory allocations
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numWarmups; // Number of un-timed warmup iterations to perform int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format int outputToCsv; // Output in CSV format
...@@ -54,6 +67,16 @@ public: ...@@ -54,6 +67,16 @@ public:
std::vector<float> fillPattern; // Pattern of floats used to fill source data std::vector<float> fillPattern; // Pattern of floats used to fill source data
// Environment variables only for Sweep-preset
int sweepSrcIsExe; // Non-zero if executor should always be the same as source
int sweepMin; // Min number of simultaneous Transfers to be executed per test
int sweepMax; // Max number of simultaneous Transfers to be executed per test
int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit)
int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit)
std::string sweepSrc; // Set of src memory types to be swept
std::string sweepExe; // Set of executors to be swept
std::string sweepDst; // Set of dst memory types to be swept
// Constructor that collects values // Constructor that collects values
EnvVars() EnvVars()
{ {
...@@ -61,9 +84,15 @@ public: ...@@ -61,9 +84,15 @@ public:
hipDeviceGetAttribute(&maxSharedMemBytes, hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0); hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
int numDetectedCpus = numa_num_configured_nodes();
int numDetectedGpus;
hipGetDeviceCount(&numGpuDevices);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER); numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS); numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
...@@ -75,6 +104,15 @@ public: ...@@ -75,6 +104,15 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
sweepSrcIsExe = GetEnvVar("SWEEP_SRC_IS_EXE", DEFAULT_SWEEP_SRC_IS_EXE);
sweepMin = GetEnvVar("SWEEP_MIN", DEFAULT_SWEEP_MIN);
sweepMax = GetEnvVar("SWEEP_MAX", DEFAULT_SWEEP_MAX);
sweepSrc = GetEnvVar("SWEEP_SRC", DEFAULT_SWEEP_SRC);
sweepExe = GetEnvVar("SWEEP_EXE", DEFAULT_SWEEP_EXE);
sweepDst = GetEnvVar("SWEEP_DST", DEFAULT_SWEEP_DST);
sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT", DEFAULT_SWEEP_TEST_LIMIT);
sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT", DEFAULT_SWEEP_TIME_LIMIT);
// Check for fill pattern // Check for fill pattern
char* pattern = getenv("FILL_PATTERN"); char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) if (pattern != NULL)
...@@ -134,6 +172,16 @@ public: ...@@ -134,6 +172,16 @@ public:
else fillPattern.clear(); else fillPattern.clear();
// Perform some basic validation // Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
exit(1);
}
if (numGpuDevices > numDetectedGpus)
{
printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
exit(1);
}
if (byteOffset % sizeof(float)) if (byteOffset % sizeof(float))
{ {
printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float)); printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
...@@ -169,6 +217,49 @@ public: ...@@ -169,6 +217,49 @@ public:
printf("[ERROR] Single stream mode cannot be used with HIP calls\n"); printf("[ERROR] Single stream mode cannot be used with HIP calls\n");
exit(1); exit(1);
} }
for (auto ch : sweepSrc)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
exit(1);
}
if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
exit(1);
}
}
for (auto ch : sweepDst)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
}
char const* permittedExecutors = "CG";
for (auto ch : sweepExe)
{
if (!strchr(permittedExecutors, ch))
{
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
{
printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
}
} }
// Display info on the env vars that can be used // Display info on the env vars that can be used
...@@ -176,20 +267,22 @@ public: ...@@ -176,20 +267,22 @@ public:
{ {
printf("Environment variables:\n"); printf("Environment variables:\n");
printf("======================\n"); printf("======================\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n"); printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n"); printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n"); printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n"); printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n"); printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n");
} }
// Display env var settings // Display env var settings
...@@ -207,8 +300,10 @@ public: ...@@ -207,8 +300,10 @@ public:
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31)"); printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n"); printf("\n");
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer); printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Running %d %s per topology\n", "NUM_ITERATIONS", numIterations, printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations, numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)"); numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups); printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
...@@ -236,13 +331,70 @@ public: ...@@ -236,13 +331,70 @@ public:
} }
}; };
// Display env var settings
void DisplaySweepEnvVars() const
{
if (!outputToCsv)
{
printf("Sweep configuration (TransferBench v%s)\n", TB_VERSION);
printf("=====================================================\n");
printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
printf("%-20s = %12d : Transfer executor %s Transfer source\n", "SWEEP_SRC_IS_EXE", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin);
printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax);
printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
printf("%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)\n", "SWEEP_TIME_LIMIT", sweepTimeLimit);
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
useHipCall ? "HIP functions" : "custom kernels");
if (useHipCall && !useMemset)
{
char* env = getenv("HSA_ENABLE_SDMA");
printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
}
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("\n");
}
};
// Helper function that parses environment variable or sets to default value static int GetEnvVar(std::string const varname, int defaultValue) static int GetEnvVar(std::string const& varname, int defaultValue)
static int GetEnvVar(std::string const varname, int defaultValue) static int GetEnvVar(std::string const& varname, int defaultValue)
{ {
if (getenv(varname.c_str())) if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str())); return atoi(getenv(varname.c_str()));
return defaultValue; return defaultValue;
} }
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
}; };
#endif #endif
...@@ -12,3 +12,17 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be ...@@ -12,3 +12,17 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
* `make` * `make`
If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
## Hints and suggestions
- Running TransferBench with no arguments will display usage instructions and detected topology information
- There are several preset configurations that can be used instead of a configuration file
including:
- p2p - Peer to peer benchmark test
- sweep - Sweep across possible sets of Transfers
- rsweep - Random sweep across possible sets of Transfers
- When using the same GPU executor in multiple simultaneous Transfers, performance may be
serialized due to the maximum number of hardware queues available.
- The number of maximum hardware queues can be adjusted via GPU_MAX_HW_QUEUES
- Alternatively, running in single stream mode (USE_SINGLE_STREAM=1) may avoid this issue
by launching all Transfers on a single stream instead of individual streams
...@@ -24,6 +24,7 @@ THE SOFTWARE. ...@@ -24,6 +24,7 @@ THE SOFTWARE.
// on the same node // on the same node
#include <numa.h> #include <numa.h>
#include <numaif.h> #include <numaif.h>
#include <random>
#include <stack> #include <stack>
#include <thread> #include <thread>
...@@ -33,6 +34,13 @@ THE SOFTWARE. ...@@ -33,6 +34,13 @@ THE SOFTWARE.
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Display usage instructions and detected topology // Display usage instructions and detected topology
if (argc <= 1) if (argc <= 1)
{ {
...@@ -63,14 +71,16 @@ int main(int argc, char **argv) ...@@ -63,14 +71,16 @@ int main(int argc, char **argv)
} }
PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN); PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN);
// Find the largest N to be used - memory will only be allocated once per set of simultaneous Transfers // Check for preset tests
size_t maxN = valuesOfN[0]; // - Tests that sweep across possible sets of Transfers
for (auto N : valuesOfN) if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
maxN = std::max(maxN, N); {
RunSweepPreset(ev, numBytesPerTransfer, !strcmp(argv[1], "rsweep"));
// Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test exit(0);
if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") || }
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr")) // - Tests that benchmark peer-to-peer performance
else if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr"))
{ {
int numBlocksToUse = 0; int numBlocksToUse = 0;
if (argc > 3) if (argc > 3)
...@@ -96,33 +106,14 @@ int main(int argc, char **argv) ...@@ -96,33 +106,14 @@ int main(int argc, char **argv)
exit(1); exit(1);
} }
// Check for NUMA library support // Print environment variables and CSV header
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
ev.DisplayEnvVars(); ev.DisplayEnvVars();
int const initOffset = ev.byteOffset / sizeof(float);
std::stack<std::thread> threads;
// Collect the number of available CPUs/GPUs on this machine
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
// Track unique pair of transfers that get used
std::set<std::pair<int, int>> peerAccessTracker;
// Print CSV header
if (ev.outputToCsv) if (ev.outputToCsv)
{ {
printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms)," printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms),"
"TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n"); "TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n");
} }
// Loop over each line in the Transfer configuration file
int testNum = 0; int testNum = 0;
char line[2048]; char line[2048];
while(fgets(line, 2048, fp)) while(fgets(line, 2048, fp))
...@@ -130,317 +121,361 @@ int main(int argc, char **argv) ...@@ -130,317 +121,361 @@ int main(int argc, char **argv)
// Check if line is a comment to be echoed to output (starts with ##) // Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line); if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
// Parse transfers from configuration file // Parse set of parallel Transfers to execute
TransferMap transferMap; std::vector<Transfer> transfers;
ParseTransfers(line, numCpuDevices, numGpuDevices, transferMap); ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
if (transferMap.size() == 0) continue; if (transfers.empty()) continue;
testNum++; ExecuteTransfers(ev, ++testNum, valuesOfN, transfers);
}
fclose(fp);
// Prepare (maximum) memory for each transfer return 0;
std::vector<Transfer*> transferList; }
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalTime = 0.0;
exeInfo.totalBlocks = 0;
for (Transfer& transfer : exeInfo.transfers) void ExecuteTransfers(EnvVars const& ev,
{ int testNum,
// Get some aliases to transfer variables std::vector<size_t> const& valuesOfN,
MemType const& exeMemType = transfer.exeMemType; std::vector<Transfer>& transfers)
MemType const& srcMemType = transfer.srcMemType; {
MemType const& dstMemType = transfer.dstMemType; int const initOffset = ev.byteOffset / sizeof(float);
int const& blocksToUse = transfer.numBlocksToUse;
// Get potentially remapped device indices
int const srcIndex = RemappedIndex(transfer.srcIndex, srcMemType);
int const exeIndex = RemappedIndex(transfer.exeIndex, exeMemType);
int const dstIndex = RemappedIndex(transfer.dstIndex, dstMemType);
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
if (exeMemType == MEM_GPU)
{
// Ensure executing GPU can access source memory
if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
{
auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
if (!peerAccessTracker.count(exeSrcPair))
{
EnablePeerAccess(exeIndex, srcIndex);
peerAccessTracker.insert(exeSrcPair);
}
}
// Ensure executing GPU can access destination memory // Find the largest N to be used - memory will only be allocated once per set of Transfers
if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex) size_t maxN = valuesOfN[0];
{ for (auto N : valuesOfN)
auto exeDstPair = std::make_pair(exeIndex, dstIndex); maxN = std::max(maxN, N);
if (!peerAccessTracker.count(exeDstPair))
{
EnablePeerAccess(exeIndex, dstIndex);
peerAccessTracker.insert(exeDstPair);
}
}
}
// Allocate (maximum) source / destination memory based on type / device index // Map transfers by executor
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem); TransferMap transferMap;
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem); for (Transfer const& transfer : transfers)
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse); {
exeInfo.totalBlocks += transfer.blockParam.size(); Executor executor(transfer.exeMemType, transfer.exeIndex);
transferList.push_back(&transfer); ExecutorInfo& executorInfo = transferMap[executor];
} executorInfo.transfers.push_back(transfer);
}
// Prepare GPU resources for GPU executors // Loop over each executor and prepare GPU resources
MemType const exeMemType = exeInfoPair.first.first; std::vector<Transfer*> transferList;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType); for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalTime = 0.0;
exeInfo.totalBlocks = 0;
// Loop over each transfer this executor is involved in
for (Transfer& transfer : exeInfo.transfers)
{
// Get some aliases to transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
int const& blocksToUse = transfer.numBlocksToUse;
// Get potentially remapped device indices
int const srcIndex = RemappedIndex(transfer.srcIndex, srcMemType);
int const exeIndex = RemappedIndex(transfer.exeIndex, exeMemType);
int const dstIndex = RemappedIndex(transfer.dstIndex, dstMemType);
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
if (exeMemType == MEM_GPU) if (exeMemType == MEM_GPU)
{ {
AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam), // Ensure executing GPU can access source memory
(void**)&exeInfo.blockParamGpu); if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
EnablePeerAccess(exeIndex, srcIndex);
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size(); // Ensure executing GPU can access destination memory
exeInfo.streams.resize(numTransfersToRun); if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex)
exeInfo.startEvents.resize(numTransfersToRun); EnablePeerAccess(exeIndex, dstIndex);
exeInfo.stopEvents.resize(numTransfersToRun);
for (int i = 0; i < numTransfersToRun; ++i)
{
HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
exeInfo.transfers[i].blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
transferOffset += exeInfo.transfers[i].blockParam.size();
}
} }
// Allocate (maximum) source / destination memory based on type / device index
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem);
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem);
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
exeInfo.totalBlocks += transfer.blockParam.size();
transferList.push_back(&transfer);
} }
// Loop over all the different number of bytes to use per Transfer // Prepare per-threadblock parameters for GPU executors
for (auto N : valuesOfN) MemType const exeMemType = executor.first;
int const exeIndex = RemappedIndex(executor.second, exeMemType);
if (exeMemType == MEM_GPU)
{ {
if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float)); // Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
// Prepare input memory and block parameters for current N AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam),
for (auto& exeInfoPair : transferMap) (void**)&exeInfo.blockParamGpu);
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
exeInfo.streams.resize(numTransfersToRun);
exeInfo.startEvents.resize(numTransfersToRun);
exeInfo.stopEvents.resize(numTransfersToRun);
for (int i = 0; i < numTransfersToRun; ++i)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
int transferOffset = 0; // Assign each transfer its portion of threadblock parameters
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
exeInfo.transfers[i].blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
transferOffset += exeInfo.transfers[i].blockParam.size();
}
}
}
for (int i = 0; i < exeInfo.transfers.size(); ++i) // Loop over all the different number of bytes to use per Transfer
{ for (auto N : valuesOfN)
Transfer& transfer = exeInfo.transfers[i]; {
transfer.PrepareBlockParams(ev, N); if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
// Copy block parameters to GPU for GPU executors // Prepare input memory and block parameters for current N
if (transfer.exeMemType == MEM_GPU) for (auto& exeInfoPair : transferMap)
{ {
HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset], ExecutorInfo& exeInfo = exeInfoPair.second;
transfer.blockParam.data(),
transfer.blockParam.size() * sizeof(BlockParam),
hipMemcpyHostToDevice));
transferOffset += transfer.blockParam.size();
}
}
}
// Launch kernels (warmup iterations are not counted) int transferOffset = 0;
double totalCpuTime = 0; for (int i = 0; i < exeInfo.transfers.size(); ++i)
size_t numTimedIterations = 0;
for (int iteration = -ev.numWarmups; ; iteration++)
{ {
if (ev.numIterations > 0 && iteration >= ev.numIterations) break; // Prepare subarrays each threadblock works on and fill src memory with patterned data
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break; Transfer& transfer = exeInfo.transfers[i];
transfer.PrepareBlockParams(ev, N);
// Pause before starting first timed iteration in interactive mode // Copy block parameters to GPU for GPU executors
if (ev.useInteractive && iteration == 0) if (transfer.exeMemType == MEM_GPU)
{ {
printf("Hit <Enter> to continue: "); HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset],
scanf("%*c"); transfer.blockParam.data(),
printf("\n"); transfer.blockParam.size() * sizeof(BlockParam),
hipMemcpyHostToDevice));
transferOffset += transfer.blockParam.size();
} }
}
}
// Start CPU timing for this iteration // Launch kernels (warmup iterations are not counted)
auto cpuStart = std::chrono::high_resolution_clock::now(); double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; ; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Execute all Transfers in parallel // Pause before starting first timed iteration in interactive mode
for (auto& exeInfoPair : transferMap) if (ev.useInteractive && iteration == 0)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; printf("Hit <Enter> to continue: ");
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size(); scanf("%*c");
for (int i = 0; i < numTransfersToRun; ++i) printf("\n");
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i)); }
}
// Wait for all threads to finish // Start CPU timing for this iteration
int const numTransfers = threads.size(); auto cpuStart = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
}
// Stop CPU timing for this iteration // Execute all Transfers in parallel
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; for (auto& exeInfoPair : transferMap)
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count(); {
ExecutorInfo& exeInfo = exeInfoPair.second;
int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ?
1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i));
}
if (iteration >= 0) // Wait for all threads to finish
{ int const numTransfers = threads.size();
++numTimedIterations; for (int i = 0; i < numTransfers; i++)
totalCpuTime += deltaSec; {
} threads.top().join();
threads.pop();
} }
// Pause for interactive mode // Stop CPU timing for this iteration
if (ev.useInteractive) auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (iteration >= 0)
{ {
printf("Transfers complete. Hit <Enter> to continue: "); ++numTimedIterations;
scanf("%*c"); totalCpuTime += deltaSec;
printf("\n");
} }
}
// Pause for interactive mode
if (ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
scanf("%*c");
printf("\n");
}
// Validate that each transfer has transferred correctly // Validate that each transfer has transferred correctly
int const numTransfers = transferList.size(); int const numTransfers = transferList.size();
for (auto transfer : transferList) for (auto transfer : transferList)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset); CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset);
// Report timings // Report timings
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000; totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (numTransfers * N * sizeof(float) / 1.0E6) / totalCpuTime; double totalBandwidthGbs = (numTransfers * N * sizeof(float) / 1.0E6) / totalCpuTime;
double maxGpuTime = 0; double maxGpuTime = 0;
if (ev.useSingleStream) if (ev.useSingleStream)
{
for (auto& exeInfoPair : transferMap)
{ {
for (auto& exeInfoPair : transferMap) ExecutorInfo exeInfo = exeInfoPair.second;
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
// Compute total time for CPU executors
if (!IsGpuType(exeMemType))
{ {
ExecutorInfo const& exeInfo = exeInfoPair.second; exeInfo.totalTime = 0;
MemType const exeMemType = exeInfoPair.first.first; for (auto const& transfer : exeInfo.transfers)
int const exeIndex = exeInfoPair.first.second; exeInfo.totalTime = std::max(exeInfo.totalTime, transfer.transferTime);
}
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations); double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f; double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) /
maxGpuTime = std::max(maxGpuTime, exeDurationMsec); exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n", printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec); MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec);
for (auto transfer : exeInfo.transfers)
{
double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer.transferIndex,
transferBandwidthGbs,
transferDurationMsec,
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
MemTypeStr[transfer.dstMemType], transfer.dstIndex);
}
}
else
{
printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
testNum, N * sizeof(float),
MemTypeStr[exeMemType], exeIndex,
exeBandwidthGbs, exeDurationMsec,
ev.byteOffset,
ev.numWarmups, numTimedIterations);
}
} }
}
else for (auto const& transfer : exeInfo.transfers)
{
for (auto transfer : transferList)
{ {
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f; double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n", printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer->transferIndex, transfer.transferIndex,
MemTypeStr[transfer->srcMemType], transfer->srcIndex, transferBandwidthGbs,
MemTypeStr[transfer->exeMemType], transfer->exeIndex, transferDurationMsec,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse, MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex, MemTypeStr[transfer.exeMemType], transfer.exeIndex,
transferBandwidthGbs, transferDurationMsec, transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
GetTransferDesc(*transfer).c_str()); MemTypeStr[transfer.dstMemType], transfer.dstIndex);
} }
else else
{ {
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n", printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
testNum, N * sizeof(float), testNum, N * sizeof(float),
MemTypeStr[transfer->srcMemType], transfer->srcIndex, MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex, MemTypeStr[transfer.exeMemType], transfer.exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex, MemTypeStr[transfer.dstMemType], transfer.dstIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse, transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str(), GetTransferDesc(transfer).c_str(),
transfer->srcMem + initOffset, transfer->dstMem + initOffset, transfer.srcMem + initOffset, transfer.dstMem + initOffset,
ev.byteOffset, ev.byteOffset,
ev.numWarmups, numTimedIterations); ev.numWarmups, numTimedIterations);
} }
} }
}
// Display aggregate statistics if (ev.outputToCsv)
if (!ev.outputToCsv) {
{ printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime, testNum, N * sizeof(float),
totalCpuTime - maxGpuTime); MemTypeStr[exeMemType], exeIndex,
exeBandwidthGbs, exeDurationMsec,
ev.byteOffset,
ev.numWarmups, numTimedIterations);
}
} }
else }
else
{
for (auto const& transfer : transferList)
{ {
printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n", double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset, double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
ev.numWarmups, numTimedIterations); maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
if (!ev.outputToCsv)
{
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n",
transfer->transferIndex,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str());
}
else
{
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
testNum, N * sizeof(float),
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str(),
transfer->srcMem + initOffset, transfer->dstMem + initOffset,
ev.byteOffset,
ev.numWarmups, numTimedIterations);
}
} }
} }
// Release GPU memory // Display aggregate statistics
for (auto exeInfoPair : transferMap) if (!ev.outputToCsv)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n",
for (auto& transfer : exeInfo.transfers) totalBandwidthGbs, totalCpuTime, totalCpuTime - maxGpuTime);
{ }
// Get some aliases to Transfer variables else
MemType const& exeMemType = transfer.exeMemType; {
MemType const& srcMemType = transfer.srcMemType; printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
MemType const& dstMemType = transfer.dstMemType; testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset,
ev.numWarmups, numTimedIterations);
// Allocate (maximum) source / destination memory based on type / device index }
DeallocateMemory(srcMemType, transfer.srcMem); }
DeallocateMemory(dstMemType, transfer.dstMem);
transfer.blockParam.clear();
}
MemType const exeMemType = exeInfoPair.first.first; // Release GPU memory
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType); for (auto exeInfoPair : transferMap)
if (exeMemType == MEM_GPU) {
ExecutorInfo& exeInfo = exeInfoPair.second;
for (auto& transfer : exeInfo.transfers)
{
// Get some aliases to Transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
// Allocate (maximum) source / destination memory based on type / device index
DeallocateMemory(srcMemType, transfer.srcMem);
DeallocateMemory(dstMemType, transfer.dstMem);
transfer.blockParam.clear();
}
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType);
if (exeMemType == MEM_GPU)
{
DeallocateMemory(exeMemType, exeInfo.blockParamGpu);
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
{ {
DeallocateMemory(exeMemType, exeInfo.blockParamGpu); HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size(); HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
for (int i = 0; i < numTransfersToRun; ++i) HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
{
HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
}
} }
} }
} }
fclose(fp);
return 0;
} }
void DisplayUsage(char const* cmdName) void DisplayUsage(char const* cmdName)
...@@ -461,10 +496,10 @@ void DisplayUsage(char const* cmdName) ...@@ -461,10 +496,10 @@ void DisplayUsage(char const* cmdName)
printf(" config: Either:\n"); printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n"); printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset benchmark:\n"); printf(" - Name of preset benchmark:\n");
printf(" p2p - All CPU/GPU pairs benchmark\n"); printf(" p2p{_rr} - All CPU/GPU pairs benchmark {with remote reads}\n");
printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n"); printf(" g2g{_rr} - All GPU/GPU pairs benchmark {with remote reads}\n");
printf(" g2g - All GPU/GPU pairs benchmark\n"); printf(" sweep - Sweep across possible sets of Transfers\n");
printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n"); printf(" rsweep - Randomly sweep across possible sets of Transfers\n");
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n"); printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n"); printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
...@@ -649,15 +684,15 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus ...@@ -649,15 +684,15 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
} }
// Helper function to parse a list of Transfer definitions // Helper function to parse a list of Transfer definitions
void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferMap) void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>& transfers)
{ {
// Replace any round brackets or '->' with spaces, // Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++) for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
transferMap.clear(); transfers.clear();
int numTransfers = 0;
int numTransfers = 0;
std::istringstream iss(line); std::istringstream iss(line);
iss >> numTransfers; iss >> numTransfers;
if (iss.fail()) return; if (iss.fail()) return;
...@@ -665,75 +700,43 @@ void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferM ...@@ -665,75 +700,43 @@ void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferM
std::string exeMem; std::string exeMem;
std::string srcMem; std::string srcMem;
std::string dstMem; std::string dstMem;
if (numTransfers > 0)
// If numTransfers < 0, read quads (srcMem, exeMem, dstMem, #CUs)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const perTransferCUs = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numBlocksToUse;
if (!perTransferCUs)
{ {
// Method 1: Take in triples (srcMem, exeMem, dstMem)
int numBlocksToUse;
iss >> numBlocksToUse; iss >> numBlocksToUse;
if (numBlocksToUse <= 0 || iss.fail()) if (numBlocksToUse <= 0 || iss.fail())
{ {
printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse); printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse);
exit(1); exit(1);
} }
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
exit(1);
}
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
transfer.numBlocksToUse = numBlocksToUse;
// Ensure executor is either CPU or GPU
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n",
srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
}
} }
else
{
// Method 2: Read in quads (srcMem, exeMem, dstMem, Read common # blocks to use, then read (src, dst) doubles
numTransfers *= -1;
for (int i = 0; i < numTransfers; i++) for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem;
if (perTransferCUs) iss >> numBlocksToUse;
if (iss.fail())
{ {
Transfer transfer; if (perTransferCUs)
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem >> transfer.numBlocksToUse;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n"); printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
exit(1); else
} printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex); exit(1);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n"
, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
} }
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
transfer.numBlocksToUse = numBlocksToUse;
transfers.push_back(transfer);
} }
} }
...@@ -747,7 +750,13 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId) ...@@ -747,7 +750,13 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId)
exit(1); exit(1);
} }
HIP_CALL(hipSetDevice(deviceId)); HIP_CALL(hipSetDevice(deviceId));
HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0)); hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
{
printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
deviceId, peerDeviceId, hipGetErrorString(error));
exit(1);
}
} }
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr) void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
...@@ -982,7 +991,8 @@ std::string GetTransferDesc(Transfer const& transfer) ...@@ -982,7 +991,8 @@ std::string GetTransferDesc(Transfer const& transfer)
+ GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex); + GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex);
} }
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx) void RunTransfer(EnvVars const& ev, size_t const N, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{ {
Transfer& transfer = exeInfo.transfers[transferIdx]; Transfer& transfer = exeInfo.transfers[transferIdx];
...@@ -1348,3 +1358,151 @@ int GetWallClockRate(int deviceId) ...@@ -1348,3 +1358,151 @@ int GetWallClockRate(int deviceId)
} }
return wallClockPerDeviceMhz[deviceId]; return wallClockPerDeviceMhz[deviceId];
} }
// Runs the "sweep" / "randomsweep" preset benchmarks: builds the set of all
// possible (SRC -> EXE -> DST) Transfer triplets permitted by the sweep
// environment variables (SWEEP_SRC / SWEEP_EXE / SWEEP_DST), then repeatedly
// executes subsets of them via ExecuteTransfers until a test-count, time, or
// enumeration limit is reached.
//   ev                  - environment-variable settings (sweep limits, device counts, ...)
//   numBytesPerTransfer - bytes copied per Transfer (converted below to a float count)
//   isRandom            - false: exhaustively enumerate subsets ("sweep")
//                         true : execute randomly chosen subsets ("randomsweep")
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom)
{
  ev.DisplaySweepEnvVars();

  // Every test in the sweep uses this single transfer size (in floats)
  std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));

  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
  // Expand each executor memory-type character into one entry per device,
  // and remember whether any CPU / GPU executor is available at all
  bool hasCpuExecutor = false;
  bool hasGpuExecutor = false;
  std::vector<std::pair<MemType, int>> exeList;
  for (auto exe : ev.sweepExe)
  {
    MemType const exeMemType = CharToMemType(exe);
    int numDevices;
    if (IsGpuType(exeMemType))
    {
      numDevices = ev.numGpuDevices;
      hasGpuExecutor = true;
    }
    else
    {
      numDevices = ev.numCpuDevices;
      hasCpuExecutor = true;
    }
    for (int exeIndex = 0; exeIndex < numDevices; ++exeIndex)
      exeList.push_back(std::make_pair(exeMemType, exeIndex));
  }
  // When SWEEP_SRC_IS_EXE is set the executor is derived from the source,
  // so the executor dimension collapses to 1
  int numExes = ev.sweepSrcIsExe ? 1 : exeList.size();

  // Expand source memory-type characters into one entry per device
  std::vector<std::pair<MemType, int>> srcList;
  for (auto src : ev.sweepSrc)
  {
    MemType const srcMemType = CharToMemType(src);
    int const numDevices = IsGpuType(srcMemType) ? ev.numGpuDevices : ev.numCpuDevices;

    // Skip source memory type if executor is supposed to be source but not specified
    // NOTE(review): this filter is applied unconditionally, not only when
    // ev.sweepSrcIsExe is set — confirm that is intended
    if ((IsGpuType(srcMemType) && !hasGpuExecutor) ||
        (!IsGpuType(srcMemType) && !hasCpuExecutor)) continue;

    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
      srcList.push_back(std::make_pair(srcMemType, srcIndex));
  }
  int numSrcs = srcList.size();

  // Expand destination memory-type characters into one entry per device
  std::vector<std::pair<MemType, int>> dstList;
  for (auto dst : ev.sweepDst)
  {
    MemType const dstMemType = CharToMemType(dst);
    int const numDevices = IsGpuType(dstMemType) ? ev.numGpuDevices : ev.numCpuDevices;
    for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
      dstList.push_back(std::make_pair(dstMemType, dstIndex));
  }
  int numDsts = dstList.size();

  // Total number of distinct triplets; SWEEP_MAX == 0 means "no cap"
  int const numPossible = numSrcs * numExes * numDsts;
  int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
  if (ev.sweepSrcIsExe)
  {
    printf("Num possible (SRC/DST) triplets: (%d/%d) = %d\n", numSrcs, numDsts, numPossible);
  }
  else
  {
    printf("Num possible (SRC/EXE/DST) triplets: (%d/%d/%d) = %d\n", numSrcs, numExes, numDsts, numPossible);
  }
  if (ev.sweepMin > numPossible)
  {
    printf("No valid test configurations exist\n");
    return;
  }

  int numTestsRun = 0;
  int M = ev.sweepMin;

  // Create bitmask of numPossible triplets, of which M will be chosen
  // (a '1' at position i selects triplet i for the current test)
  std::string bitmask(M, 1); bitmask.resize(numPossible, 0);

  // NOTE(review): rng is default-constructed and rand() is never seeded here,
  // so a "randomsweep" run is deterministic across invocations — confirm
  // whether seeding (e.g. from an env var) is desired
  auto rng = std::default_random_engine {};
  auto cpuStart = std::chrono::high_resolution_clock::now();

  while (1)
  {
    if (isRandom)
    {
      // Pick random number of simultaneous transfers to execute
      // NOTE: This currently skews distribution due to some #s having more possibilities than others
      // NOTE(review): rand() % (max - min) yields [min, max-1], so
      // maxParallelTransfers itself is never selected in random mode — confirm intended
      M = ((maxParallelTransfers > ev.sweepMin) ? (rand() % (maxParallelTransfers - ev.sweepMin)) : 0)
        + ev.sweepMin;

      // Generate a random bitmask
      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
      std::shuffle(bitmask.begin(), bitmask.end(), rng);
    }

    // Convert bitmask to list of Transfers
    std::vector<Transfer> transfers;
    for (int value = 0; value < numPossible; ++value)
    {
      if (bitmask[value])
      {
        // Convert integer value to (SRC->EXE->DST) triplet
        // (value is a mixed-radix number: src-major, then exe, then dst)
        Transfer transfer;
        int srcValue = value / numDsts / numExes;
        int exeValue = value / numDsts % numExes;
        int dstValue = value % numDsts;
        transfer.srcMemType = srcList[srcValue].first;
        transfer.srcIndex   = srcList[srcValue].second;
        // With SWEEP_SRC_IS_EXE the executor mirrors the source device
        transfer.exeMemType = ev.sweepSrcIsExe ? transfer.srcMemType : exeList[exeValue].first;
        transfer.exeIndex   = ev.sweepSrcIsExe ? transfer.srcIndex   : exeList[exeValue].second;
        transfer.dstMemType = dstList[dstValue].first;
        transfer.dstIndex   = dstList[dstValue].second;
        // GPU executors use a fixed 4 blocks; CPU executors use NUM_CPU_PER_TRANSFER
        // NOTE(review): the GPU block count 4 is hard-coded — consider an env var
        transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? 4 : ev.numCpuPerTransfer;
        transfer.transferIndex = transfers.size();
        transfers.push_back(transfer);
      }
    }
    ExecuteTransfers(ev, ++numTestsRun, valuesOfN, transfers);

    // Check for test limit
    if (numTestsRun == ev.sweepTestLimit)
    {
      printf("Test limit reached\n");
      break;
    }

    // Check for time limit (wall-clock seconds since the sweep started)
    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
    if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
    {
      printf("Time limit exceeded\n");
      break;
    }

    // Increment bitmask if not random sweep
    // (prev_permutation walks all C(numPossible, M) subsets; when exhausted,
    //  move on to subsets of size M+1)
    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
    {
      M++;

      // Check for completion
      if (M > maxParallelTransfers)
      {
        printf("Sweep complete\n");
        break;
      }

      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
    }
  }
}
...@@ -61,8 +61,27 @@ typedef enum ...@@ -61,8 +61,27 @@ typedef enum
MEM_GPU_FINE = 3 // Fine-grained global GPU memory MEM_GPU_FINE = 3 // Fine-grained global GPU memory
} MemType; } MemType;
// Returns true when the given memory type lives on a GPU
// (coarse- or fine-grained global GPU memory)
bool IsGpuType(MemType m)
{
  switch (m)
  {
  case MEM_GPU:
  case MEM_GPU_FINE:
    return true;
  default:
    return false;
  }
}
char const MemTypeStr[5] = "CGBF"; char const MemTypeStr[5] = "CGBF";
// Converts a single memory-type character (as used in config files and the
// SWEEP_* environment variables) to its MemType enum value.
// Terminates the program on any unrecognized character.
MemType inline CharToMemType(char const c)
{
  if (c == 'C') return MEM_CPU;       // coarse-grained CPU memory
  if (c == 'G') return MEM_GPU;       // coarse-grained GPU memory
  if (c == 'B') return MEM_CPU_FINE;  // fine-grained CPU memory
  if (c == 'F') return MEM_GPU_FINE;  // fine-grained GPU memory

  printf("[ERROR] Unexpected mem type (%c)\n", c);
  exit(1);
}
typedef enum typedef enum
{ {
MODE_FILL = 0, // Fill data with pattern MODE_FILL = 0, // Fill data with pattern
...@@ -141,7 +160,10 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus ...@@ -141,7 +160,10 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
MemType* memType, int* memIndex); MemType* memType, int* memIndex);
void ParseTransfers(char* line, int numCpus, int numGpus, void ParseTransfers(char* line, int numCpus, int numGpus,
TransferMap& transferMap); std::vector<Transfer>& transfers);
void ExecuteTransfers(EnvVars const& ev, int testNum, std::vector<size_t> const& valuesOfN,
std::vector<Transfer>& transfers);
void EnablePeerAccess(int const deviceId, int const peerDeviceId); void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr); void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
...@@ -150,6 +172,7 @@ void CheckPages(char* byteArray, size_t numBytes, int targetId); ...@@ -150,6 +172,7 @@ void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx); void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu); void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom);
// Return the maximum bandwidth measured for given (src/dst) pair // Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev, double GetPeakBandwidth(EnvVars const& ev,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment