Merge pull request #1 from gilbertlee-amd/SweepPreset

Implementing sweep preset

Merge pull request #1 from gilbertlee-amd/SweepPreset
Implementing sweep preset
9598bdb6 · gilbertlee-amd · GitHub · ddb6508f · 93430da1 · 9598bdb6
Unverified Commit 9598bdb6 authored Aug 15, 2022 by gilbertlee-amd Committed by GitHub Aug 15, 2022
Showing with 725 additions and 358 deletions

CHANGELOG.md CHANGELOG.md +20 -0

EnvVars.hpp EnvVars.hpp +169 -17

README.md README.md +14 -0

TransferBench.cpp TransferBench.cpp +498 -340

TransferBench.hpp TransferBench.hpp +24 -1

No files found.
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog for TransferBench

+## v1.03
+### Added
+- New stress-test preset benchmarks "sweep" and "rsweep"
+  - sweep iterates over all possible sets of Transfers to test
+  - rsweep iterates over random sets of Transfers
+  - New sweep-only environment variables can modify sweep
+     - SWEEP_SRC - String containing only "B","C","F", or "G", defining possible source memory types
+     - SWEEP_EXE - String containing only "C", or "G", defining possible executors
+     - SWEEP_DST - String containing only "B","C","F", or "G", defining possible destination memory types
+     - SWEEP_SRC_IS_EXE - Restrict executor to be the same as the source if non-zero
+     - SWEEP_MIN - Minimum number of parallel transfers to test
+     - SWEEP_MAX - Maximum number of parallel transfers to test
+     - SWEEP_TEST_LIMIT - Maximum number of tests to run
+     - SWEEP_TIME_LIMIT - Maximum number of seconds to run tests for
+- New environment variables to restrict the number of available CPU/GPU devices to test on (primarily for sweep runs)
+  - NUM_CPU_DEVICES - Number of CPU devices
+  - NUM_GPU_DEVICES - Number of GPU devices
+### Changed
+- Fixed timing display for CPU-executors when using single stream mode
+
 ## v1.02
 ### Added
 - Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test

--- a/EnvVars.hpp
+++ b/EnvVars.hpp
@@ -25,7 +25,9 @@ THE SOFTWARE.

 #include <algorithm>

-#define TB_VERSION "1.02"
+#define TB_VERSION "1.03"
+
+extern char const MemTypeStr[];

 // This class manages environment variable that affect TransferBench
 class EnvVars
@@ -37,10 +39,21 @@ public:
  int const DEFAULT_SAMPLING_FACTOR      =  1;
  int const DEFAULT_NUM_CPU_PER_TRANSFER =  4;

+  int const DEFAULT_SWEEP_SRC_IS_EXE  = 0;
+  std::string const DEFAULT_SWEEP_SRC = "CG";
+  std::string const DEFAULT_SWEEP_EXE = "CG";
+  std::string const DEFAULT_SWEEP_DST = "CG";
+  int const DEFAULT_SWEEP_MIN         = 1;
+  int const DEFAULT_SWEEP_MAX         = 24;
+  int const DEFAULT_SWEEP_TEST_LIMIT  = 0;
+  int const DEFAULT_SWEEP_TIME_LIMIT  = 0;
+
  // Environment variables
  int blockBytes;        // Each CU, except the last, gets a multiple of this many bytes to copy
  int byteOffset;        // Byte-offset for memory allocations
+  int numCpuDevices;     // Number of CPU devices to use (defaults to # NUMA nodes detected)
  int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer
+  int numGpuDevices;     // Number of GPU devices to use (defaults to # HIP devices detected)
  int numIterations;     // Number of timed iterations to perform.  If negative, run for -numIterations seconds instead
  int numWarmups;        // Number of un-timed warmup iterations to perform
  int outputToCsv;       // Output in CSV format
@@ -54,6 +67,16 @@ public:

  std::vector<float> fillPattern; // Pattern of floats used to fill source data

+  // Environment variables only for Sweep-preset
+  int sweepSrcIsExe;     // Non-zero if executor should always be the same as source
+  int sweepMin;          // Min number of simultaneous Transfers to be executed per test
+  int sweepMax;          // Max number of simultaneous Transfers to be executed per test
+  int sweepTestLimit;    // Max number of tests to run during sweep (0 = no limit)
+  int sweepTimeLimit;    // Max number of seconds to run sweep for  (0 = no limit)
+  std::string sweepSrc;  // Set of src memory types to be swept
+  std::string sweepExe;  // Set of executors to be swept
+  std::string sweepDst;  // Set of dst memory types to be swept
+
  // Constructor that collects values
  EnvVars()
  {
@@ -61,9 +84,15 @@ public:
    hipDeviceGetAttribute(&maxSharedMemBytes,
                          hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);

+    int numDetectedCpus = numa_num_configured_nodes();
+    int numDetectedGpus = 0;              // Initialized so a failed HIP query cannot leave it indeterminate
+    hipGetDeviceCount(&numDetectedGpus);  // Detected count is the default and upper bound for NUM_GPU_DEVICES below
+
    blockBytes        = GetEnvVar("BLOCK_BYTES"         , 256);
    byteOffset        = GetEnvVar("BYTE_OFFSET"         , 0);
+    numCpuDevices     = GetEnvVar("NUM_CPU_DEVICES"     , numDetectedCpus);
    numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER);
+    numGpuDevices     = GetEnvVar("NUM_GPU_DEVICES"     , numDetectedGpus);
    numIterations     = GetEnvVar("NUM_ITERATIONS"      , DEFAULT_NUM_ITERATIONS);
    numWarmups        = GetEnvVar("NUM_WARMUPS"         , DEFAULT_NUM_WARMUPS);
    outputToCsv       = GetEnvVar("OUTPUT_TO_CSV"       , 0);
@@ -75,6 +104,15 @@ public:
    usePcieIndexing   = GetEnvVar("USE_PCIE_INDEX"      , 0);
    useSingleStream   = GetEnvVar("USE_SINGLE_STREAM"   , 0);

+    sweepSrcIsExe     = GetEnvVar("SWEEP_SRC_IS_EXE", DEFAULT_SWEEP_SRC_IS_EXE);
+    sweepMin          = GetEnvVar("SWEEP_MIN", DEFAULT_SWEEP_MIN);
+    sweepMax          = GetEnvVar("SWEEP_MAX", DEFAULT_SWEEP_MAX);
+    sweepSrc          = GetEnvVar("SWEEP_SRC", DEFAULT_SWEEP_SRC);
+    sweepExe          = GetEnvVar("SWEEP_EXE", DEFAULT_SWEEP_EXE);
+    sweepDst          = GetEnvVar("SWEEP_DST", DEFAULT_SWEEP_DST);
+    sweepTestLimit    = GetEnvVar("SWEEP_TEST_LIMIT", DEFAULT_SWEEP_TEST_LIMIT);
+    sweepTimeLimit    = GetEnvVar("SWEEP_TIME_LIMIT", DEFAULT_SWEEP_TIME_LIMIT);
+
    // Check for fill pattern
    char* pattern = getenv("FILL_PATTERN");
    if (pattern != NULL)
@@ -134,6 +172,16 @@ public:
    else fillPattern.clear();

    // Perform some basic validation
+    if (numCpuDevices > numDetectedCpus)
+    {
+      printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
+      exit(1);
+    }
+    if (numGpuDevices > numDetectedGpus)
+    {
+      printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
+      exit(1);
+    }
    if (byteOffset % sizeof(float))
    {
      printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
@@ -169,6 +217,49 @@ public:
      printf("[ERROR] Single stream mode cannot be used with HIP calls\n");
      exit(1);
    }
+
+    for (auto ch : sweepSrc)
+    {
+      if (!strchr(MemTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
+        exit(1);
+      }
+    }
+
+    for (auto ch : sweepDst)
+    {
+      if (!strchr(MemTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
+        exit(1);
+      }
+    }
+
+    char const* permittedExecutors = "CG";
+    for (auto ch : sweepExe)
+    {
+      if (!strchr(permittedExecutors, ch))
+      {
+        printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
+        exit(1);
+      }
+    }
  }

  // Display info on the env vars that can be used
@@ -179,7 +270,9 @@ public:
    printf(" BLOCK_BYTES=B          - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
    printf(" BYTE_OFFSET            - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
    printf(" FILL_PATTERN=STR       - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F).  Must be even number of digits, (byte-level big-endian)\n");
+    printf(" NUM_CPU_DEVICES=X      - Restrict number of CPUs to X.  May not be greater than # detected NUMA nodes\n");
    printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
+    printf(" NUM_GPU_DEVICES=X      - Restrict number of GCPUs to X.  May not be greater than # detected HIP devices\n");
    printf(" NUM_ITERATIONS=I       - Perform I timed iteration(s) per test\n");
    printf(" NUM_WARMUPS=W          - Perform W untimed warmup iteration(s) per test\n");
    printf(" OUTPUT_TO_CSV          - Outputs to CSV format if set\n");
@@ -207,8 +300,10 @@ public:
      else
        printf("Pseudo-random: (Element i = i modulo 383 + 31)");
      printf("\n");
-      printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
-      printf("%-20s = %12d : Running %d %s per topology\n", "NUM_ITERATIONS", numIterations,
+      printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
+      printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
+      printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
+      printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
             numIterations > 0 ? numIterations : -numIterations,
             numIterations > 0 ? "timed iteration(s)" : "second(s)");
      printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
@@ -236,13 +331,70 @@ public:
    }
  };

+  // Display env var settings
+  void DisplaySweepEnvVars() const
+  {
+    if (!outputToCsv)
+    {
+      printf("Sweep configuration (TransferBench v%s)\n", TB_VERSION);
+      printf("=====================================================\n");
+      printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
+      printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
+      printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
+      printf("%-20s = %12d : Transfer executor %s Transfer source\n", "SWEEP_SRC_IS_EXE", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
+      printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin);
+      printf("%-20s = %12d : Max simultaneous Transfers              (0 = no limit)\n", "SWEEP_MAX", sweepMax);
+      printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
+      printf("%-20s = %12d : Max number of seconds to run sweep for  (0 = no limit)\n", "SWEEP_TIME_LIMIT", sweepTimeLimit);
+      printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
+      printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
+      printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
+      printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
+      printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
+      printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
+      if (fillPattern.size())
+        printf("Pattern: %s", getenv("FILL_PATTERN"));
+      else
+        printf("Pseudo-random: (Element i = i modulo 383 + 31)");
+      printf("\n");
+      printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
+             numIterations > 0 ? numIterations : -numIterations,
+             numIterations > 0 ? "timed iteration(s)" : "second(s)");
+      printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
+      printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
+             outputToCsv ? "CSV" : "console");
+      printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
+             getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
+      printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
+             useHipCall ? "HIP functions" : "custom kernels");
+      if (useHipCall && !useMemset)
+      {
+        char* env = getenv("HSA_ENABLE_SDMA");
+        printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
+               (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
+      }
+      printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
+             usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
+      printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
+             useSingleStream, (useSingleStream ? "device" : "Transfer"));
+      printf("\n");
+    }
+  };
+
  // Helper function that gets parses environment variable or sets to default value
-  static int GetEnvVar(std::string const varname, int defaultValue)
+  static int GetEnvVar(std::string const& varname, int defaultValue)
  {
    if (getenv(varname.c_str()))
      return atoi(getenv(varname.c_str()));
    return defaultValue;
  }
+
+  static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
+  {
+    if (getenv(varname.c_str()))
+      return getenv(varname.c_str());
+    return defaultValue;
+  }
 };

 #endif
--- a/README.md
+++ b/README.md
@@ -12,3 +12,17 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
 * `make`

  If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
+
+
+## Hints and suggestions
+- Running TransferBench with no arguments will display usage instructions and detected topology information
+- There are several preset configurations that can be used instead of a configuration file
+  including:
+  - p2p    - Peer to peer benchmark test
+  - sweep  - Sweep across possible sets of Transfers
+  - rsweep - Random sweep across possible sets of Transfers
+- When using the same GPU executor in multiple simultaneous Transfers, performance may be
+  serialized due to the maximum number of hardware queues available.
+  - The maximum number of hardware queues can be adjusted via GPU_MAX_HW_QUEUES
+  - Alternatively, running in single stream mode (USE_SINGLE_STREAM=1) may avoid this issue
+    by launching all Transfers on a single stream instead of individual streams
--- a/TransferBench.cpp
+++ b/TransferBench.cpp
@@ -24,6 +24,7 @@ THE SOFTWARE.
 // on the same node
 #include <numa.h>
 #include <numaif.h>
+#include <random>
 #include <stack>
 #include <thread>

@@ -33,6 +34,13 @@ THE SOFTWARE.

 int main(int argc, char **argv)
 {
+  // Check for NUMA library support
+  if (numa_available() == -1)
+  {
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
+  }
+
  // Display usage instructions and detected topology
  if (argc <= 1)
  {
@@ -63,13 +71,15 @@ int main(int argc, char **argv)
  }
  PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN);

-  // Find the largest N to be used - memory will only be allocated once per set of simulatenous Transfers
-  size_t maxN = valuesOfN[0];
-  for (auto N : valuesOfN)
-    maxN = std::max(maxN, N);
-
-  // Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test
-  if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
+  // Check for preset tests
+  // - Tests that sweep across possible sets of Transfers
+  if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
+  {
+    RunSweepPreset(ev, numBytesPerTransfer, !strcmp(argv[1], "rsweep"));
+    exit(0);
+  }
+  // - Tests that benchmark peer-to-peer performance
+  else if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
           !strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr"))
  {
    int numBlocksToUse = 0;
@@ -96,33 +106,14 @@ int main(int argc, char **argv)
    exit(1);
  }

-  // Check for NUMA library support
-  if (numa_available() == -1)
-  {
-    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
-    exit(1);
-  }
+  // Print environment variables and CSV header
  ev.DisplayEnvVars();
-
-  int const initOffset = ev.byteOffset / sizeof(float);
-  std::stack<std::thread> threads;
-
-  // Collect the number of available CPUs/GPUs on this machine
-  int numGpuDevices;
-  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
-  int const numCpuDevices = numa_num_configured_nodes();
-
-  // Track unique pair of transfers that get used
-  std::set<std::pair<int, int>> peerAccessTracker;
-
-  // Print CSV header
  if (ev.outputToCsv)
  {
    printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms),"
           "TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n");
  }

-  // Loop over each line in the Transfer configuration file
  int testNum = 0;
  char line[2048];
  while(fgets(line, 2048, fp))
@@ -130,21 +121,49 @@ int main(int argc, char **argv)
    // Check if line is a comment to be echoed to output (starts with ##)
    if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);

-    // Parse transfers from configuration file
-    TransferMap transferMap;
-    ParseTransfers(line, numCpuDevices, numGpuDevices, transferMap);
-    if (transferMap.size() == 0) continue;
+    // Parse set of parallel Transfers to execute
+    std::vector<Transfer> transfers;
+    ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
+    if (transfers.empty()) continue;
+
+    ExecuteTransfers(ev, ++testNum, valuesOfN, transfers);
+  }
+  fclose(fp);
+
+  return 0;
+}

-    testNum++;
+void ExecuteTransfers(EnvVars const& ev,
+                      int testNum,
+                      std::vector<size_t> const& valuesOfN,
+                      std::vector<Transfer>& transfers)
+{
+  int const initOffset = ev.byteOffset / sizeof(float);
+
+  // Find the largest N to be used - memory will only be allocated once per set of Transfers
+  size_t maxN = valuesOfN[0];
+  for (auto N : valuesOfN)
+    maxN = std::max(maxN, N);

-    // Prepare (maximum) memory for each transfer
+  // Map transfers by executor
+  TransferMap transferMap;
+  for (Transfer const& transfer : transfers)
+  {
+    Executor executor(transfer.exeMemType, transfer.exeIndex);
+    ExecutorInfo& executorInfo = transferMap[executor];
+    executorInfo.transfers.push_back(transfer);
+  }
+
+  // Loop over each executor and prepare GPU resources
  std::vector<Transfer*> transferList;
  for (auto& exeInfoPair : transferMap)
  {
+    Executor const& executor = exeInfoPair.first;
    ExecutorInfo& exeInfo = exeInfoPair.second;
    exeInfo.totalTime = 0.0;
    exeInfo.totalBlocks = 0;

+    // Loop over each transfer this executor is involved in
    for (Transfer& transfer : exeInfo.transfers)
    {
      // Get some aliases to transfer variables
@@ -163,40 +182,29 @@ int main(int argc, char **argv)
      {
        // Ensure executing GPU can access source memory
        if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
-          {
-            auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
-            if (!peerAccessTracker.count(exeSrcPair))
-            {
          EnablePeerAccess(exeIndex, srcIndex);
-              peerAccessTracker.insert(exeSrcPair);
-            }
-          }

        // Ensure executing GPU can access destination memory
        if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex)
-          {
-            auto exeDstPair = std::make_pair(exeIndex, dstIndex);
-            if (!peerAccessTracker.count(exeDstPair))
-            {
          EnablePeerAccess(exeIndex, dstIndex);
-              peerAccessTracker.insert(exeDstPair);
-            }
-          }
      }

      // Allocate (maximum) source / destination memory based on type / device index
      AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem);
      AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem);
+
      transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
      exeInfo.totalBlocks += transfer.blockParam.size();
      transferList.push_back(&transfer);
    }

-      // Prepare GPU resources for GPU executors
-      MemType const exeMemType = exeInfoPair.first.first;
-      int     const exeIndex   = RemappedIndex(exeInfoPair.first.second, exeMemType);
+    // Prepare per-threadblock parameters for GPU executors
+    MemType const exeMemType = executor.first;
+    int     const exeIndex   = RemappedIndex(executor.second, exeMemType);
    if (exeMemType == MEM_GPU)
    {
+      // Allocate one contiguous chunk of GPU memory for threadblock parameters
+      // This allows support for executing one transfer per stream, or all transfers in a single stream
      AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam),
                     (void**)&exeInfo.blockParamGpu);

@@ -212,6 +220,7 @@ int main(int argc, char **argv)
        HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
      }

+      // Assign each transfer its portion of threadblock parameters
      int transferOffset = 0;
      for (int i = 0; i < exeInfo.transfers.size(); i++)
      {
@@ -232,9 +241,9 @@ int main(int argc, char **argv)
      ExecutorInfo& exeInfo = exeInfoPair.second;

      int transferOffset = 0;
-
      for (int i = 0; i < exeInfo.transfers.size(); ++i)
      {
+        // Prepare subarrays each threadblock works on and fill src memory with patterned data
        Transfer& transfer = exeInfo.transfers[i];
        transfer.PrepareBlockParams(ev, N);

@@ -253,6 +262,7 @@ int main(int argc, char **argv)
    // Launch kernels (warmup iterations are not counted)
    double totalCpuTime = 0;
    size_t numTimedIterations = 0;
+    std::stack<std::thread> threads;
    for (int iteration = -ev.numWarmups; ; iteration++)
    {
      if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
@@ -273,7 +283,8 @@ int main(int argc, char **argv)
      for (auto& exeInfoPair : transferMap)
      {
        ExecutorInfo& exeInfo = exeInfoPair.second;
-          int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
+        int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ?
+          1 : exeInfo.transfers.size();
        for (int i = 0; i < numTransfersToRun; ++i)
          threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i));
      }
@@ -319,23 +330,36 @@ int main(int argc, char **argv)
    {
      for (auto& exeInfoPair : transferMap)
      {
-          ExecutorInfo const& exeInfo = exeInfoPair.second;
+        ExecutorInfo  exeInfo    = exeInfoPair.second;
        MemType const exeMemType = exeInfoPair.first.first;
        int     const exeIndex   = exeInfoPair.first.second;

+        // Compute total time for CPU executors
+        if (!IsGpuType(exeMemType))
+        {
+          exeInfo.totalTime = 0;
+          for (auto const& transfer : exeInfo.transfers)
+            exeInfo.totalTime = std::max(exeInfo.totalTime, transfer.transferTime);
+        }
+
        double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
-          double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f;
+        double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) /
+          exeDurationMsec * 1000.0f;
        maxGpuTime = std::max(maxGpuTime, exeDurationMsec);

        if (!ev.outputToCsv)
        {
          printf(" Executor: %cPU %02d        (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n",
                 MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec);
-            for (auto transfer : exeInfo.transfers)
+        }
+
+        for (auto const& transfer : exeInfo.transfers)
        {
          double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
          double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;

+          if (!ev.outputToCsv)
+          {
            printf("                            Transfer  %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
                   transfer.transferIndex,
                   transferBandwidthGbs,
@@ -345,8 +369,23 @@ int main(int argc, char **argv)
                   transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
                   MemTypeStr[transfer.dstMemType], transfer.dstIndex);
          }
-          }
          else
+          {
+            printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
+                   testNum, N * sizeof(float),
+                   MemTypeStr[transfer.srcMemType], transfer.srcIndex,
+                   MemTypeStr[transfer.exeMemType], transfer.exeIndex,
+                   MemTypeStr[transfer.dstMemType], transfer.dstIndex,
+                   transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
+                   transferBandwidthGbs, transferDurationMsec,
+                   GetTransferDesc(transfer).c_str(),
+                   transfer.srcMem + initOffset, transfer.dstMem + initOffset,
+                   ev.byteOffset,
+                   ev.numWarmups, numTimedIterations);
+          }
+        }
+
+        if (ev.outputToCsv)
        {
          printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
                 testNum, N * sizeof(float),
@@ -359,7 +398,7 @@ int main(int argc, char **argv)
    }
    else
    {
-        for (auto transfer : transferList)
+      for (auto const& transfer : transferList)
      {
        double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
        double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
@@ -395,8 +434,8 @@ int main(int argc, char **argv)
    // Display aggregate statistics
    if (!ev.outputToCsv)
    {
-        printf(" Aggregate Bandwidth (CPU timed)         | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime,
-               totalCpuTime - maxGpuTime);
+      printf(" Aggregate Bandwidth (CPU timed)         | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n",
+             totalBandwidthGbs, totalCpuTime, totalCpuTime - maxGpuTime);
    }
    else
    {
@@ -437,10 +476,6 @@ int main(int argc, char **argv)
      }
    }
  }
-  }
-  fclose(fp);
-
-  return 0;
 }

 void DisplayUsage(char const* cmdName)
@@ -461,10 +496,10 @@ void DisplayUsage(char const* cmdName)
  printf("  config: Either:\n");
  printf("          - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
  printf("          - Name of preset benchmark:\n");
-  printf("              p2p    - All CPU/GPU pairs benchmark\n");
-  printf("              p2p_rr - All CPU/GPU pairs benchmark with remote reads\n");
-  printf("              g2g    - All GPU/GPU pairs benchmark\n");
-  printf("              g2g_rr - All GPU/GPU pairs benchmark with remote reads\n");
+  printf("              p2p{_rr} - All CPU/GPU pairs benchmark {with remote reads}\n");
+  printf("              g2g{_rr} - All GPU/GPU pairs benchmark {with remote reads}\n");
+  printf("              sweep    - Sweep across possible sets of Transfers\n");
+  printf("              rsweep   - Randomly sweep across possible sets of Transfers\n");
  printf("            - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
  printf("  N     : (Optional) Number of bytes to copy per Transfer.\n");
  printf("          If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
@@ -649,15 +684,15 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
 }

 // Helper function to parse a list of Transfer definitions
-void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferMap)
+void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>& transfers)
 {
  // Replace any round brackets or '->' with spaces,
  for (int i = 1; line[i]; i++)
    if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';

-  transferMap.clear();
-  int numTransfers = 0;
+  transfers.clear();

+  int numTransfers = 0;
  std::istringstream iss(line);
  iss >> numTransfers;
  if (iss.fail()) return;
@@ -665,75 +700,43 @@ void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferM
  std::string exeMem;
  std::string srcMem;
  std::string dstMem;
-  if (numTransfers > 0)
-  {
-    // Method 1: Take in triples (srcMem, exeMem, dstMem)
+
+  // If numTransfers < 0, read quads (srcMem, exeMem, dstMem, #CUs)
+  // otherwise read triples (srcMem, exeMem, dstMem)
+  bool const perTransferCUs = (numTransfers < 0);
+  numTransfers = abs(numTransfers);
+
  int numBlocksToUse;
+  if (!perTransferCUs)
+  {
    iss >> numBlocksToUse;
    if (numBlocksToUse <= 0 || iss.fail())
    {
      printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse);
      exit(1);
    }
-    for (int i = 0; i < numTransfers; i++)
-    {
-      Transfer transfer;
-      transfer.transferIndex = i;
-      iss >> srcMem >> exeMem >> dstMem;
-      if (iss.fail())
-      {
-        printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
-        exit(1);
-      }
-      ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
-      ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
-      ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
-      transfer.numBlocksToUse = numBlocksToUse;
-
-      // Ensure executor is either CPU or GPU
-      if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
-      {
-        printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n",
-               srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
-        exit(1);
-      }
-
-      Executor executor(transfer.exeMemType, transfer.exeIndex);
-      ExecutorInfo& executorInfo = transferMap[executor];
-      executorInfo.totalBlocks += transfer.numBlocksToUse;
-      executorInfo.transfers.push_back(transfer);
-    }
  }
-  else
-  {
-    // Method 2: Read in quads (srcMem, exeMem, dstMem,  Read common # blocks to use, then read (src, dst) doubles
-    numTransfers *= -1;

  for (int i = 0; i < numTransfers; i++)
  {
    Transfer transfer;
    transfer.transferIndex = i;
-      iss >> srcMem >> exeMem >> dstMem >> transfer.numBlocksToUse;
+    iss >> srcMem >> exeMem >> dstMem;
+    if (perTransferCUs) iss >> numBlocksToUse;
    if (iss.fail())
    {
+      if (perTransferCUs)
        printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
+      else
+        printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
      exit(1);
    }
+
    ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
    ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
    ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
-      if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
-      {
-        printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n"
-,               srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
-        exit(1);
-      }
-
-      Executor executor(transfer.exeMemType, transfer.exeIndex);
-      ExecutorInfo& executorInfo = transferMap[executor];
-      executorInfo.totalBlocks += transfer.numBlocksToUse;
-      executorInfo.transfers.push_back(transfer);
-    }
+    transfer.numBlocksToUse = numBlocksToUse;
+    transfers.push_back(transfer);
  }
 }

@@ -747,7 +750,13 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId)
    exit(1);
  }
  HIP_CALL(hipSetDevice(deviceId));
-  HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0));
+  hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
+  if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
+  {
+    printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
+           deviceId, peerDeviceId, hipGetErrorString(error));
+    exit(1);
+  }
 }

 void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
@@ -982,7 +991,8 @@ std::string GetTransferDesc(Transfer const& transfer)
    + GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex);
 }

-void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx)
+void RunTransfer(EnvVars const& ev, size_t const N, int const iteration,
+                 ExecutorInfo& exeInfo, int const transferIdx)
 {
  Transfer& transfer = exeInfo.transfers[transferIdx];

@@ -1348,3 +1358,151 @@ int GetWallClockRate(int deviceId)
  }
  return wallClockPerDeviceMhz[deviceId];
 }
+
+// Runs the "sweep" / "randomsweep" preset: repeatedly builds a set of Transfers out of the
+// permitted (SRC, EXE, DST) triplets and passes each set to ExecuteTransfers.
+//
+// - ev                  : Environment settings (sweepSrc / sweepExe / sweepDst strings, sweepMin /
+//                         sweepMax, sweepTestLimit, sweepTimeLimit, device counts)
+// - numBytesPerTransfer : Size of each Transfer in bytes (converted to a count of floats)
+// - isRandom            : false - enumerate every combination of M triplets, for M = sweepMin..max
+//                         true  - pick a random number of random triplets for every test
+//
+// Returns when ev.sweepTestLimit tests have run, when more than ev.sweepTimeLimit seconds have
+// elapsed (0 disables the time limit), or - for the non-random sweep - when all sets were tested.
+void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom)
+{
+  ev.DisplaySweepEnvVars();
+  std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));
+
+  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
+  bool hasCpuExecutor = false;
+  bool hasGpuExecutor = false;
+  std::vector<std::pair<MemType, int>> exeList;
+  for (auto exe : ev.sweepExe)
+  {
+    MemType const exeMemType = CharToMemType(exe);
+    int numDevices;
+    if (IsGpuType(exeMemType))
+    {
+      numDevices = ev.numGpuDevices;
+      hasGpuExecutor = true;
+    }
+    else
+    {
+      numDevices = ev.numCpuDevices;
+      hasCpuExecutor = true;
+    }
+    for (int exeIndex = 0; exeIndex < numDevices; ++exeIndex)
+      exeList.push_back(std::make_pair(exeMemType, exeIndex));
+  }
+  // When the executor is tied to the source it adds no extra degree of freedom
+  int const numExes = ev.sweepSrcIsExe ? 1 : static_cast<int>(exeList.size());
+
+  std::vector<std::pair<MemType, int>> srcList;
+  for (auto src : ev.sweepSrc)
+  {
+    MemType const srcMemType = CharToMemType(src);
+    bool    const isGpuSrc   = IsGpuType(srcMemType);
+    int     const numDevices = isGpuSrc ? ev.numGpuDevices : ev.numCpuDevices;
+
+    // Only when the executor is supposed to be the source: skip source memory types whose
+    // device class (CPU / GPU) was not listed as an executor.  Otherwise every source may be
+    // paired with any listed executor, so nothing is skipped.
+    if (ev.sweepSrcIsExe &&
+        ((isGpuSrc && !hasGpuExecutor) || (!isGpuSrc && !hasCpuExecutor))) continue;
+
+    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
+      srcList.push_back(std::make_pair(srcMemType, srcIndex));
+  }
+  int const numSrcs = static_cast<int>(srcList.size());
+
+  std::vector<std::pair<MemType, int>> dstList;
+  for (auto dst : ev.sweepDst)
+  {
+    MemType const dstMemType = CharToMemType(dst);
+    int const numDevices = IsGpuType(dstMemType) ? ev.numGpuDevices : ev.numCpuDevices;
+
+    for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
+      dstList.push_back(std::make_pair(dstMemType, dstIndex));
+  }
+  int const numDsts = static_cast<int>(dstList.size());
+
+  int const numPossible = numSrcs * numExes * numDsts;
+  // A set can never hold more than numPossible distinct triplets, so clamp the requested
+  // maximum; otherwise the sweep would keep re-running the "all triplets" set
+  int const maxParallelTransfers = (ev.sweepMax == 0 ? numPossible
+                                                     : std::min<int>(ev.sweepMax, numPossible));
+  if (ev.sweepSrcIsExe)
+  {
+    printf("Num possible (SRC/DST) triplets: (%d/%d) = %d\n", numSrcs, numDsts, numPossible);
+  }
+  else
+  {
+    printf("Num possible (SRC/EXE/DST) triplets: (%d/%d/%d) = %d\n", numSrcs, numExes, numDsts, numPossible);
+  }
+
+  if (ev.sweepMin > numPossible)
+  {
+    printf("No valid test configurations exist\n");
+    return;
+  }
+
+  int numTestsRun = 0;
+  int M = ev.sweepMin;
+  // Create bitmask of numPossible triplets, of which M will be chosen
+  std::string bitmask(M, 1);  bitmask.resize(numPossible, 0);
+
+  // Resets the bitmask so that exactly the first numSet entries are selected
+  auto resetBitmask = [&bitmask](int const numSet)
+  {
+    for (size_t i = 0; i < bitmask.size(); i++)
+      bitmask[i] = (static_cast<int>(i) < numSet) ? 1 : 0;
+  };
+
+  auto rng = std::default_random_engine {};
+  auto cpuStart = std::chrono::high_resolution_clock::now();
+  while (1)
+  {
+    if (isRandom)
+    {
+      // Pick random number of simultaneous transfers to execute, in [sweepMin, maxParallelTransfers]
+      // (the upper bound is inclusive, matching the non-random sweep)
+      // NOTE: This currently skews distribution due to some #s having more possibilities than others
+      M = ev.sweepMin + ((maxParallelTransfers > ev.sweepMin)
+                         ? (rand() % (maxParallelTransfers - ev.sweepMin + 1)) : 0);
+
+      // Generate a random bitmask
+      resetBitmask(M);
+      std::shuffle(bitmask.begin(), bitmask.end(), rng);
+    }
+
+    // Convert bitmask to list of Transfers
+    std::vector<Transfer> transfers;
+    for (int value = 0; value < numPossible; ++value)
+    {
+      if (!bitmask[value]) continue;
+
+      // Convert integer value to (SRC->EXE->DST) triplet
+      // (value is a mixed-radix number: src is most significant, dst least significant)
+      int const srcValue = value / numDsts / numExes;
+      int const exeValue = value / numDsts % numExes;
+      int const dstValue = value % numDsts;
+
+      Transfer transfer;
+      transfer.srcMemType = srcList[srcValue].first;
+      transfer.srcIndex   = srcList[srcValue].second;
+      if (ev.sweepSrcIsExe)
+      {
+        // Executor runs on the source's device.  Executors are only CPU ('C') or GPU ('G'),
+        // so a fine-grained source type maps onto the matching base executor type
+        transfer.exeMemType = IsGpuType(transfer.srcMemType) ? MEM_GPU : MEM_CPU;
+        transfer.exeIndex   = transfer.srcIndex;
+      }
+      else
+      {
+        transfer.exeMemType = exeList[exeValue].first;
+        transfer.exeIndex   = exeList[exeValue].second;
+      }
+      transfer.dstMemType = dstList[dstValue].first;
+      transfer.dstIndex   = dstList[dstValue].second;
+      transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? 4 : ev.numCpuPerTransfer;
+      transfer.transferIndex = static_cast<int>(transfers.size());
+      transfers.push_back(transfer);
+    }
+
+    ExecuteTransfers(ev, ++numTestsRun, valuesOfN, transfers);
+
+    // Check for test limit
+    if (numTestsRun == ev.sweepTestLimit)
+    {
+      printf("Test limit reached\n");
+      break;
+    }
+
+    // Check for time limit
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+    if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
+    {
+      printf("Time limit exceeded\n");
+      break;
+    }
+
+    // Increment bitmask if not random sweep
+    // (prev_permutation returns false once every arrangement of M selected entries was visited)
+    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
+    {
+      M++;
+      // Check for completion
+      if (M > maxParallelTransfers)
+      {
+        printf("Sweep complete\n");
+        break;
+      }
+      resetBitmask(M);
+    }
+  }
+}
--- a/TransferBench.hpp
+++ b/TransferBench.hpp
@@ -61,8 +61,27 @@ typedef enum
  MEM_GPU_FINE = 3     // Fine-grained global GPU memory
 } MemType;

+// Returns true if the memory type is one of the GPU memory types (MEM_GPU or MEM_GPU_FINE).
+// Marked inline because the definition lives in a header: without it, including this header
+// from more than one translation unit produces multiple-definition link errors.
+inline bool IsGpuType(MemType m)
+{
+  return (m == MEM_GPU || m == MEM_GPU_FINE);
+}
+
 char const MemTypeStr[5] = "CGBF";

+// Translates a single memory-type letter into its MemType value:
+//   'C' -> MEM_CPU, 'G' -> MEM_GPU, 'B' -> MEM_CPU_FINE, 'F' -> MEM_GPU_FINE
+// Any other character prints an error message and terminates the program with exit code 1.
+inline MemType CharToMemType(char const c)
+{
+  if (c == 'C') return MEM_CPU;
+  if (c == 'G') return MEM_GPU;
+  if (c == 'B') return MEM_CPU_FINE;
+  if (c == 'F') return MEM_GPU_FINE;
+
+  // Not a recognised memory-type letter
+  printf("[ERROR] Unexpected mem type (%c)\n", c);
+  exit(1);
+}
+
 typedef enum
 {
  MODE_FILL  = 0,         // Fill data with pattern
@@ -141,7 +160,10 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
                  MemType* memType, int* memIndex);

 void ParseTransfers(char* line, int numCpus, int numGpus,
-                TransferMap& transferMap);
+                    std::vector<Transfer>& transfers);
+
+void ExecuteTransfers(EnvVars const& ev, int testNum, std::vector<size_t> const& valuesOfN,
+                      std::vector<Transfer>& transfers);

 void EnablePeerAccess(int const deviceId, int const peerDeviceId);
 void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
@@ -150,6 +172,7 @@ void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
 void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
 void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
+void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom);

 // Return the maximum bandwidth measured for given (src/dst) pair
 double GetPeakBandwidth(EnvVars const& ev,