Unverified Commit be0db711 authored by gilbertlee-amd, committed by GitHub

V1.06 (#4)

* Updating version to v1.06
* Fixing CPU NUMA allocation
* Fix random sweep repeatability
* Adding unpinned CPU memory as possible memory type
* Adding ability to customize per-transfer byte sizes
* Updating advanced configuration file mode to take in numBytes per Transfer
* Adding logging of sweep test configuration to lastSweep.cfg
* Add ability to specify #CUs for sweep benchmark
parent 5331f980
# Changelog for TransferBench
## v1.06
### Added
- Added unpinned CPU memory type ('U'). May require HSA_XNACK=1 in order to be accessible by GPU executors
- Added logging of sweep configuration to lastSweep.cfg
- Added ability to specify the number of CUs to use for sweep-based presets
### Changed
- Fixed random sweep repeatability
- Fixed bug with CPU NUMA node memory allocation
- Modified advanced configuration file format to accept bytes per Transfer
## v1.05
### Added
- Topology output now includes NUMA node information
@@ -26,7 +26,7 @@ THE SOFTWARE.
#include <algorithm>
#include <random>
#include <time.h>
#define TB_VERSION "1.05"
#define TB_VERSION "1.06"
extern char const MemTypeStr[];
@@ -54,9 +54,6 @@ int main(int argc, char **argv)
EnvVars ev;
// Determine number of bytes to run per Transfer
// If a non-zero number of bytes is specified, use it
// Otherwise generate array of bytes values to execute over
std::vector<size_t> valuesOfN;
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2)
{
@@ -69,14 +66,20 @@
case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
}
}
PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN);
if (numBytesPerTransfer % 4)
{
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
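// For example (hypothetical invocation; binary and config file names assumed):
//   ./TransferBench config.cfg 64M
// runs each Transfer with 64*1024*1024 = 67108864 bytes after the suffix
// handling above.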
// Check for preset tests
// - Tests that sweep across possible sets of Transfers
if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
{
int numBlocksToUse = (argc > 3 ? atoi(argv[3]) : 4);
ev.configMode = CFG_SWEEP;
RunSweepPreset(ev, numBytesPerTransfer, !strcmp(argv[1], "rsweep"));
RunSweepPreset(ev, numBytesPerTransfer, numBlocksToUse, !strcmp(argv[1], "rsweep"));
exit(0);
}
// - Tests that benchmark peer-to-peer performance
@@ -129,7 +132,26 @@ int main(int argc, char **argv)
ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
if (transfers.empty()) continue;
ExecuteTransfers(ev, ++testNum, valuesOfN, transfers);
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, ++testNum, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(32, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, ++testNum, curr, transfers);
curr += delta;
}
}
}
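// For example (values derived from the loop above): with samplingFactor == 1,
// delta == N, so each power of two runs once (N = 256, 512, 1024, ..., 1<<27);
// with samplingFactor == 2, delta == N/2 adds one midpoint per octave
// (N = 256, 384, 512, 768, 1024, 1536, ...).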
}
fclose(fp);
@@ -137,28 +159,24 @@ int main(int argc, char **argv)
}
void ExecuteTransfers(EnvVars const& ev,
int testNum,
std::vector<size_t> const& valuesOfN,
std::vector<Transfer>& transfers)
int const testNum,
size_t const N,
std::vector<Transfer>& transfers,
bool verbose)
{
int const initOffset = ev.byteOffset / sizeof(float);
// Find the largest N to be used - memory will only be allocated once per set of Transfers
size_t maxN = valuesOfN[0];
for (auto N : valuesOfN)
maxN = std::max(maxN, N);
// Map transfers by executor
TransferMap transferMap;
for (Transfer const& transfer : transfers)
for (Transfer& transfer : transfers)
{
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.transfers.push_back(transfer);
executorInfo.transfers.push_back(&transfer);
}
// Loop over each executor and prepare GPU resources
std::vector<Transfer*> transferList;
std::map<int, Transfer*> transferList;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
@@ -167,18 +185,18 @@ void ExecuteTransfers(EnvVars const& ev,
exeInfo.totalBlocks = 0;
// Loop over each transfer this executor is involved in
for (Transfer& transfer : exeInfo.transfers)
for (Transfer* transfer : exeInfo.transfers)
{
// Get some aliases to transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
int const& blocksToUse = transfer.numBlocksToUse;
MemType const& exeMemType = transfer->exeMemType;
MemType const& srcMemType = transfer->srcMemType;
MemType const& dstMemType = transfer->dstMemType;
int const& blocksToUse = transfer->numBlocksToUse;
// Get potentially remapped device indices
int const srcIndex = RemappedIndex(transfer.srcIndex, srcMemType);
int const exeIndex = RemappedIndex(transfer.exeIndex, exeMemType);
int const dstIndex = RemappedIndex(transfer.dstIndex, dstMemType);
int const srcIndex = RemappedIndex(transfer->srcIndex, srcMemType);
int const exeIndex = RemappedIndex(transfer->exeIndex, exeMemType);
int const dstIndex = RemappedIndex(transfer->dstIndex, dstMemType);
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
if (exeMemType == MEM_GPU)
@@ -193,12 +211,13 @@ void ExecuteTransfers(EnvVars const& ev,
}
// Allocate source / destination memory based on each Transfer's size, type and device index
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem);
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem);
transfer->numBytesToCopy = (transfer->numBytes ? transfer->numBytes : N * sizeof(float));
AllocateMemory(srcMemType, srcIndex, transfer->numBytesToCopy + ev.byteOffset, (void**)&transfer->srcMem);
AllocateMemory(dstMemType, dstIndex, transfer->numBytesToCopy + ev.byteOffset, (void**)&transfer->dstMem);
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
exeInfo.totalBlocks += transfer.blockParam.size();
transferList.push_back(&transfer);
transfer->blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
exeInfo.totalBlocks += transfer->blockParam.size();
transferList[transfer->transferIndex] = transfer;
}
// Prepare per-threadblock parameters for GPU executors
@@ -227,209 +246,163 @@ void ExecuteTransfers(EnvVars const& ev,
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
exeInfo.transfers[i].blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
transferOffset += exeInfo.transfers[i].blockParam.size();
exeInfo.transfers[i]->blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
transferOffset += exeInfo.transfers[i]->blockParam.size();
}
}
}
// Loop over all the different number of bytes to use per Transfer
for (auto N : valuesOfN)
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
// Prepare input memory and block parameters for current N
for (auto& exeInfoPair : transferMap)
{
std::uniform_int_distribution<int> distribution(1,N);
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalBytes = 0;
// Prepare input memory and block parameters for current N
for (auto& exeInfoPair : transferMap)
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); ++i)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalBytes = 0;
// Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer* transfer = exeInfo.transfers[i];
transfer->PrepareBlockParams(ev, transfer->numBytesToCopy / sizeof(float));
exeInfo.totalBytes += transfer->numBytesToCopy;
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); ++i)
// Copy block parameters to GPU for GPU executors
if (transfer->exeMemType == MEM_GPU)
{
// Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer& transfer = exeInfo.transfers[i];
transfer.numBytes = ((ev.configMode == CFG_SWEEP && ev.sweepRandBytes) ? distribution(*ev.generator) : N) * sizeof(float);
transfer.PrepareBlockParams(ev, transfer.numBytes / sizeof(float));
exeInfo.totalBytes += transfer.numBytes;
// Copy block parameters to GPU for GPU executors
if (transfer.exeMemType == MEM_GPU)
{
HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset],
transfer.blockParam.data(),
transfer.blockParam.size() * sizeof(BlockParam),
hipMemcpyHostToDevice));
transferOffset += transfer.blockParam.size();
}
HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset],
transfer->blockParam.data(),
transfer->blockParam.size() * sizeof(BlockParam),
hipMemcpyHostToDevice));
transferOffset += transfer->blockParam.size();
}
}
}
// Launch kernels (warmup iterations are not counted)
double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; ; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Pause before starting first timed iteration in interactive mode
if (ev.useInteractive && iteration == 0)
{
printf("Hit <Enter> to continue: ");
scanf("%*c");
printf("\n");
}
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ?
1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i));
}
// Wait for all threads to finish
int const numTransfers = threads.size();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
}
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (iteration >= 0)
{
++numTimedIterations;
totalCpuTime += deltaSec;
}
}
// Launch kernels (warmup iterations are not counted)
double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; ; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
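// NOTE: a negative ev.numIterations acts as a time budget: iterations continue
// until the accumulated CPU time exceeds |numIterations| seconds.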
// Pause for interactive mode
if (ev.useInteractive)
// Pause before starting first timed iteration in interactive mode
if (verbose && ev.useInteractive && iteration == 0)
{
printf("Transfers complete. Hit <Enter> to continue: ");
printf("Hit <Enter> to continue: ");
scanf("%*c");
printf("\n");
}
// Validate that each transfer has transferred correctly
size_t totalBytesTransferred = 0;
int const numTransfers = transferList.size();
for (auto transfer : transferList)
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
for (auto& exeInfoPair : transferMap)
{
CheckOrFill(MODE_CHECK, transfer->numBytes / sizeof(float), ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset);
totalBytesTransferred += transfer->numBytes;
ExecutorInfo& exeInfo = exeInfoPair.second;
int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ?
1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i));
}
// Report timings
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime;
double maxGpuTime = 0;
// Wait for all threads to finish
int const numTransfers = threads.size();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
}
if (ev.useSingleStream)
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (iteration >= 0)
{
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
++numTimedIterations;
totalCpuTime += deltaSec;
}
}
// Compute total time for CPU executors
if (!IsGpuType(exeMemType))
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer.transferTime);
}
// Pause for interactive mode
if (verbose && ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
scanf("%*c");
printf("\n");
}
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
// Validate that each transfer has transferred correctly
size_t totalBytesTransferred = 0;
int const numTransfers = transferList.size();
for (auto transferPair : transferList)
{
Transfer* transfer = transferPair.second;
CheckOrFill(MODE_CHECK, transfer->numBytesToCopy / sizeof(float), ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset);
totalBytesTransferred += transfer->numBytesToCopy;
}
if (!ev.outputToCsv)
{
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms | %12lu bytes\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes);
}
// Report timings
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime;
double maxGpuTime = 0;
int totalCUs = 0;
for (auto const& transfer : exeInfo.transfers)
{
double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
totalCUs += transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse;
if (ev.useSingleStream)
{
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %12lu bytes | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer.transferIndex,
transferBandwidthGbs,
transferDurationMsec,
transfer.numBytes,
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
MemTypeStr[transfer.dstMemType], transfer.dstIndex);
// Compute total time for CPU executors
if (!IsGpuType(exeMemType))
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
}
else
{
printf("%d,%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%s,%p,%p\n",
testNum, transfer.transferIndex, transfer.numBytes,
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
MemTypeStr[transfer.dstMemType], transfer.dstIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
transferBandwidthGbs, transferDurationMsec,
GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.srcMemType, transfer.srcIndex).c_str(),
GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex).c_str(),
transfer.srcMem + initOffset, transfer.dstMem + initOffset);
}
}
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
if (ev.outputToCsv)
{
printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL,ALL,ALL\n",
testNum, totalBytesTransferred,
MemTypeStr[exeMemType], exeIndex, totalCUs,
exeBandwidthGbs, exeDurationMsec);
}
if (verbose && !ev.outputToCsv)
{
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms | %12lu bytes\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes);
}
}
else
{
for (auto const& transfer : transferList)
int totalCUs = 0;
for (auto const& transfer : exeInfo.transfers)
{
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (transfer->numBytes / 1.0E9) / transferDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
totalCUs += transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse;
if (!verbose) continue;
if (!ev.outputToCsv)
{
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %12lu bytes | %-16s\n",
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %12lu bytes | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer->transferIndex,
transferBandwidthGbs,
transferDurationMsec,
transfer->numBytesToCopy,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transferBandwidthGbs, transferDurationMsec,
transfer->numBytes,
GetTransferDesc(*transfer).c_str());
MemTypeStr[transfer->dstMemType], transfer->dstIndex);
}
else
{
printf("%d,%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%s,%p,%p\n",
testNum, transfer->transferIndex, transfer->numBytes,
testNum, transfer->transferIndex, transfer->numBytesToCopy,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
@@ -440,9 +413,56 @@ void ExecuteTransfers(EnvVars const& ev,
transfer->srcMem + initOffset, transfer->dstMem + initOffset);
}
}
if (verbose && ev.outputToCsv)
{
printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL,ALL,ALL\n",
testNum, totalBytesTransferred,
MemTypeStr[exeMemType], exeIndex, totalCUs,
exeBandwidthGbs, exeDurationMsec);
}
}
}
else
{
for (auto const& transferPair : transferList)
{
Transfer* transfer = transferPair.second;
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (transfer->numBytesToCopy / 1.0E9) / transferDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
if (!verbose) continue;
if (!ev.outputToCsv)
{
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %12lu bytes | %-16s\n",
transfer->transferIndex,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transferBandwidthGbs, transferDurationMsec,
transfer->numBytesToCopy,
GetTransferDesc(*transfer).c_str());
}
else
{
printf("%d,%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%s,%p,%p\n",
testNum, transfer->transferIndex, transfer->numBytesToCopy,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
transferBandwidthGbs, transferDurationMsec,
GetDesc(transfer->exeMemType, transfer->exeIndex, transfer->srcMemType, transfer->srcIndex).c_str(),
GetDesc(transfer->exeMemType, transfer->exeIndex, transfer->dstMemType, transfer->dstIndex).c_str(),
transfer->srcMem + initOffset, transfer->dstMem + initOffset);
}
}
}
// Display aggregate statistics
if (verbose)
{
if (!ev.outputToCsv)
{
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n",
@@ -462,14 +482,14 @@ void ExecuteTransfers(EnvVars const& ev,
for (auto& transfer : exeInfo.transfers)
{
// Get some aliases to Transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
MemType const& exeMemType = transfer->exeMemType;
MemType const& srcMemType = transfer->srcMemType;
MemType const& dstMemType = transfer->dstMemType;
// Free source / destination memory based on type / device index
DeallocateMemory(srcMemType, transfer.srcMem);
DeallocateMemory(dstMemType, transfer.dstMem);
transfer.blockParam.clear();
DeallocateMemory(srcMemType, transfer->srcMem, N * sizeof(float) + ev.byteOffset);
DeallocateMemory(dstMemType, transfer->dstMem, N * sizeof(float) + ev.byteOffset);
transfer->blockParam.clear();
}
MemType const exeMemType = exeInfoPair.first.first;
@@ -510,7 +530,7 @@ void DisplayUsage(char const* cmdName)
printf(" g2g{_rr} - All GPU/GPU pairs benchmark {with remote reads}\n");
printf(" sweep - Sweep across possible sets of Transfers\n");
printf(" rsweep - Randomly sweep across possible sets of Transfers\n");
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
printf(" - 3rd optional argument used as # of CUs to use (all by default for p2p / 4 for sweep)\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
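// For example (hypothetical invocation; binary name assumed):
//   ./TransferBench sweep 64M 8
// runs the sweep preset with 64MB per Transfer using 8 CUs per GPU-executed Transfer.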
@@ -603,9 +623,9 @@ void DisplayTopology(bool const outputToCsv)
{
int numaDist = numa_distance(i,j);
if (outputToCsv)
printf("%d,", numaDist);
printf("%d,", numaDist);
else
printf(" %6d |", numaDist);
printf(" %6d |", numaDist);
}
int numCpus = 0;
@@ -622,8 +642,8 @@ void DisplayTopology(bool const outputToCsv)
if (GetClosestNumaNode(RemappedIndex(j, MEM_GPU)) == i)
{
if (isFirst) isFirst = false;
else printf(",");
printf("%d", j);
else printf(",");
printf("%d", j);
}
}
printf("\n");
@@ -687,40 +707,6 @@ void DisplayTopology(bool const outputToCsv)
}
}
void PopulateTestSizes(size_t const numBytesPerTransfer,
int const samplingFactor,
std::vector<size_t>& valuesOfN)
{
valuesOfN.clear();
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
if (numBytesPerTransfer % 4)
{
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
size_t N = numBytesPerTransfer / sizeof(float);
valuesOfN.push_back(N);
}
else
{
// Otherwise generate a range of values
// (Powers of 2, with samplingFactor samples between successive powers of 2)
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(32, N / samplingFactor);
int curr = N;
while (curr < N * 2)
{
valuesOfN.push_back(curr);
curr += delta;
}
}
}
}
void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex)
{
char typeChar;
@@ -733,8 +719,8 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
switch (typeChar)
{
case 'C': case 'c': case 'B': case 'b':
*memType = (typeChar == 'C' || typeChar == 'c') ? MEM_CPU : MEM_CPU_FINE;
case 'C': case 'c': case 'B': case 'b': case 'U': case 'u':
*memType = (typeChar == 'C' || typeChar == 'c') ? MEM_CPU : ((typeChar == 'B' || typeChar == 'b') ? MEM_CPU_FINE : MEM_CPU_UNPINNED);
if (*memIndex < 0 || *memIndex >= numCpus)
{
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, *memIndex);
@@ -750,7 +736,7 @@
}
break;
default:
printf("[ERROR] Unrecognized memory type %s. Expecting either 'B', 'C' or 'G' or 'F'\n", token.c_str());
printf("[ERROR] Unrecognized memory type %s. Expecting either 'B','C','U','G' or 'F'\n", token.c_str());
exit(1);
}
}
@@ -775,11 +761,11 @@ void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>&
// If numTransfers < 0, read quads (srcMem, exeMem, dstMem, #CUs)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const perTransferCUs = (numTransfers < 0);
bool const advancedMode = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numBlocksToUse;
if (!perTransferCUs)
if (!advancedMode)
{
iss >> numBlocksToUse;
if (numBlocksToUse <= 0 || iss.fail())
@@ -789,25 +775,50 @@ void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>&
}
}
size_t numBytes = 0;
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem;
if (perTransferCUs) iss >> numBlocksToUse;
if (iss.fail())
transfer.numBytes = 0;
transfer.numBytesToCopy = 0;
if (!advancedMode)
{
if (perTransferCUs)
printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
else
printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
exit(1);
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1);
exit(1);
}
}
else
{
std::string numBytesToken;
iss >> srcMem >> exeMem >> dstMem >> numBlocksToUse >> numBytesToken;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1);
exit(1);
}
if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1)
{
printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1);
exit(1);
}
char units = numBytesToken.back();
switch (units)
{
case 'K': case 'k': numBytes *= 1024; break;
case 'M': case 'm': numBytes *= 1024*1024; break;
case 'G': case 'g': numBytes *= 1024*1024*1024; break;
}
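// e.g. a numBytesToken of "2M" yields numBytes = 2*1024*1024, while "0"
// defers to the command-line size (see the numBytesToCopy fallback above)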
}
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
transfer.numBlocksToUse = numBlocksToUse;
transfer.numBytes = numBytes;
transfers.push_back(transfer);
}
}
@@ -839,22 +850,14 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
exit(1);
}
if (memType == MEM_CPU || memType == MEM_CPU_FINE)
if (IsCpuType(memType))
{
// Set numa policy prior to call to hipHostMalloc
// NOTE: It may be possible that the actual configured numa nodes do not start at 0
// so remapping may be necessary
// Find the 'deviceId'-th available NUMA node
int numaIdx = 0;
for (int i = 0; i <= devIndex; i++)
while (!numa_bitmask_isbitset(numa_get_mems_allowed(), numaIdx))
++numaIdx;
unsigned long nodemask = (1ULL << numaIdx);
unsigned long nodemask = (1ULL << devIndex);
long retCode = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
if (retCode)
{
printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", numaIdx);
printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", devIndex);
exit(1);
}
@@ -864,13 +867,18 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
{
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
}
else
else if (memType == MEM_CPU)
{
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
}
else if (memType == MEM_CPU_UNPINNED)
{
*memPtr = numa_alloc_onnode(numBytes, devIndex);
}
// Check that the allocated pages are actually on the correct NUMA node
CheckPages((char*)*memPtr, numBytes, numaIdx);
memset(*memPtr, 0, numBytes);
CheckPages((char*)*memPtr, numBytes, devIndex);
// Reset to default numa mem policy
retCode = set_mempolicy(MPOL_DEFAULT, NULL, 8);
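// A minimal standalone sketch of the bind/allocate/reset pattern used above
// (assumes libnuma is available; the helper name is illustrative, not part of
// TransferBench):
//   #include <numa.h>    // numa_alloc_onnode, numa_free
//   #include <numaif.h>  // set_mempolicy, MPOL_BIND, MPOL_DEFAULT
//   void* AllocOnNumaNode(size_t numBytes, int node)
//   {
//     unsigned long nodemask = (1ULL << node);
//     if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8)) return NULL;
//     void* ptr = numa_alloc_onnode(numBytes, node);  // pages placed on 'node'
//     set_mempolicy(MPOL_DEFAULT, NULL, 8);           // restore default policy
//     return ptr;  // release later with numa_free(ptr, numBytes)
//   }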
@@ -898,12 +906,16 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
}
}
void DeallocateMemory(MemType memType, void* memPtr)
void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
{
if (memType == MEM_CPU || memType == MEM_CPU_FINE)
{
HIP_CALL(hipHostFree(memPtr));
}
else if (memType == MEM_CPU_UNPINNED)
{
numa_free(memPtr, bytes);
}
else if (memType == MEM_GPU || memType == MEM_GPU_FINE)
{
HIP_CALL(hipFree(memPtr));
@@ -1024,20 +1036,16 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
std::string GetDesc(MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex)
{
if (srcMemType == MEM_CPU || srcMemType == MEM_CPU_FINE)
if (IsCpuType(srcMemType))
{
if (dstMemType == MEM_CPU || dstMemType == MEM_CPU_FINE)
return (srcIndex == dstIndex) ? "LOCAL" : "NUMA";
else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
return "PCIE";
else
goto error;
if (IsCpuType(dstMemType)) return (srcIndex == dstIndex) ? "LOCAL" : "NUMA";
if (IsGpuType(dstMemType)) return "PCIE";
goto error;
}
else if (srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE)
if (IsGpuType(srcMemType))
{
if (dstMemType == MEM_CPU || dstMemType == MEM_CPU_FINE)
return "PCIE";
else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
if (IsCpuType(dstMemType)) return "PCIE";
if (IsGpuType(dstMemType))
{
if (srcIndex == dstIndex) return "LOCAL";
else
@@ -1049,8 +1057,6 @@ std::string GetDesc(MemType srcMemType, int srcIndex,
return GetLinkTypeDesc(linkType, hopCount);
}
}
else
goto error;
}
error:
printf("[ERROR] Unrecognized memory type\n");
@@ -1066,13 +1072,13 @@ std::string GetTransferDesc(Transfer const& transfer)
void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{
Transfer& transfer = exeInfo.transfers[transferIdx];
Transfer* transfer = exeInfo.transfers[transferIdx];
// GPU execution agent
if (transfer.exeMemType == MEM_GPU)
if (transfer->exeMemType == MEM_GPU)
{
// Switch to executing GPU
int const exeIndex = RemappedIndex(transfer.exeIndex, MEM_GPU);
int const exeIndex = RemappedIndex(transfer->exeIndex, MEM_GPU);
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
@@ -1088,24 +1094,24 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Execute hipMemset / hipMemcpy
if (ev.useMemset)
HIP_CALL(hipMemsetAsync(transfer.dstMem + initOffset, 42, transfer.numBytes, stream));
HIP_CALL(hipMemsetAsync(transfer->dstMem + initOffset, 42, transfer->numBytesToCopy, stream));
else
HIP_CALL(hipMemcpyAsync(transfer.dstMem + initOffset,
transfer.srcMem + initOffset,
transfer.numBytes, hipMemcpyDefault,
HIP_CALL(hipMemcpyAsync(transfer->dstMem + initOffset,
transfer->srcMem + initOffset,
transfer->numBytesToCopy, hipMemcpyDefault,
stream));
// Record stop event
HIP_CALL(hipEventRecord(stopEvent, stream));
}
else
{
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalBlocks : transfer.numBlocksToUse;
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalBlocks : transfer->numBlocksToUse;
hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
dim3(numBlocksToRun, 1, 1),
dim3(BLOCKSIZE, 1, 1),
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer.blockParamGpuPtr);
0, transfer->blockParamGpuPtr);
}
// Synchronize per iteration, unless in single sync mode, in which case
@@ -1120,33 +1126,33 @@ void RunTransfer(EnvVars const& ev, int const iteration,
if (ev.useSingleStream)
{
for (Transfer& currTransfer : exeInfo.transfers)
for (Transfer* currTransfer : exeInfo.transfers)
{
long long minStartCycle = currTransfer.blockParamGpuPtr[0].startCycle;
long long maxStopCycle = currTransfer.blockParamGpuPtr[0].stopCycle;
for (int i = 1; i < currTransfer.numBlocksToUse; i++)
long long minStartCycle = currTransfer->blockParamGpuPtr[0].startCycle;
long long maxStopCycle = currTransfer->blockParamGpuPtr[0].stopCycle;
for (int i = 1; i < currTransfer->numBlocksToUse; i++)
{
minStartCycle = std::min(minStartCycle, currTransfer.blockParamGpuPtr[i].startCycle);
maxStopCycle = std::max(maxStopCycle, currTransfer.blockParamGpuPtr[i].stopCycle);
minStartCycle = std::min(minStartCycle, currTransfer->blockParamGpuPtr[i].startCycle);
maxStopCycle = std::max(maxStopCycle, currTransfer->blockParamGpuPtr[i].stopCycle);
}
int const wallClockRate = GetWallClockRate(exeIndex);
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer.transferTime += iterationTimeMs;
currTransfer->transferTime += iterationTimeMs;
}
exeInfo.totalTime += gpuDeltaMsec;
}
else
{
transfer.transferTime += gpuDeltaMsec;
transfer->transferTime += gpuDeltaMsec;
}
}
}
else if (transfer.exeMemType == MEM_CPU) // CPU execution agent
else if (transfer->exeMemType == MEM_CPU) // CPU execution agent
{
// Force this thread and all child threads onto correct NUMA node
if (numa_run_on_node(transfer.exeIndex))
if (numa_run_on_node(transfer->exeIndex))
{
printf("[ERROR] Unable to set CPU to NUMA node %d\n", transfer.exeIndex);
printf("[ERROR] Unable to set CPU to NUMA node %d\n", transfer->exeIndex);
exit(1);
}
@@ -1156,7 +1162,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Launch child-threads to perform memcopies
for (int i = 0; i < ev.numCpuPerTransfer; i++)
childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(transfer.blockParam[i])));
childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(transfer->blockParam[i])));
// Wait for child-threads to finish
for (int i = 0; i < ev.numCpuPerTransfer; i++)
@@ -1166,7 +1172,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Record time if not a warmup iteration
if (iteration >= 0)
transfer.transferTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
transfer->transferTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
}
}
@@ -1272,109 +1278,40 @@ double GetPeakBandwidth(EnvVars const& ev,
int const initOffset = ev.byteOffset / sizeof(float);
// Prepare Transfers
std::vector<Transfer*> transfers;
ExecutorInfo exeInfo[2];
for (int i = 0; i < 2; i++)
{
exeInfo[i].transfers.resize(1);
exeInfo[i].streams.resize(1);
exeInfo[i].startEvents.resize(1);
exeInfo[i].stopEvents.resize(1);
transfers.push_back(&exeInfo[i].transfers[0]);
}
transfers[0]->srcMemType = transfers[1]->dstMemType = srcMemType;
transfers[0]->dstMemType = transfers[1]->srcMemType = dstMemType;
transfers[0]->srcIndex = transfers[1]->dstIndex = RemappedIndex(srcIndex, srcMemType);
transfers[0]->dstIndex = transfers[1]->srcIndex = RemappedIndex(dstIndex, dstMemType);
std::vector<Transfer> transfers(2);
transfers[0].srcMemType = transfers[1].dstMemType = srcMemType;
transfers[0].dstMemType = transfers[1].srcMemType = dstMemType;
transfers[0].srcIndex = transfers[1].dstIndex = RemappedIndex(srcIndex, srcMemType);
transfers[0].dstIndex = transfers[1].srcIndex = RemappedIndex(dstIndex, dstMemType);
transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float);
transfers[0].numBlocksToUse = transfers[1].numBlocksToUse = numBlocksToUse;
// Either perform (local read + remote write), or (remote read + local write)
transfers[0]->exeMemType = (readMode == 0 ? srcMemType : dstMemType);
transfers[1]->exeMemType = (readMode == 0 ? dstMemType : srcMemType);
transfers[0]->exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0]->exeMemType);
transfers[1]->exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1]->exeMemType);
transfers[0].exeMemType = (readMode == 0 ? srcMemType : dstMemType);
transfers[1].exeMemType = (readMode == 0 ? dstMemType : srcMemType);
transfers[0].exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0].exeMemType);
transfers[1].exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1].exeMemType);
transfers.resize(isBidirectional + 1);
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{
if (transfers[i]->exeMemType == MEM_CPU && ev.numCpusPerNuma[transfers[i]->exeIndex] == 0)
if (transfers[i].exeMemType == MEM_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
return 0;
}
for (int i = 0; i <= isBidirectional; i++)
{
AllocateMemory(transfers[i]->srcMemType, transfers[i]->srcIndex,
N * sizeof(float) + ev.byteOffset, (void**)&transfers[i]->srcMem);
AllocateMemory(transfers[i]->dstMemType, transfers[i]->dstIndex,
N * sizeof(float) + ev.byteOffset, (void**)&transfers[i]->dstMem);
// Prepare block parameters on CPU
transfers[i]->numBlocksToUse = (transfers[i]->exeMemType == MEM_GPU) ? numBlocksToUse : ev.numCpuPerTransfer;
transfers[i]->blockParam.resize(transfers[i]->numBlocksToUse);
transfers[i]->PrepareBlockParams(ev, N);
if (transfers[i]->exeMemType == MEM_GPU)
{
// Copy block parameters onto GPU
AllocateMemory(MEM_GPU, transfers[i]->exeIndex, numBlocksToUse * sizeof(BlockParam),
(void **)&transfers[i]->blockParamGpuPtr);
HIP_CALL(hipMemcpy(transfers[i]->blockParamGpuPtr,
transfers[i]->blockParam.data(),
numBlocksToUse * sizeof(BlockParam),
hipMemcpyHostToDevice));
// Prepare GPU resources
HIP_CALL(hipSetDevice(transfers[i]->exeIndex));
HIP_CALL(hipStreamCreate(&exeInfo[i].streams[0]));
HIP_CALL(hipEventCreate(&exeInfo[i].startEvents[0]));
HIP_CALL(hipEventCreate(&exeInfo[i].stopEvents[0]));
}
}
std::stack<std::thread> threads;
// Perform iteration
for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++)
{
// Perform timed iterations
for (int i = 0; i <= isBidirectional; i++)
threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo[i]), 0));
// Wait for all threads to finish
for (int i = 0; i <= isBidirectional; i++)
{
threads.top().join();
threads.pop();
}
}
// Validate that each Transfer has transferred correctly
for (int i = 0; i <= isBidirectional; i++)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfers[i]->dstMem + initOffset);
ExecuteTransfers(ev, 0, N, transfers, false);
// Collect aggregate bandwidth
double totalBandwidth = 0;
for (int i = 0; i <= isBidirectional; i++)
{
double transferDurationMsec = transfers[i]->transferTime / (1.0 * ev.numIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
double transferDurationMsec = transfers[i].transferTime / (1.0 * ev.numIterations);
double transferBandwidthGbs = (transfers[i].numBytesToCopy / 1.0E9) / transferDurationMsec * 1000.0f;
totalBandwidth += transferBandwidthGbs;
}
// Release GPU memory
for (int i = 0; i <= isBidirectional; i++)
{
DeallocateMemory(transfers[i]->srcMemType, transfers[i]->srcMem);
DeallocateMemory(transfers[i]->dstMemType, transfers[i]->dstMem);
if (transfers[i]->exeMemType == MEM_GPU)
{
DeallocateMemory(MEM_GPU, transfers[i]->blockParamGpuPtr);
HIP_CALL(hipStreamDestroy(exeInfo[i].streams[0]));
HIP_CALL(hipEventDestroy(exeInfo[i].startEvents[0]));
HIP_CALL(hipEventDestroy(exeInfo[i].stopEvents[0]));
}
}
return totalBandwidth;
}
@@ -1438,10 +1375,9 @@ int GetWallClockRate(int deviceId)
return wallClockPerDeviceMhz[deviceId];
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom)
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numBlocksToUse, bool const isRandom)
{
ev.DisplaySweepEnvVars();
std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
std::vector<std::pair<MemType, int>> exeList;
@@ -1598,6 +1534,17 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
int numTestsRun = 0;
int M = ev.sweepMin;
std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
std::uniform_int_distribution<int> distribution(ev.sweepMin, maxParallelTransfers);
// Log sweep to configuration file
FILE *fp = fopen("lastSweep.cfg", "w");
if (!fp)
{
printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
exit(1);
}
// Create bitmask of numPossible triplets, of which M will be chosen
std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
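// A sketch of the idiom used here (assuming the deterministic sweep advances
// combinations with std::prev_permutation): a string of M ones followed by
// zeros enumerates every M-element subset of the possible triplets:
//   std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
//   do {
//     for (int i = 0; i < numPossible; i++)
//       if (bitmask[i]) { /* triplet i is part of this Test */ }
//   } while (std::prev_permutation(bitmask.begin(), bitmask.end()));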
auto cpuStart = std::chrono::high_resolution_clock::now();
@@ -1607,8 +1554,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
{
// Pick random number of simultaneous transfers to execute
// NOTE: This currently skews distribution due to some #s having more possibilities than others
M = ((maxParallelTransfers > ev.sweepMin) ? (rand() % (maxParallelTransfers - ev.sweepMin)) : 0)
+ ev.sweepMin;
M = distribution(*ev.generator);
// Generate a random bitmask
for (int i = 0; i < numPossible; i++)
@@ -1630,13 +1576,15 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
transfer.exeIndex = possibleTransfers[value].exeIndex;
transfer.dstMemType = possibleTransfers[value].dstMemType;
transfer.dstIndex = possibleTransfers[value].dstIndex;
transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? 4 : ev.numCpuPerTransfer;
transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? numBlocksToUse : ev.numCpuPerTransfer;
transfer.transferIndex = transfers.size();
transfer.numBytes = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0;
transfers.push_back(transfer);
}
}
ExecuteTransfers(ev, ++numTestsRun, valuesOfN, transfers);
LogTransfers(fp, ++numTestsRun, transfers);
ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers);
// Check for test limit
if (numTestsRun == ev.sweepTestLimit)
@@ -1668,4 +1616,22 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
bitmask[i] = (i < M) ? 1 : 0;
}
}
fclose(fp);
}
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{
fprintf(fp, "# Test %d\n", testNum);
fprintf(fp, "%d", -1 * (int)transfers.size());
for (auto const& transfer : transfers)
{
fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)",
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
MemTypeStr[transfer.dstMemType], transfer.dstIndex,
transfer.numBlocksToUse,
transfer.numBytes);
}
fprintf(fp, "\n");
fflush(fp);
}
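// For illustration (hypothetical values): a test of two GPU-executed Transfers
// is logged to lastSweep.cfg in the advanced configuration format, e.g.:
//   # Test 1
//   -2 (G0->G0->G1 4 1048576) (G1->G1->G0 2 2097152)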
@@ -55,18 +55,23 @@ size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferre
// Different src/dst memory types supported
typedef enum
{
MEM_CPU = 0, // Coarse-grained pinned CPU memory
MEM_GPU = 1, // Coarse-grained global GPU memory
MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory
MEM_GPU_FINE = 3 // Fine-grained global GPU memory
MEM_CPU = 0, // Coarse-grained pinned CPU memory
MEM_GPU = 1, // Coarse-grained global GPU memory
MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory
MEM_GPU_FINE = 3, // Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4 // Unpinned CPU memory
} MemType;
bool IsGpuType(MemType m)
{
return (m == MEM_GPU || m == MEM_GPU_FINE);
}
bool IsCpuType(MemType m)
{
return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED);
}
char const MemTypeStr[5] = "CGBF";
char const MemTypeStr[6] = "CGBFU";
MemType inline CharToMemType(char const c)
{
@@ -76,6 +81,7 @@ MemType inline CharToMemType(char const c)
case 'G': return MEM_GPU;
case 'B': return MEM_CPU_FINE;
case 'F': return MEM_GPU_FINE;
case 'U': return MEM_CPU_UNPINNED;
default:
printf("[ERROR] Unexpected mem type (%c)\n", c);
exit(1);
@@ -112,6 +118,7 @@ struct Transfer
int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Transfer
size_t numBytes; // Number of bytes to Transfer
size_t numBytesToCopy; // Number of bytes to copy
// Memory
float* srcMem; // Source memory
@@ -132,7 +139,7 @@ typedef std::pair<MemType, int> Executor;
struct ExecutorInfo
{
std::vector<Transfer> transfers; // Transfers to execute
std::vector<Transfer*> transfers; // Transfers to execute
size_t totalBytes; // Total bytes this executor transfers
// For GPU-Executors
@@ -164,17 +171,17 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
void ParseTransfers(char* line, int numCpus, int numGpus,
std::vector<Transfer>& transfers);
void ExecuteTransfers(EnvVars const& ev, int testNum, std::vector<size_t> const& valuesOfN,
std::vector<Transfer>& transfers);
void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
std::vector<Transfer>& transfers, bool verbose = true);
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr);
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numBlocksToUse, bool const isRandom);
// Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev,
@@ -193,3 +200,4 @@ std::string GetDesc(MemType srcMemType, int srcIndex,
std::string GetTransferDesc(Transfer const& transfer);
int RemappedIndex(int const origIdx, MemType const memType);
int GetWallClockRate(int deviceId);
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
# ConfigFile Format:
# ==================
# A Transfer is defined as a uni-directional transfer from src memory location to dst memory location
# A Transfer is defined as a uni-directional copy from src memory location to dst memory location
# executed by either CPU or GPU
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
# There are two ways to specify the configuration file:
# There are two ways to specify a Test:
# 1) Basic
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Transfer
@@ -13,9 +13,9 @@
# #Transfers #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# 2) Advanced
# The advanced specification allows a different number of threadblocks/CUs to be used per GPU-executed Transfer
# A negative number of Transfers is specified, followed by quadruples describing each Transfer
# -#Transfers (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
# A negative number of Transfers is specified, followed by quintuplets describing each Transfer
# A non-zero number of bytes overrides the command-line Transfer size for that Transfer
# -#Transfers (srcMem1->Executor1->dstMem1 #CUs1 Bytes1) ... (srcMemL->ExecutorL->dstMemL #CUsL BytesL)
# Argument Details:
# #Transfers: Number of Transfers to be run in parallel
@@ -25,23 +25,29 @@
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory location (Where the data is to be written to)
# bytesL : Number of bytes to copy (0 means use command-line specified size)
# Must be a multiple of 4 and may be suffixed with 'K', 'M', or 'G'
#
# Memory locations are specified by a character indicating memory type,
# followed by device index (0-indexed)
# Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - U: Unpinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# Examples:
# 1 4 (G0->G0->G1) Single Transfer using 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Single Transfer using 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Transfers in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Transfers in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs
# 1 4 (G0->G0->G1) Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Copes from GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1Mb from GPU0 to GPU1 with 4 CUs, and 2Mb from GPU1 to GPU0 with 2 CUs
# Round brackets and arrows '->' may be included for human clarity, but are ignored and unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output
# Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
1 4 (G0->G0->G1)
# Copies 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 8 CUs
-2 (G0->G0->G1 4 1M) (G1->G1->G0 8 2M)