Commit ddb6508f authored by Gilbert Lee's avatar Gilbert Lee
Browse files

TransferBench v1.02

parent 07ac2dce
# Changelog for TransferBench
## v1.02
### Added
- Setting NUM_ITERATIONS to a negative number indicates to run for -NUM_ITERATIONS seconds per Test
### Changed
- Copies are now referred to as Transfers instead of Links
- Re-ordering how env vars are displayed (alphabetically now)
### Removed
- Combined timing is now always on for kernel-based GPU copies. COMBINED_TIMING env var has been removed
- Use single sync is no longer supported, to facilitate variable iterations. USE_SINGLE_SYNC env var has been removed
## v1.01
### Added
- Adding USE_SINGLE_STREAM feature
......
......@@ -25,32 +25,32 @@ THE SOFTWARE.
#include <algorithm>
#define TB_VERSION "1.02"
// This class manages environment variable that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_LINK = 4;
int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
// Environment variables
int useHipCall; // Use hipMemcpy/hipMemset instead of custom shader kernels
int useMemset; // Perform a memset instead of a copy (ignores source memory)
int useSingleSync; // Perform synchronization only once after all iterations instead of per iteration
int useInteractive; // Pause for user-input before starting transfer loop
int combineTiming; // Combines the timing with kernel launch
int outputToCsv; // Output in CSV format
int byteOffset; // Byte-offset for memory allocations
int numWarmups; // Number of un-timed warmup iterations to perform
int numIterations; // Number of timed iterations to perform
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int numCpuPerLink; // Number of CPU child threads to use per CPU link
int sharedMemBytes; // Amount of shared memory to use per threadblock
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int useSingleStream; // Use a single stream per device instead of per Link. Can not be used with USE_HIP_CALL
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int sharedMemBytes; // Amount of shared memory to use per threadblock
int useHipCall; // Use hipMemcpy/hipMemset instead of custom shader kernels
int useInteractive; // Pause for user-input before starting transfer loop
int useMemset; // Perform a memset instead of a copy (ignores source memory)
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int useSingleStream; // Use a single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL
std::vector<float> fillPattern; // Pattern of floats used to fill source data
......@@ -61,21 +61,19 @@ public:
hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
useMemset = GetEnvVar("USE_MEMSET" , 0);
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 1);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
combineTiming = GetEnvVar("COMBINE_TIMING" , 0);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM", 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
useMemset = GetEnvVar("USE_MEMSET" , 0);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
......@@ -146,19 +144,14 @@ public:
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
exit(1);
}
if (numIterations <= 0)
{
printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n");
exit(1);
}
if (samplingFactor < 1)
{
printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
exit(1);
}
if (numCpuPerLink < 1)
if (numCpuPerTransfer < 1)
{
printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
printf("[ERROR] NUM_CPU_PER_TRANSFER must be greater or equal to 1\n");
exit(1);
}
if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
......@@ -183,22 +176,20 @@ public:
{
printf("Environment variables:\n");
printf("======================\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per link. Cannot be used with USE_HIP_CALL\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n");
}
// Display env var settings
......@@ -206,45 +197,41 @@ public:
{
if (!outputToCsv)
{
printf("Run configuration\n");
printf("Run configuration (TransferBench v%s)\n", TB_VERSION);
printf("=====================================================\n");
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Running %d %s per topology\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
useHipCall ? "HIP functions" : "custom kernels");
printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
useMemset ? "memset" : "memcopy");
if (useHipCall && !useMemset)
{
char* env = getenv("HSA_ENABLE_SDMA");
printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
}
printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync,
useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
useInteractive ? "interactive" : "non-interactive");
printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
{
printf("Pattern: %s", getenv("FILL_PATTERN"));
}
else
{
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
}
printf("\n");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM", useSingleStream, (useSingleStream ? "device" : "Link"));
printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
useMemset ? "memset" : "memcopy");
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("\n");
}
};
......
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
......@@ -45,25 +45,25 @@ int main(int argc, char **argv)
// Collect environment variables / display current run configuration
EnvVars ev;
// Determine number of bytes to run per Link
// Determine number of bytes to run per Transfer
// If a non-zero number of bytes is specified, use it
// Otherwise generate array of bytes values to execute over
std::vector<size_t> valuesOfN;
size_t numBytesPerLink = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_LINK;
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2)
{
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
switch (units)
{
case 'K': case 'k': numBytesPerLink *= 1024; break;
case 'M': case 'm': numBytesPerLink *= 1024*1024; break;
case 'G': case 'g': numBytesPerLink *= 1024*1024*1024; break;
case 'K': case 'k': numBytesPerTransfer *= 1024; break;
case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
}
}
PopulateTestSizes(numBytesPerLink, ev.samplingFactor, valuesOfN);
PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN);
// Find the largest N to be used - memory will only be allocated once per link config
// Find the largest N to be used - memory will only be allocated once per set of simultaneous Transfers
size_t maxN = valuesOfN[0];
for (auto N : valuesOfN)
maxN = std::max(maxN, N);
......@@ -84,15 +84,15 @@ int main(int argc, char **argv)
int skipCpu = (!strcmp(argv[1], "g2g" ) || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
// Execute peer to peer benchmark mode
RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse, readMode, skipCpu);
RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float), numBlocksToUse, readMode, skipCpu);
exit(0);
}
// Check that Link configuration file can be opened
// Check that Transfer configuration file can be opened
FILE* fp = fopen(argv[1], "r");
if (!fp)
{
printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]);
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
......@@ -112,17 +112,17 @@ int main(int argc, char **argv)
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
// Track unique pair of links that get used
// Track unique pair of transfers that get used
std::set<std::pair<int, int>> peerAccessTracker;
// Print CSV header
if (ev.outputToCsv)
{
printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms),"
"LinkDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n");
"TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n");
}
// Loop over each line in the Link configuration file
// Loop over each line in the Transfer configuration file
int testNum = 0;
char line[2048];
while(fgets(line, 2048, fp))
......@@ -130,33 +130,33 @@ int main(int argc, char **argv)
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
// Parse links from configuration file
LinkMap linkMap;
ParseLinks(line, numCpuDevices, numGpuDevices, linkMap);
if (linkMap.size() == 0) continue;
// Parse transfers from configuration file
TransferMap transferMap;
ParseTransfers(line, numCpuDevices, numGpuDevices, transferMap);
if (transferMap.size() == 0) continue;
testNum++;
// Prepare (maximum) memory for each link
std::vector<Link*> linkList;
for (auto& exeInfoPair : linkMap)
// Prepare (maximum) memory for each transfer
std::vector<Transfer*> transferList;
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalTime = 0.0;
exeInfo.totalBlocks = 0;
for (Link& link : exeInfo.links)
for (Transfer& transfer : exeInfo.transfers)
{
// Get some aliases to link variables
MemType const& exeMemType = link.exeMemType;
MemType const& srcMemType = link.srcMemType;
MemType const& dstMemType = link.dstMemType;
int const& blocksToUse = link.numBlocksToUse;
// Get some aliases to transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
int const& blocksToUse = transfer.numBlocksToUse;
// Get potentially remapped device indices
int const srcIndex = RemappedIndex(link.srcIndex, srcMemType);
int const exeIndex = RemappedIndex(link.exeIndex, exeMemType);
int const dstIndex = RemappedIndex(link.dstIndex, dstMemType);
int const srcIndex = RemappedIndex(transfer.srcIndex, srcMemType);
int const exeIndex = RemappedIndex(transfer.exeIndex, exeMemType);
int const dstIndex = RemappedIndex(transfer.dstIndex, dstMemType);
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
if (exeMemType == MEM_GPU)
......@@ -185,11 +185,11 @@ int main(int argc, char **argv)
}
// Allocate (maximum) source / destination memory based on type / device index
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&link.srcMem);
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&link.dstMem);
link.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerLink : blocksToUse);
exeInfo.totalBlocks += link.blockParam.size();
linkList.push_back(&link);
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem);
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem);
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
exeInfo.totalBlocks += transfer.blockParam.size();
transferList.push_back(&transfer);
}
// Prepare GPU resources for GPU executors
......@@ -200,11 +200,11 @@ int main(int argc, char **argv)
AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam),
(void**)&exeInfo.blockParamGpu);
int const numLinksToRun = ev.useSingleStream ? 1 : exeInfo.links.size();
exeInfo.streams.resize(numLinksToRun);
exeInfo.startEvents.resize(numLinksToRun);
exeInfo.stopEvents.resize(numLinksToRun);
for (int i = 0; i < numLinksToRun; ++i)
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
exeInfo.streams.resize(numTransfersToRun);
exeInfo.startEvents.resize(numTransfersToRun);
exeInfo.stopEvents.resize(numTransfersToRun);
for (int i = 0; i < numTransfersToRun; ++i)
{
HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
......@@ -212,48 +212,52 @@ int main(int argc, char **argv)
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
int linkOffset = 0;
for (int i = 0; i < exeInfo.links.size(); i++)
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
exeInfo.links[i].blockParamGpuPtr = exeInfo.blockParamGpu + linkOffset;
linkOffset += exeInfo.links[i].blockParam.size();
exeInfo.transfers[i].blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
transferOffset += exeInfo.transfers[i].blockParam.size();
}
}
}
// Loop over all the different number of bytes to use per Link
// Loop over all the different number of bytes to use per Transfer
for (auto N : valuesOfN)
{
if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
// Prepare input memory and block parameters for current N
for (auto& exeInfoPair : linkMap)
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
int linkOffset = 0;
int transferOffset = 0;
for (int i = 0; i < exeInfo.links.size(); ++i)
for (int i = 0; i < exeInfo.transfers.size(); ++i)
{
Link& link = exeInfo.links[i];
link.PrepareBlockParams(ev, N);
Transfer& transfer = exeInfo.transfers[i];
transfer.PrepareBlockParams(ev, N);
// Copy block parameters to GPU for GPU executors
if (link.exeMemType == MEM_GPU)
if (transfer.exeMemType == MEM_GPU)
{
HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[linkOffset],
link.blockParam.data(),
link.blockParam.size() * sizeof(BlockParam),
HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset],
transfer.blockParam.data(),
transfer.blockParam.size() * sizeof(BlockParam),
hipMemcpyHostToDevice));
linkOffset += link.blockParam.size();
transferOffset += transfer.blockParam.size();
}
}
}
// Launch kernels (warmup iterations are not counted)
double totalCpuTime = 0;
for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++)
size_t numTimedIterations = 0;
for (int iteration = -ev.numWarmups; ; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Pause before starting first timed iteration in interactive mode
if (ev.useInteractive && iteration == 0)
{
......@@ -265,18 +269,18 @@ int main(int argc, char **argv)
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all links in parallel
for (auto& exeInfoPair : linkMap)
// Execute all Transfers in parallel
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
int const numLinksToRun = ev.useSingleStream ? 1 : exeInfo.links.size();
for (int i = 0; i < numLinksToRun; ++i)
threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(exeInfo), i));
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i));
}
// Wait for all threads to finish
int const numLinks = threads.size();
for (int i = 0; i < numLinks; i++)
int const numTransfers = threads.size();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
......@@ -286,9 +290,11 @@ int main(int argc, char **argv)
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (iteration >= 0) totalCpuTime += deltaSec;
if (iteration >= 0)
{
++numTimedIterations;
totalCpuTime += deltaSec;
}
}
// Pause for interactive mode
......@@ -299,89 +305,89 @@ int main(int argc, char **argv)
printf("\n");
}
// Validate that each link has transferred correctly
int const numLinks = linkList.size();
for (auto link : linkList)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, link->dstMem + initOffset);
// Validate that each transfer has transferred correctly
int const numTransfers = transferList.size();
for (auto transfer : transferList)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset);
// Report timings
totalCpuTime = totalCpuTime / (1.0 * ev.numIterations) * 1000;
double totalBandwidthGbs = (numLinks * N * sizeof(float) / 1.0E6) / totalCpuTime;
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (numTransfers * N * sizeof(float) / 1.0E6) / totalCpuTime;
double maxGpuTime = 0;
if (ev.useSingleStream)
{
for (auto& exeInfoPair : linkMap)
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo const& exeInfo = exeInfoPair.second;
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
double exeDurationMsec = exeInfo.totalTime / (1.0 * ev.numIterations);
double exeBandwidthGbs = (exeInfo.links.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f;
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
if (!ev.outputToCsv)
{
printf(" Executor: %cPU %02d (# Links %02lu)| %9.3f GB/s | %8.3f ms |\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.links.size(), exeBandwidthGbs, exeDurationMsec);
for (auto link : exeInfo.links)
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec);
for (auto transfer : exeInfo.transfers)
{
double linkDurationMsec = link.linkTime / (1.0 * ev.numIterations);
double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
printf(" Link %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%02d) -> %c%02d\n",
link.linkIndex,
linkBandwidthGbs,
linkDurationMsec,
MemTypeStr[link.srcMemType], link.srcIndex,
MemTypeStr[link.exeMemType], link.exeIndex,
link.exeMemType == MEM_CPU ? ev.numCpuPerLink : link.numBlocksToUse,
MemTypeStr[link.dstMemType], link.dstIndex);
double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer.transferIndex,
transferBandwidthGbs,
transferDurationMsec,
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
MemTypeStr[transfer.dstMemType], transfer.dstIndex);
}
}
else
{
printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%d\n",
printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
testNum, N * sizeof(float),
MemTypeStr[exeMemType], exeIndex,
exeBandwidthGbs, exeDurationMsec,
ev.byteOffset,
ev.numWarmups, ev.numIterations);
ev.numWarmups, numTimedIterations);
}
}
}
else
{
for (auto link : linkList)
for (auto transfer : transferList)
{
double linkDurationMsec = link->linkTime / (1.0 * ev.numIterations);
double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, linkDurationMsec);
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
if (!ev.outputToCsv)
{
printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n",
link->linkIndex,
MemTypeStr[link->srcMemType], link->srcIndex,
MemTypeStr[link->exeMemType], link->exeIndex,
link->exeMemType == MEM_CPU ? ev.numCpuPerLink : link->numBlocksToUse,
MemTypeStr[link->dstMemType], link->dstIndex,
linkBandwidthGbs, linkDurationMsec,
GetLinkDesc(*link).c_str());
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n",
transfer->transferIndex,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str());
}
else
{
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%d\n",
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
testNum, N * sizeof(float),
MemTypeStr[link->srcMemType], link->srcIndex,
MemTypeStr[link->exeMemType], link->exeIndex,
MemTypeStr[link->dstMemType], link->dstIndex,
link->exeMemType == MEM_CPU ? ev.numCpuPerLink : link->numBlocksToUse,
linkBandwidthGbs, linkDurationMsec,
GetLinkDesc(*link).c_str(),
link->srcMem + initOffset, link->dstMem + initOffset,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str(),
transfer->srcMem + initOffset, transfer->dstMem + initOffset,
ev.byteOffset,
ev.numWarmups, ev.numIterations);
ev.numWarmups, numTimedIterations);
}
}
}
......@@ -389,32 +395,32 @@ int main(int argc, char **argv)
// Display aggregate statistics
if (!ev.outputToCsv)
{
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime,
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime,
totalCpuTime - maxGpuTime);
}
else
{
printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%d\n",
printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset,
ev.numWarmups, ev.numIterations);
ev.numWarmups, numTimedIterations);
}
}
// Release GPU memory
for (auto exeInfoPair : linkMap)
for (auto exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
for (auto& link : exeInfo.links)
for (auto& transfer : exeInfo.transfers)
{
// Get some aliases to link variables
MemType const& exeMemType = link.exeMemType;
MemType const& srcMemType = link.srcMemType;
MemType const& dstMemType = link.dstMemType;
// Get some aliases to Transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
// Allocate (maximum) source / destination memory based on type / device index
DeallocateMemory(srcMemType, link.srcMem);
DeallocateMemory(dstMemType, link.dstMem);
link.blockParam.clear();
DeallocateMemory(srcMemType, transfer.srcMem);
DeallocateMemory(dstMemType, transfer.dstMem);
transfer.blockParam.clear();
}
MemType const exeMemType = exeInfoPair.first.first;
......@@ -422,8 +428,8 @@ int main(int argc, char **argv)
if (exeMemType == MEM_GPU)
{
DeallocateMemory(exeMemType, exeInfo.blockParamGpu);
int const numLinksToRun = ev.useSingleStream ? 1 : exeInfo.links.size();
for (int i = 0; i < numLinksToRun; ++i)
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
{
HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
......@@ -439,7 +445,7 @@ int main(int argc, char **argv)
void DisplayUsage(char const* cmdName)
{
printf("TransferBench v. %s\n", TB_VERSION);
printf("TransferBench v%s\n", TB_VERSION);
printf("========================================\n");
if (numa_available() == -1)
......@@ -453,16 +459,16 @@ void DisplayUsage(char const* cmdName)
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Links to execute (see example.cfg for format)\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset benchmark:\n");
printf(" p2p - All CPU/GPU pairs benchmark\n");
printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n");
printf(" g2g - All GPU/GPU pairs benchmark\n");
printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n");
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
printf(" N : (Optional) Number of bytes to transfer per link.\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_LINK);
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
......@@ -574,21 +580,21 @@ void DisplayTopology(bool const outputToCsv)
}
}
void PopulateTestSizes(size_t const numBytesPerLink,
void PopulateTestSizes(size_t const numBytesPerTransfer,
int const samplingFactor,
std::vector<size_t>& valuesOfN)
{
valuesOfN.clear();
// If the number of bytes is specified, use it
if (numBytesPerLink != 0)
if (numBytesPerTransfer != 0)
{
if (numBytesPerLink % 4)
if (numBytesPerTransfer % 4)
{
printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 4\n", numBytesPerLink);
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
size_t N = numBytesPerLink / sizeof(float);
size_t N = numBytesPerTransfer / sizeof(float);
valuesOfN.push_back(N);
}
else
......@@ -642,24 +648,24 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
}
}
// Helper function to parse a list of link definitions
void ParseLinks(char* line, int numCpus, int numGpus, LinkMap& linkMap)
// Helper function to parse a list of Transfer definitions
void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferMap)
{
// Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
linkMap.clear();
int numLinks = 0;
transferMap.clear();
int numTransfers = 0;
std::istringstream iss(line);
iss >> numLinks;
iss >> numTransfers;
if (iss.fail()) return;
std::string exeMem;
std::string srcMem;
std::string dstMem;
if (numLinks > 0)
if (numTransfers > 0)
{
// Method 1: Take in triples (srcMem, exeMem, dstMem)
int numBlocksToUse;
......@@ -669,64 +675,64 @@ void ParseLinks(char* line, int numCpus, int numGpus, LinkMap& linkMap)
printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse);
exit(1);
}
for (int i = 0; i < numLinks; i++)
for (int i = 0; i < numTransfers; i++)
{
Link link;
link.linkIndex = i;
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Link triplet (possibly missing a SRC or EXE or DST)\n");
printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
exit(1);
}
ParseMemType(srcMem, numCpus, numGpus, &link.srcMemType, &link.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &link.exeMemType, &link.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &link.dstMemType, &link.dstIndex);
link.numBlocksToUse = numBlocksToUse;
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
transfer.numBlocksToUse = numBlocksToUse;
// Ensure executor is either CPU or GPU
if (link.exeMemType != MEM_CPU && link.exeMemType != MEM_GPU)
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n",
srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), link.numBlocksToUse);
srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(link.exeMemType, link.exeIndex);
ExecutorInfo& executorInfo = linkMap[executor];
executorInfo.totalBlocks += link.numBlocksToUse;
executorInfo.links.push_back(link);
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
}
}
else
{
// Method 2: Read in quads (srcMem, exeMem, dstMem, Read common # blocks to use, then read (src, dst) doubles
numLinks *= -1;
numTransfers *= -1;
for (int i = 0; i < numLinks; i++)
for (int i = 0; i < numTransfers; i++)
{
Link link;
link.linkIndex = i;
iss >> srcMem >> exeMem >> dstMem >> link.numBlocksToUse;
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem >> transfer.numBlocksToUse;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Link quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
exit(1);
}
ParseMemType(srcMem, numCpus, numGpus, &link.srcMemType, &link.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &link.exeMemType, &link.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &link.dstMemType, &link.dstIndex);
if (link.exeMemType != MEM_CPU && link.exeMemType != MEM_GPU)
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n"
, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), link.numBlocksToUse);
, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(link.exeMemType, link.exeIndex);
ExecutorInfo& executorInfo = linkMap[executor];
executorInfo.totalBlocks += link.numBlocksToUse;
executorInfo.links.push_back(link);
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
}
}
}
......@@ -772,6 +778,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
}
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE)
{
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
......@@ -969,108 +976,95 @@ error:
exit(1);
}
std::string GetLinkDesc(Link const& link)
std::string GetTransferDesc(Transfer const& transfer)
{
return GetDesc(link.srcMemType, link.srcIndex, link.exeMemType, link.exeIndex) + "-"
+ GetDesc(link.exeMemType, link.exeIndex, link.dstMemType, link.dstIndex);
return GetDesc(transfer.srcMemType, transfer.srcIndex, transfer.exeMemType, transfer.exeIndex) + "-"
+ GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex);
}
void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const linkIdx)
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx)
{
Link& link = exeInfo.links[linkIdx];
Transfer& transfer = exeInfo.transfers[transferIdx];
// GPU execution agent
if (link.exeMemType == MEM_GPU)
if (transfer.exeMemType == MEM_GPU)
{
// Switch to executing GPU
int const exeIndex = RemappedIndex(link.exeIndex, MEM_GPU);
int const exeIndex = RemappedIndex(transfer.exeIndex, MEM_GPU);
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[linkIdx];
hipEvent_t& startEvent = exeInfo.startEvents[linkIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[linkIdx];
bool recordStart = (!ev.useSingleSync || iteration == 0 || ev.useSingleStream);
bool recordStop = (!ev.useSingleSync || iteration == ev.numIterations - 1 || ev.useSingleStream);
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
int const initOffset = ev.byteOffset / sizeof(float);
if (ev.useHipCall)
{
// Record start event
if (recordStart) HIP_CALL(hipEventRecord(startEvent, stream));
HIP_CALL(hipEventRecord(startEvent, stream));
// Execute hipMemset / hipMemcpy
if (ev.useMemset)
HIP_CALL(hipMemsetAsync(link.dstMem + initOffset, 42, N * sizeof(float), stream));
HIP_CALL(hipMemsetAsync(transfer.dstMem + initOffset, 42, N * sizeof(float), stream));
else
HIP_CALL(hipMemcpyAsync(link.dstMem + initOffset,
link.srcMem + initOffset,
HIP_CALL(hipMemcpyAsync(transfer.dstMem + initOffset,
transfer.srcMem + initOffset,
N * sizeof(float), hipMemcpyDefault,
stream));
// Record stop event
if (recordStop) HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipEventRecord(stopEvent, stream));
}
else
{
if (!ev.combineTiming && recordStart) HIP_CALL(hipEventRecord(startEvent, stream));
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalBlocks : link.numBlocksToUse;
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalBlocks : transfer.numBlocksToUse;
hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
dim3(numBlocksToRun, 1, 1),
dim3(BLOCKSIZE, 1, 1),
ev.sharedMemBytes, stream,
(ev.combineTiming && recordStart) ? startEvent : NULL,
(ev.combineTiming && recordStop) ? stopEvent : NULL,
0, link.blockParamGpuPtr);
if (!ev.combineTiming & recordStop) HIP_CALL(hipEventRecord(stopEvent, stream));
startEvent, stopEvent,
0, transfer.blockParamGpuPtr);
}
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
if (!ev.useSingleSync || iteration == -1 || iteration == ev.numIterations - 1)
{
HIP_CALL(hipStreamSynchronize(stream));
}
HIP_CALL(hipStreamSynchronize(stream));
if (iteration >= 0)
{
// Record GPU timing
if (!ev.useSingleSync || iteration == ev.numIterations - 1 || ev.useSingleStream)
{
HIP_CALL(hipEventSynchronize(stopEvent));
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
if (ev.useSingleStream)
if (ev.useSingleStream)
{
for (Transfer& currTransfer : exeInfo.transfers)
{
for (Link& currLink : exeInfo.links)
long long minStartCycle = currTransfer.blockParamGpuPtr[0].startCycle;
long long maxStopCycle = currTransfer.blockParamGpuPtr[0].stopCycle;
for (int i = 1; i < currTransfer.numBlocksToUse; i++)
{
long long minStartCycle = currLink.blockParamGpuPtr[0].startCycle;
long long maxStopCycle = currLink.blockParamGpuPtr[0].stopCycle;
for (int i = 1; i < currLink.numBlocksToUse; i++)
{
minStartCycle = std::min(minStartCycle, currLink.blockParamGpuPtr[i].startCycle);
maxStopCycle = std::max(maxStopCycle, currLink.blockParamGpuPtr[i].stopCycle);
}
int const wallClockRate = GetWallClockRate(exeIndex);
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currLink.linkTime += iterationTimeMs;
minStartCycle = std::min(minStartCycle, currTransfer.blockParamGpuPtr[i].startCycle);
maxStopCycle = std::max(maxStopCycle, currTransfer.blockParamGpuPtr[i].stopCycle);
}
exeInfo.totalTime += gpuDeltaMsec;
}
else
{
link.linkTime += gpuDeltaMsec;
int const wallClockRate = GetWallClockRate(exeIndex);
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer.transferTime += iterationTimeMs;
}
exeInfo.totalTime += gpuDeltaMsec;
}
else
{
transfer.transferTime += gpuDeltaMsec;
}
}
}
else if (link.exeMemType == MEM_CPU) // CPU execution agent
else if (transfer.exeMemType == MEM_CPU) // CPU execution agent
{
// Force this thread and all child threads onto correct NUMA node
if (numa_run_on_node(link.exeIndex))
if (numa_run_on_node(transfer.exeIndex))
{
printf("[ERROR] Unable to set CPU to NUMA node %d\n", link.exeIndex);
printf("[ERROR] Unable to set CPU to NUMA node %d\n", transfer.exeIndex);
exit(1);
}
......@@ -1079,18 +1073,18 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInf
auto cpuStart = std::chrono::high_resolution_clock::now();
// Launch child-threads to perform memcopies
for (int i = 0; i < ev.numCpuPerLink; i++)
childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(link.blockParam[i])));
for (int i = 0; i < ev.numCpuPerTransfer; i++)
childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(transfer.blockParam[i])));
// Wait for child-threads to finish
for (int i = 0; i < ev.numCpuPerLink; i++)
for (int i = 0; i < ev.numCpuPerTransfer; i++)
childThreads[i].join();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
// Record time if not a warmup iteration
if (iteration >= 0)
link.linkTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
transfer.transferTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
}
}
......@@ -1110,7 +1104,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, in
if (!ev.outputToCsv)
{
printf("Performing copies in each direction of %lu bytes\n", N * sizeof(float));
printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerLink);
printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerTransfer);
printf("Using %d CUs per transfer\n", numBlocksToUse);
}
else
......@@ -1195,53 +1189,53 @@ double GetPeakBandwidth(EnvVars const& ev,
int const initOffset = ev.byteOffset / sizeof(float);
// Prepare Links
std::vector<Link*> links;
// Prepare Transfers
std::vector<Transfer*> transfers;
ExecutorInfo exeInfo[2];
for (int i = 0; i < 2; i++)
{
exeInfo[i].links.resize(1);
exeInfo[i].transfers.resize(1);
exeInfo[i].streams.resize(1);
exeInfo[i].startEvents.resize(1);
exeInfo[i].stopEvents.resize(1);
links.push_back(&exeInfo[i].links[0]);
transfers.push_back(&exeInfo[i].transfers[0]);
}
links[0]->srcMemType = links[1]->dstMemType = srcMemType;
links[0]->dstMemType = links[1]->srcMemType = dstMemType;
links[0]->srcIndex = links[1]->dstIndex = RemappedIndex(srcIndex, srcMemType);
links[0]->dstIndex = links[1]->srcIndex = RemappedIndex(dstIndex, dstMemType);
transfers[0]->srcMemType = transfers[1]->dstMemType = srcMemType;
transfers[0]->dstMemType = transfers[1]->srcMemType = dstMemType;
transfers[0]->srcIndex = transfers[1]->dstIndex = RemappedIndex(srcIndex, srcMemType);
transfers[0]->dstIndex = transfers[1]->srcIndex = RemappedIndex(dstIndex, dstMemType);
// Either perform (local read + remote write), or (remote read + local write)
links[0]->exeMemType = (readMode == 0 ? srcMemType : dstMemType);
links[1]->exeMemType = (readMode == 0 ? dstMemType : srcMemType);
links[0]->exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), links[0]->exeMemType);
links[1]->exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), links[1]->exeMemType);
transfers[0]->exeMemType = (readMode == 0 ? srcMemType : dstMemType);
transfers[1]->exeMemType = (readMode == 0 ? dstMemType : srcMemType);
transfers[0]->exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), transfers[0]->exeMemType);
transfers[1]->exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), transfers[1]->exeMemType);
for (int i = 0; i <= isBidirectional; i++)
{
AllocateMemory(links[i]->srcMemType, links[i]->srcIndex,
N * sizeof(float) + ev.byteOffset, (void**)&links[i]->srcMem);
AllocateMemory(links[i]->dstMemType, links[i]->dstIndex,
N * sizeof(float) + ev.byteOffset, (void**)&links[i]->dstMem);
AllocateMemory(transfers[i]->srcMemType, transfers[i]->srcIndex,
N * sizeof(float) + ev.byteOffset, (void**)&transfers[i]->srcMem);
AllocateMemory(transfers[i]->dstMemType, transfers[i]->dstIndex,
N * sizeof(float) + ev.byteOffset, (void**)&transfers[i]->dstMem);
// Prepare block parameters on CPU
links[i]->numBlocksToUse = (links[i]->exeMemType == MEM_GPU) ? numBlocksToUse : ev.numCpuPerLink;
links[i]->blockParam.resize(links[i]->numBlocksToUse);
links[i]->PrepareBlockParams(ev, N);
transfers[i]->numBlocksToUse = (transfers[i]->exeMemType == MEM_GPU) ? numBlocksToUse : ev.numCpuPerTransfer;
transfers[i]->blockParam.resize(transfers[i]->numBlocksToUse);
transfers[i]->PrepareBlockParams(ev, N);
if (links[i]->exeMemType == MEM_GPU)
if (transfers[i]->exeMemType == MEM_GPU)
{
// Copy block parameters onto GPU
AllocateMemory(MEM_GPU, links[i]->exeIndex, numBlocksToUse * sizeof(BlockParam),
(void **)&links[i]->blockParamGpuPtr);
HIP_CALL(hipMemcpy(links[i]->blockParamGpuPtr,
links[i]->blockParam.data(),
AllocateMemory(MEM_GPU, transfers[i]->exeIndex, numBlocksToUse * sizeof(BlockParam),
(void **)&transfers[i]->blockParamGpuPtr);
HIP_CALL(hipMemcpy(transfers[i]->blockParamGpuPtr,
transfers[i]->blockParam.data(),
numBlocksToUse * sizeof(BlockParam),
hipMemcpyHostToDevice));
// Prepare GPU resources
HIP_CALL(hipSetDevice(links[i]->exeIndex));
HIP_CALL(hipSetDevice(transfers[i]->exeIndex));
HIP_CALL(hipStreamCreate(&exeInfo[i].streams[0]));
HIP_CALL(hipEventCreate(&exeInfo[i].startEvents[0]));
HIP_CALL(hipEventCreate(&exeInfo[i].stopEvents[0]));
......@@ -1255,7 +1249,7 @@ double GetPeakBandwidth(EnvVars const& ev,
{
// Perform timed iterations
for (int i = 0; i <= isBidirectional; i++)
threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(exeInfo[i]), 0));
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo[i]), 0));
// Wait for all threads to finish
for (int i = 0; i <= isBidirectional; i++)
......@@ -1265,28 +1259,28 @@ double GetPeakBandwidth(EnvVars const& ev,
}
}
// Validate that each link has transferred correctly
// Validate that each Transfer has transferred correctly
for (int i = 0; i <= isBidirectional; i++)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i]->dstMem + initOffset);
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfers[i]->dstMem + initOffset);
// Collect aggregate bandwidth
double totalBandwidth = 0;
for (int i = 0; i <= isBidirectional; i++)
{
double linkDurationMsec = links[i]->linkTime / (1.0 * ev.numIterations);
double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
totalBandwidth += linkBandwidthGbs;
double transferDurationMsec = transfers[i]->transferTime / (1.0 * ev.numIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
totalBandwidth += transferBandwidthGbs;
}
// Release GPU memory
for (int i = 0; i <= isBidirectional; i++)
{
DeallocateMemory(links[i]->srcMemType, links[i]->srcMem);
DeallocateMemory(links[i]->dstMemType, links[i]->dstMem);
DeallocateMemory(transfers[i]->srcMemType, transfers[i]->srcMem);
DeallocateMemory(transfers[i]->dstMemType, transfers[i]->dstMem);
if (links[i]->exeMemType == MEM_GPU)
if (transfers[i]->exeMemType == MEM_GPU)
{
DeallocateMemory(MEM_GPU, links[i]->blockParamGpuPtr);
DeallocateMemory(MEM_GPU, transfers[i]->blockParamGpuPtr);
HIP_CALL(hipStreamDestroy(exeInfo[i].streams[0]));
HIP_CALL(hipEventDestroy(exeInfo[i].startEvents[0]));
HIP_CALL(hipEventDestroy(exeInfo[i].stopEvents[0]));
......@@ -1295,7 +1289,7 @@ double GetPeakBandwidth(EnvVars const& ev,
return totalBandwidth;
}
void Link::PrepareBlockParams(EnvVars const& ev, size_t const N)
void Transfer::PrepareBlockParams(EnvVars const& ev, size_t const N)
{
int const initOffset = ev.byteOffset / sizeof(float);
......@@ -1303,7 +1297,7 @@ void Link::PrepareBlockParams(EnvVars const& ev, size_t const N)
CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, this->srcMem + initOffset);
// Each block needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each block does for this Link
// Figure out the sub-array each block does for this Transfer
// - Partition N as evenly as possible, but try to keep blocks as multiples of BLOCK_BYTES bytes,
// except the very last one, for alignment reasons
int const targetMultiple = ev.blockBytes / sizeof(float);
......@@ -1324,7 +1318,7 @@ void Link::PrepareBlockParams(EnvVars const& ev, size_t const N)
assigned += param.N;
}
this->linkTime = 0.0;
this->transferTime = 0.0;
}
// NOTE: This is a stop-gap solution until HIP provides wallclock values
......
......@@ -37,8 +37,6 @@ THE SOFTWARE.
#include "EnvVars.hpp"
#define TB_VERSION "1.01"
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
......@@ -52,7 +50,7 @@ THE SOFTWARE.
} while (0)
// Simple configuration parameters
size_t const DEFAULT_BYTES_PER_LINK = (1<<26); // Amount of data transferred per Link
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer
// Different src/dst memory types supported
typedef enum
......@@ -81,19 +79,19 @@ struct BlockParam
long long stopCycle;
};
// Each Link is a uni-direction operation from a src memory to dst memory
struct Link
// Each Transfer is a uni-direction operation from a src memory to dst memory
struct Transfer
{
int linkIndex; // Link identifier
int transferIndex; // Transfer identifier
// Link config
MemType exeMemType; // Link executor type (CPU or GPU)
// Transfer config
MemType exeMemType; // Transfer executor type (CPU or GPU)
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
MemType srcMemType; // Source memory type
int srcIndex; // Source device index
MemType dstMemType; // Destination memory type
int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Link
int numBlocksToUse; // Number of threadblocks to use for this Transfer
// Memory
float* srcMem; // Source memory
......@@ -104,7 +102,7 @@ struct Link
BlockParam* blockParamGpuPtr;
// Results
double linkTime;
double transferTime;
// Prepares src memory and how to divide N elements across threadblocks/threads
void PrepareBlockParams(EnvVars const& ev, size_t const N);
......@@ -114,7 +112,7 @@ typedef std::pair<MemType, int> Executor;
struct ExecutorInfo
{
std::vector<Link> links; // Links to execute
std::vector<Transfer> transfers; // Transfers to execute
// For GPU-Executors
int totalBlocks; // Total number of CUs/CPU threads to use
......@@ -127,7 +125,7 @@ struct ExecutorInfo
double totalTime;
};
typedef std::map<Executor, ExecutorInfo> LinkMap;
typedef std::map<Executor, ExecutorInfo> TransferMap;
// Display usage instructions
void DisplayUsage(char const* cmdName);
......@@ -136,21 +134,21 @@ void DisplayUsage(char const* cmdName);
void DisplayTopology(bool const outputToCsv);
// Build array of test sizes based on sampling factor
void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor,
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor,
std::vector<size_t>& valuesofN);
void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
MemType* memType, int* memIndex);
void ParseLinks(char* line, int numCpus, int numGpus,
LinkMap& linkMap);
void ParseTransfers(char* line, int numCpus, int numGpus,
TransferMap& transferMap);
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const linkIdx);
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
// Return the maximum bandwidth measured for given (src/dst) pair
......@@ -167,6 +165,6 @@ double GetPeakBandwidth(EnvVars const& ev,
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetDesc(MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex);
std::string GetLinkDesc(Link const& link);
std::string GetTransferDesc(Transfer const& transfer);
int RemappedIndex(int const origIdx, MemType const memType);
int GetWallClockRate(int deviceId);
# ConfigFile Format:
# ==================
# A Link is defined as a uni-directional transfer from src memory location to dst memory location
# A Transfer is defined as a uni-directional transfer from src memory location to dst memory location
# executed by either CPU or GPU
# Each single line in the configuration file defines a set of Links to run in parallel
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
# There are two ways to specify the configuration file:
# 1) Basic
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link
# A positive number of Links is specified followed by that number of triplets describing each Link
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Transfer
# A positive number of Transfers is specified followed by that number of triplets describing each Transfer
# #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# #Transfers #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# 2) Advanced
# The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link
# A negative number of links is specified, followed by quadruples describing each Link
# -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
# The advanced specification allows different number of threadblocks/CUs used per GPU-executed Transfer
# A negative number of Transfers is specified, followed by quadruples describing each Transfer
# -#Transfers (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
# Argument Details:
# #Links : Number of Links to be run in parallel
# #CUs : Number of threadblocks/CUs to use for a GPU-executed Link
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
# Executor: Executor are specified by a character indicating executor type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to 1)
# - G: GPU-executed (Indexed from 0 to 3)
# dstMemL : Destination memory location (Where the data is to be written to)
# Memory locations are specified by a character indicating memory type,
# followed by device index (0-indexed)
# Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# #Transfers: Number of Transfers to be run in parallel
# #CUs : Number of threadblocks/CUs to use for a GPU-executed Transfer
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
# Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory location (Where the data is to be written to)
# Memory locations are specified by a character indicating memory type,
# followed by device index (0-indexed)
# Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# Examples:
# 1 4 (G0->G0->G1) Single link using 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Single link using 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs
# 1 4 (G0->G0->G1) Single Transfer using 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Single Transfer using 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Transfers in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Transfers in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs
# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output
# Single GPU-executed link between GPUs 0 and 1 using 4 CUs
# Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
1 4 (G0->G0->G1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment