Commit ddb6508f authored by Gilbert Lee's avatar Gilbert Lee
Browse files

TransferBench v1.02

parent 07ac2dce
# Changelog for TransferBench # Changelog for TransferBench
## v1.02
### Added
- Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test
### Changed
- Copies are now referred to as Transfers instead of Links
- Re-ordering how env vars are displayed (alphabetically now)
### Removed
- Combined timing is now always on for kernel-based GPU copies. COMBINED_TIMING env var has been removed
- Use single sync is no longer supported to facilitate variable iterations. USE_SINGLE_SYNC env var has been removed
## v1.01 ## v1.01
### Added ### Added
- Adding USE_SINGLE_STREAM feature - Adding USE_SINGLE_STREAM feature
......
...@@ -25,32 +25,32 @@ THE SOFTWARE. ...@@ -25,32 +25,32 @@ THE SOFTWARE.
#include <algorithm> #include <algorithm>
#define TB_VERSION "1.02"
// This class manages environment variable that affect TransferBench // This class manages environment variable that affect TransferBench
class EnvVars class EnvVars
{ {
public: public:
// Default configuration values // Default configuration values
int const DEFAULT_NUM_WARMUPS = 3; int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10; int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1; int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_LINK = 4; int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
// Environment variables // Environment variables
int useHipCall; // Use hipMemcpy/hipMemset instead of custom shader kernels int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int useMemset; // Perform a memset instead of a copy (ignores source memory) int byteOffset; // Byte-offset for memory allocations
int useSingleSync; // Perform synchronization only once after all iterations instead of per iteration int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer
int useInteractive; // Pause for user-input before starting transfer loop int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int combineTiming; // Combines the timing with kernel launch int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format int outputToCsv; // Output in CSV format
int byteOffset; // Byte-offset for memory allocations int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int numWarmups; // Number of un-timed warmup iterations to perform int sharedMemBytes; // Amount of shared memory to use per threadblock
int numIterations; // Number of timed iterations to perform int useHipCall; // Use hipMemcpy/hipMemset instead of custom shader kernels
int samplingFactor; // Affects how many different values of N are generated (when N set to 0) int useInteractive; // Pause for user-input before starting transfer loop
int numCpuPerLink; // Number of CPU child threads to use per CPU link int useMemset; // Perform a memset instead of a copy (ignores source memory)
int sharedMemBytes; // Amount of shared memory to use per threadblock int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy int useSingleStream; // Use a single stream per device instead of per Transfer. Can not be used with USE_HIP_CALL
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int useSingleStream; // Use a single stream per device instead of per Link. Can not be used with USE_HIP_CALL
std::vector<float> fillPattern; // Pattern of floats used to fill source data std::vector<float> fillPattern; // Pattern of floats used to fill source data
...@@ -61,21 +61,19 @@ public: ...@@ -61,21 +61,19 @@ public:
hipDeviceGetAttribute(&maxSharedMemBytes, hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0); hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
useHipCall = GetEnvVar("USE_HIP_CALL" , 0); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
useMemset = GetEnvVar("USE_MEMSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 1); numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
combineTiming = GetEnvVar("COMBINE_TIMING" , 0); numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS); useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK); useMemset = GetEnvVar("USE_MEMSET" , 0);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM", 0);
// Check for fill pattern // Check for fill pattern
char* pattern = getenv("FILL_PATTERN"); char* pattern = getenv("FILL_PATTERN");
...@@ -146,19 +144,14 @@ public: ...@@ -146,19 +144,14 @@ public:
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n"); printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
exit(1); exit(1);
} }
if (numIterations <= 0)
{
printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n");
exit(1);
}
if (samplingFactor < 1) if (samplingFactor < 1)
{ {
printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n"); printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
exit(1); exit(1);
} }
if (numCpuPerLink < 1) if (numCpuPerTransfer < 1)
{ {
printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n"); printf("[ERROR] NUM_CPU_PER_TRANSFER must be greater or equal to 1\n");
exit(1); exit(1);
} }
if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes) if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
...@@ -183,22 +176,20 @@ public: ...@@ -183,22 +176,20 @@ public:
{ {
printf("Environment variables:\n"); printf("Environment variables:\n");
printf("======================\n"); printf("======================\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n"); printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n"); printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per link. Cannot be used with USE_HIP_CALL\n"); printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n");
} }
// Display env var settings // Display env var settings
...@@ -206,45 +197,41 @@ public: ...@@ -206,45 +197,41 @@ public:
{ {
if (!outputToCsv) if (!outputToCsv)
{ {
printf("Run configuration\n"); printf("Run configuration (TransferBench v%s)\n", TB_VERSION);
printf("=====================================================\n"); printf("=====================================================\n");
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Running %d %s per topology\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall, printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
useHipCall ? "HIP functions" : "custom kernels"); useHipCall ? "HIP functions" : "custom kernels");
printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
useMemset ? "memset" : "memcopy");
if (useHipCall && !useMemset) if (useHipCall && !useMemset)
{ {
char* env = getenv("HSA_ENABLE_SDMA"); char* env = getenv("HSA_ENABLE_SDMA");
printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env, printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines"); (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
} }
printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync,
useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive, printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
useInteractive ? "interactive" : "non-interactive"); useInteractive ? "interactive" : "non-interactive");
printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming, printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
combineTiming ? "Using combined timing+launch" : "Using separate timing / launch"); useMemset ? "memset" : "memcopy");
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv, printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
outputToCsv ? "CSV" : "console"); usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset); printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups); useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
{
printf("Pattern: %s", getenv("FILL_PATTERN"));
}
else
{
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
}
printf("\n");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM", useSingleStream, (useSingleStream ? "device" : "Link"));
printf("\n"); printf("\n");
} }
}; };
......
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
This diff is collapsed.
...@@ -37,8 +37,6 @@ THE SOFTWARE. ...@@ -37,8 +37,6 @@ THE SOFTWARE.
#include "EnvVars.hpp" #include "EnvVars.hpp"
#define TB_VERSION "1.01"
// Helper macro for catching HIP errors // Helper macro for catching HIP errors
#define HIP_CALL(cmd) \ #define HIP_CALL(cmd) \
do { \ do { \
...@@ -52,7 +50,7 @@ THE SOFTWARE. ...@@ -52,7 +50,7 @@ THE SOFTWARE.
} while (0) } while (0)
// Simple configuration parameters // Simple configuration parameters
size_t const DEFAULT_BYTES_PER_LINK = (1<<26); // Amount of data transferred per Link size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer
// Different src/dst memory types supported // Different src/dst memory types supported
typedef enum typedef enum
...@@ -81,19 +79,19 @@ struct BlockParam ...@@ -81,19 +79,19 @@ struct BlockParam
long long stopCycle; long long stopCycle;
}; };
// Each Link is a uni-direction operation from a src memory to dst memory // Each Transfer is a uni-direction operation from a src memory to dst memory
struct Link struct Transfer
{ {
int linkIndex; // Link identifier int transferIndex; // Transfer identifier
// Link config // Transfer config
MemType exeMemType; // Link executor type (CPU or GPU) MemType exeMemType; // Transfer executor type (CPU or GPU)
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU) int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
MemType srcMemType; // Source memory type MemType srcMemType; // Source memory type
int srcIndex; // Source device index int srcIndex; // Source device index
MemType dstMemType; // Destination memory type MemType dstMemType; // Destination memory type
int dstIndex; // Destination device index int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Link int numBlocksToUse; // Number of threadblocks to use for this Transfer
// Memory // Memory
float* srcMem; // Source memory float* srcMem; // Source memory
...@@ -104,7 +102,7 @@ struct Link ...@@ -104,7 +102,7 @@ struct Link
BlockParam* blockParamGpuPtr; BlockParam* blockParamGpuPtr;
// Results // Results
double linkTime; double transferTime;
// Prepares src memory and how to divide N elements across threadblocks/threads // Prepares src memory and how to divide N elements across threadblocks/threads
void PrepareBlockParams(EnvVars const& ev, size_t const N); void PrepareBlockParams(EnvVars const& ev, size_t const N);
...@@ -114,7 +112,7 @@ typedef std::pair<MemType, int> Executor; ...@@ -114,7 +112,7 @@ typedef std::pair<MemType, int> Executor;
struct ExecutorInfo struct ExecutorInfo
{ {
std::vector<Link> links; // Links to execute std::vector<Transfer> transfers; // Transfers to execute
// For GPU-Executors // For GPU-Executors
int totalBlocks; // Total number of CUs/CPU threads to use int totalBlocks; // Total number of CUs/CPU threads to use
...@@ -127,7 +125,7 @@ struct ExecutorInfo ...@@ -127,7 +125,7 @@ struct ExecutorInfo
double totalTime; double totalTime;
}; };
typedef std::map<Executor, ExecutorInfo> LinkMap; typedef std::map<Executor, ExecutorInfo> TransferMap;
// Display usage instructions // Display usage instructions
void DisplayUsage(char const* cmdName); void DisplayUsage(char const* cmdName);
...@@ -136,21 +134,21 @@ void DisplayUsage(char const* cmdName); ...@@ -136,21 +134,21 @@ void DisplayUsage(char const* cmdName);
void DisplayTopology(bool const outputToCsv); void DisplayTopology(bool const outputToCsv);
// Build array of test sizes based on sampling factor // Build array of test sizes based on sampling factor
void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor,
std::vector<size_t>& valuesofN); std::vector<size_t>& valuesofN);
void ParseMemType(std::string const& token, int const numCpus, int const numGpus, void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
MemType* memType, int* memIndex); MemType* memType, int* memIndex);
void ParseLinks(char* line, int numCpus, int numGpus, void ParseTransfers(char* line, int numCpus, int numGpus,
LinkMap& linkMap); TransferMap& transferMap);
void EnablePeerAccess(int const deviceId, int const peerDeviceId); void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr); void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr); void DeallocateMemory(MemType memType, void* memPtr);
void CheckPages(char* byteArray, size_t numBytes, int targetId); void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const linkIdx); void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu); void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
// Return the maximum bandwidth measured for given (src/dst) pair // Return the maximum bandwidth measured for given (src/dst) pair
...@@ -167,6 +165,6 @@ double GetPeakBandwidth(EnvVars const& ev, ...@@ -167,6 +165,6 @@ double GetPeakBandwidth(EnvVars const& ev,
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetDesc(MemType srcMemType, int srcIndex, std::string GetDesc(MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex); MemType dstMemType, int dstIndex);
std::string GetLinkDesc(Link const& link); std::string GetTransferDesc(Transfer const& transfer);
int RemappedIndex(int const origIdx, MemType const memType); int RemappedIndex(int const origIdx, MemType const memType);
int GetWallClockRate(int deviceId); int GetWallClockRate(int deviceId);
# ConfigFile Format: # ConfigFile Format:
# ================== # ==================
# A Link is defined as a uni-directional transfer from src memory location to dst memory location # A Transfer is defined as a uni-directional transfer from src memory location to dst memory location
# executed by either CPU or GPU # executed by either CPU or GPU
# Each single line in the configuration file defines a set of Links to run in parallel # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
# There are two ways to specify the configuration file: # There are two ways to specify the configuration file:
# 1) Basic # 1) Basic
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link # The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Transfer
# A positive number of Links is specified followed by that number of triplets describing each Link # A positive number of Transfers is specified followed by that number of triplets describing each Transfer
# #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL) # #Transfers #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# 2) Advanced # 2) Advanced
# The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link # The advanced specification allows different number of threadblocks/CUs used per GPU-executed Transfer
# A negative number of links is specified, followed by quadruples describing each Link # A negative number of Transfers is specified, followed by quadruples describing each Transfer
# -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL) # -#Transfers (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
# Argument Details: # Argument Details:
# #Links : Number of Links to be run in parallel # #Transfers: Number of Transfers to be run in parallel
# #CUs : Number of threadblocks/CUs to use for a GPU-executed Link # #CUs : Number of threadblocks/CUs to use for a GPU-executed Transfer
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode # srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
# Executor: Executor are specified by a character indicating executor type, followed by device index (0-indexed) # Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to 1) # - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to 3) # - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory location (Where the data is to be written to) # dstMemL : Destination memory location (Where the data is to be written to)
# Memory locations are specified by a character indicating memory type, # Memory locations are specified by a character indicating memory type,
# followed by device index (0-indexed) # followed by device index (0-indexed)
# Supported memory locations are: # Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1]) # - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1]) # - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1]) # - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1]) # - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# Examples: # Examples:
# 1 4 (G0->G0->G1) Single link using 4 CUs on GPU0 to copy from GPU0 to GPU1 # 1 4 (G0->G0->G1) Single Transfer using 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Single link using 4 CUs on GPU2 to copy from CPU1 to GPU0 # 1 4 (C1->G2->G0) Single Transfer using 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs # 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Transfers in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs # -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Transfers in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs
# Round brackets and arrows '->' may be included for human clarity, but will be ignored and are unnecessary # Round brackets and arrows '->' may be included for human clarity, but will be ignored and are unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output # Lines starting with # will be ignored. Lines starting with ## will be echoed to output
# Single GPU-executed link between GPUs 0 and 1 using 4 CUs # Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
1 4 (G0->G0->G1) 1 4 (G0->G0->G1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment