Unverified Commit 3d284dd7 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.27 CU Masking, cmdline preset, CUDA fixes (#53)

parent 7608befb
# Changelog for TransferBench
## v1.27
### Added
- Adding cmdline preset to allow specifying simple tests on the command line
- E.g. ./TransferBench cmdline 64M "1 4 G0->G0->G1"
- Adding environment variable HIDE_ENV, which skips printing of environment variable values
- Adding environment variable CU_MASK, which allows selection of which CUs to execute on
- CU_MASK is specified in CU indices (0-#CUs-1), and '-' can be used to denote ranges of values
- E.g.: CU_MASK=3-8,16 would request Transfer be executed only on CUs 3,4,5,6,7,8,16
- NOTE: This is somewhat experimental and may not work on all hardware
- SHOW_ITERATIONS now shows CU usage for that iteration (experimental)
### Modified
- Adding extra comments on commonly missing includes with details on how to install them
### Fixed
- CUDA compilation should work again (wall_clock64 CUDA alias was not defined)
## v1.26
### Added
- Setting SHOW_ITERATIONS=1 provides additional information about per-iteration timing for file and p2p configs
......
......@@ -22,7 +22,8 @@ THE SOFTWARE.
// This program measures simultaneous copy performance across multiple GPUs
// on the same node
#include <numa.h>
#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <cmath> // If not found, try installing g++-12 (e.g apt-get install g++-12)
#include <numaif.h>
#include <random>
#include <stack>
......@@ -115,6 +116,48 @@ int main(int argc, char **argv)
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0);
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
{
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
// Read Transfer from command line
std::string cmdlineTransfer;
for (int i = 3; i < argc; i++)
cmdlineTransfer += std::string(argv[i]) + " ";
char line[2048];
sprintf(line, "%s", cmdlineTransfer.c_str());
std::vector<Transfer> transfers;
ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
if (transfers.empty()) exit(0);
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, 1, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, 1, curr, transfers);
curr += delta;
}
}
}
exit(0);
}
// Check that Transfer configuration file can be opened
ev.configMode = CFG_FILE;
......@@ -252,7 +295,16 @@ void ExecuteTransfers(EnvVars const& ev,
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i)
{
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
if (ev.cuMask.size())
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#endif
}
else
{
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
}
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
......@@ -261,8 +313,13 @@ void ExecuteTransfers(EnvVars const& ev,
{
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_CPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
}
}
}
......@@ -538,7 +595,10 @@ void ExecuteTransfers(EnvVars const& ev,
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
printf(" Iter %03d | %7.3f GB/s | %8.3f ms | CUs:", t.second, iterBandwidthGbs, iterDurationMsec);
for (auto x : transfer->perIterationCUs[t.second - 1])
printf(" %2d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
......@@ -608,7 +668,11 @@ cleanup:
if (exeType == EXE_GPU_GFX)
{
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_CPU, exeInfo.subExecParamGpu);
#endif
}
}
}
......@@ -641,6 +705,7 @@ void DisplayUsage(char const* cmdName)
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
......@@ -1182,6 +1247,15 @@ void CheckPages(char* array, size_t numBytes, int targetId)
}
}
// Converts a raw HW_REG_HW_ID register value into a compact CU identifier
// Bit positions are based on instinct-mi200-cdna2-instruction-set-architecture.pdf
uint32_t GetId(uint32_t hwId)
{
  uint32_t const seId = (hwId >> 13) & 3;  // Shader engine ID
  uint32_t const shId = (hwId >> 12) & 1;  // Shader array ID within the engine
  uint32_t const cuId = (hwId >>  8) & 7;  // Compute unit ID within the array
  // Pack the three fields into one index: bit 5 = SH, bits 4:2 = CU, bits 1:0 = SE
  return (shId << 5) | (cuId << 2) | seId;
}
void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{
......@@ -1239,7 +1313,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
{
currTransfer->perIterationTime.push_back(iterationTimeMs);
std::set<int> CUs;
for (int i = 0; i < currTransfer->numSubExecs; i++)
CUs.insert(GetId(currTransfer->subExecParamGpuPtr[i].hwId));
currTransfer->perIterationCUs.push_back(CUs);
}
}
exeInfo.totalTime += gpuDeltaMsec;
}
......@@ -1247,7 +1327,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
{
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
{
transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<int> CUs;
for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(GetId(transfer->subExecParamGpuPtr[i].hwId));
transfer->perIterationCUs.push_back(CUs);
}
}
}
}
......@@ -1341,6 +1427,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{
if (ev.p2pMode == 1 && isBidirectional == 1 ||
ev.p2pMode == 2 && isBidirectional == 0) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote",
......@@ -1372,7 +1461,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
printf("\n");
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
......@@ -1506,7 +1594,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const minBw = minBandwidth[dir][i];
......
......@@ -27,7 +27,7 @@ THE SOFTWARE.
#include <cuda_runtime.h>
// ROCm specific
#define __builtin_amdgcn_s_memrealtime clock64
#define wall_clock64 clock64
#define gcnArchName name
// Datatypes
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.26"
#define TB_VERSION "1.27"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -68,6 +68,7 @@ public:
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
int continueOnError; // Continue tests even after mismatch detected
int hideEnv; // Skip printing of environment variable values
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
......@@ -83,12 +84,14 @@ public:
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
std::vector<float> fillPattern; // Pattern of floats used to fill source data
std::vector<uint32_t> cuMask; // Bit-vector representing the CU mask
// Environment variables only for Benchmark-preset
int useRemoteRead; // Use destination memory type as executor instead of source memory type
int useDmaCopy; // Use DMA copy instead of GPU copy
int numGpuSubExecs; // Number of GPU subexecutors to use
// Environment variables only for P2P preset
int numCpuSubExecs; // Number of CPU subexecutors to use
int numGpuSubExecs; // Number of GPU subexecutors to use
int p2pMode; // Both = 0, Unidirectional = 1, Bidirectional = 2
int useDmaCopy; // Use DMA copy instead of GPU copy
int useRemoteRead; // Use destination memory type as executor instead of source memory type
// Environment variables only for Sweep-preset
int sweepMin; // Min number of simultaneous Transfers to be executed per test
......@@ -149,6 +152,7 @@ public:
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
......@@ -170,6 +174,7 @@ public:
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE);
p2pMode = GetEnvVar("P2P_MODE" , 0);
// Sweep related
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
......@@ -252,6 +257,49 @@ public:
}
else fillPattern.clear();
// Check for CU mask
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL)
{
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token)
{
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2)
{
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
}
else if (sscanf(token, "%d", &start) == 1)
{
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
}
else
{
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(maxCU / 32 + 1, 0);
for (auto range : ranges)
{
for (int i = range.first; i <= range.second; i++)
{
cuMask[i / 32] |= (1 << (i % 32));
}
}
#endif
}
// Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
......@@ -376,7 +424,9 @@ public:
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
printf(" CU_MASK - CU mask for streams specified in hex digits (0-0,a-f,A-F)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
......@@ -406,10 +456,11 @@ public:
{
printf("TransferBench v%s\n", TB_VERSION);
printf("=====================================================\n");
printf("[Common]\n");
if (!hideEnv) printf("[Common]\n");
}
else
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
if (hideEnv) return;
PRINT_EV("BLOCK_BYTES", blockBytes,
std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
......@@ -417,6 +468,8 @@ public:
std::string("Using byte offset of " + std::to_string(byteOffset)));
PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error"));
PRINT_EV("CU_MASK", getenv("CU_MASK") ? 1 : 0,
(cuMask.size() ? GetCuMaskDesc() : "All"));
PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
(fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
PRINT_EV("GPU_KERNEL", gpuKernel,
......@@ -451,6 +504,7 @@ public:
void DisplayP2PBenchmarkEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[P2P Related]\n");
......@@ -459,6 +513,10 @@ public:
std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors");
PRINT_EV("NUM_GPU_SE", numGpuSubExecs,
std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors");
PRINT_EV("P2P_MODE", p2pMode,
std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
p2pMode == 2 ? "Bidirectional" :
"Unidirectional + Bidirectional"));
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
......@@ -470,6 +528,7 @@ public:
void DisplaySweepEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Sweep Related]\n");
......@@ -512,6 +571,50 @@ public:
return getenv(varname.c_str());
return defaultValue;
}
// Returns a human-readable description of the CU mask, e.g. "CUs used: (7) 3-8,16"
// Scans the cuMask bit-vector and collapses consecutive set bits into closed ranges
std::string GetCuMaskDesc() const
{
  std::vector<std::pair<int, int>> runs;  // Closed [first,last] ranges of set bits
  bool inRun = false;                     // Currently inside a run of set bits
  std::pair<int, int> curr;
  int used = 0;                           // Total number of CUs selected

  for (int i = 0; i < cuMask.size(); i++)
  {
    for (int j = 0; j < 32; j++)
    {
      if (cuMask[i] & (1 << j))
      {
        used++;
        if (!inRun)
        {
          inRun = true;
          curr.first = i * 32 + j;
        }
      }
      else if (inRun)
      {
        // Run ended on the previous bit
        inRun = false;
        curr.second = i * 32 + j - 1;
        runs.push_back(curr);
      }
    }
  }
  // Bug fix: a run extending to the last bit of the mask was previously
  // closed but never pushed onto 'runs', so it was omitted from the output
  if (inRun)
  {
    curr.second = cuMask.size() * 32 - 1;
    runs.push_back(curr);
  }

  std::string result = "CUs used: (" + std::to_string(used) + ") ";
  for (int i = 0; i < runs.size(); i++)
  {
    if (i) result += ",";
    if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
    else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
  }
  return result;
}
};
#endif
......@@ -41,8 +41,18 @@ struct SubExecParam
float* dst[MAX_DSTS]; // Destination array pointers
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; // Hardware ID
};
// Macro for collecting HW_REG_HW_ID
// Records the hardware ID register of the executing wavefront into p.hwId so
// per-iteration CU usage can be reported (the value is later decoded by GetId)
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
// NOTE(review): on these targets the register is not read and hwId is forced
// to 0 — presumably HW_REG_HW_ID is unavailable or laid out differently
// (gfx11xx) and CUDA has no equivalent instruction; confirm per-target ISA
#define __trace_hwreg() \
p.hwId = 0
#else
// Read HW_REG_HW_ID into a scalar register and store it in the param block
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (p.hwId));
#endif
void CpuReduceKernel(SubExecParam const& p)
{
int const& numSrcs = p.numSrcs;
......@@ -211,6 +221,7 @@ GpuReduceKernel(SubExecParam* params)
{
p.startCycle = startCycle;
p.stopCycle = wall_clock64();
__trace_hwreg();
}
}
......
......@@ -81,7 +81,7 @@ char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
// Converts a single character (case-insensitive) into its MemType enum value
// Prints an error and exits if the character is not a recognized memory type
MemType inline CharToMemType(char const c)
{
  char const* val = strchr(MemTypeStr, toupper(c));
  // Check the pointer itself, not *val: strchr returns NULL when c is not
  // found, and dereferencing NULL is undefined behavior
  if (val) return (MemType)(val - MemTypeStr);
  printf("[ERROR] Unexpected memory type (%c)\n", c);
  exit(1);
}
......@@ -89,7 +89,7 @@ MemType inline CharToMemType(char const c)
// Converts a single character (case-insensitive) into its ExeType enum value
// Prints an error and exits if the character is not a recognized executor type
ExeType inline CharToExeType(char const c)
{
  char const* val = strchr(ExeTypeStr, toupper(c));
  // Check the pointer itself, not *val: strchr returns NULL when c is not
  // found, and dereferencing NULL is undefined behavior
  if (val) return (ExeType)(val - ExeTypeStr);
  printf("[ERROR] Unexpected executor type (%c)\n", c);
  exit(1);
}
......@@ -98,28 +98,29 @@ ExeType inline CharToExeType(char const c)
// then writes the summation to each of the specified destination memory location(s)
struct Transfer
{
int transferIndex; // Transfer identifier (within a Test)
ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default)
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indice
std::vector<float*> srcMem; // Source memory
int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory type
std::vector<int> dstIndex; // Destination device index
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<double> perIterationTime; // Per-iteration timing
int transferIndex; // Transfer identifier (within a Test)
ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default)
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indice
std::vector<float*> srcMem; // Source memory
int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory type
std::vector<int> dstIndex; // Destination device index
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<int>> perIterationCUs; // Per-iteration CU usage
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment