impr: Library/Client build organization

Change code organization and build options Code changes related to the following: * Build files * Options to build client, shared, and static libraries * Source code directories * Modern C++20 changes * Based on TB 1.6.4 * Formatting Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>

impr: Library/Client build organization
Change code organization and build options Code changes related to the following: * Build files * Options to build client, shared, and static libraries * Source code directories * Modern C++20 changes * Based on TB 1.6.4 * Formatting Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
09f4f11b · Oliveira, Daniel · 2d0ecaae · 2d0ecaae · 2d0ecaae · 2d0ecaae
Commit 09f4f11b authored Jul 15, 2025 by Oliveira, Daniel
7 changed files
--- a/src/client/Presets/OneToAll.hpp
+++ b/src/client/Presets/OneToAll.hpp
-/*
-Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-void OneToAllPreset(EnvVars&           ev,
-                    size_t      const  numBytesPerTransfer,
-                    std::string const  presetName)
-{
-  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
-  if (numDetectedGpus < 2) {
-    printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
-    exit(1);
-  }
-  // Collect env vars for this preset
-  int         numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
-  int         numSubExecs   = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
-  int         exeIndex      = EnvVars::GetEnvVar("EXE_INDEX",   0);
-  int         sweepDir      = EnvVars::GetEnvVar("SWEEP_DIR",  0);
-  std::string sweepDst      = EnvVars::GetEnvVar("SWEEP_DST",  "G");
-  std::string sweepExe      = EnvVars::GetEnvVar("SWEEP_EXE",  "G");
-  std::string sweepSrc      = EnvVars::GetEnvVar("SWEEP_SRC",  "G");
-  int         sweepMin      = EnvVars::GetEnvVar("SWEEP_MIN",  1);
-  int         sweepMax      = EnvVars::GetEnvVar("SWEEP_MAX",  numGpuDevices);
-  // Display environment variables
-  ev.DisplayEnvVars();
-  if (!ev.hideEnv) {
-    if (!ev.outputToCsv) printf("[One-To-All Related]\n");
-    ev.Print("NUM_GPU_DEVICES", numGpuDevices,    "Using %d GPUs", numGpuDevices);
-    ev.Print("NUM_GPU_SE",      numSubExecs,      "Using %d subExecutors/CUs per Transfer", numSubExecs);
-    ev.Print("EXE_INDEX",       exeIndex,         "Executing on GPU %d", exeIndex);
-    ev.Print("SWEEP_DIR",       sweepDir,         "Direction of transfer");
-    ev.Print("SWEEP_DST",       sweepDst.c_str(), "DST memory types to sweep");
-    ev.Print("SWEEP_EXE",       sweepExe.c_str(), "Executor type to use");
-    ev.Print("SWEEP_MAX",       sweepMax,         "Maximum number of peers");
-    ev.Print("SWEEP_MIN",       sweepMin,         "Minimum number of peers");
-    ev.Print("SWEEP_SRC",       sweepSrc.c_str(), "SRC memory types to sweep");
-    printf("\n");
-  }
-  // Perform validation
-  for (auto ch : sweepExe) {
-    if (ch != 'G' && ch != 'D') {
-      printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
-      exit(1);
-    }
-  }
-  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
-  TransferBench::TestResults results;
-  char const sep = (ev.outputToCsv ? ',' : ' ');
-  for (char src : sweepSrc) for (char exe : sweepExe) for (char dst : sweepDst) {
-    // Skip invalid configurations
-    if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) continue;
-    printf("Executing (%c%s -> %c%d -> %c%s)\n",
-           src, src == 'N' ? "" : (sweepDir == 0 ? std::to_string(exeIndex).c_str() : std::string("*").c_str()),
-           exe, exeIndex,
-           dst, dst == 'N' ? "" : sweepDir == 0 ? std::string("*").c_str() : std::to_string(exeIndex).c_str());
-    for (int i = 0; i < numGpuDevices; i++) {
-      if (i == exeIndex) continue;
-      printf("   GPU %-3d  %c", i, sep);
-    }
-    printf("\n");
-    if (!ev.outputToCsv) {
-      for (int i = 0; i < numGpuDevices-1; i++)
-        printf("-------------");
-      printf("\n");
-    }
-    for (int p = sweepMin; p <= sweepMax; p++) {
-      for (int bitmask = 0; bitmask < (1<<numGpuDevices); bitmask++) {
-        if (bitmask & (1<<exeIndex) || __builtin_popcount(bitmask) != p) continue;
-        std::vector<Transfer> transfers;
-        for (int i = 0; i < numGpuDevices; i++) {
-          if (bitmask & (1<<i)) {
-            Transfer t;
-            CheckForError(TransferBench::CharToExeType(exe, t.exeDevice.exeType));
-            t.exeDevice.exeIndex = exeIndex;
-            t.exeSubIndex = -1;
-            t.numSubExecs = numSubExecs;
-            t.numBytes    = numBytesPerTransfer;
-            if (src == 'N') {
-              t.srcs.clear();
-            } else {
-              t.srcs.resize(1);
-              CheckForError(TransferBench::CharToMemType(src, t.srcs[0].memType));
-              t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
-            }
-            if (dst == 'N') {
-              t.dsts.clear();
-            } else {
-              t.dsts.resize(1);
-              CheckForError(TransferBench::CharToMemType(dst, t.dsts[0].memType));
-              t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
-            }
-            transfers.push_back(t);
-          }
-        }
-        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-          PrintErrors(results.errResults);
-          exit(1);
-        }
-        int counter = 0;
-        for (int i = 0; i < numGpuDevices; i++) {
-          if (bitmask & (1<<i))
-            printf("  %8.3f  %c", results.tfrResults[counter++].avgBandwidthGbPerSec, sep);
-          else if (i != exeIndex)
-            printf("            %c", sep);
-        }
-        printf(" %d %d", p, numSubExecs);
-        for (auto i = 0; i < transfers.size(); i++) {
-          printf(" (%s %c%d %s)",
-                 MemDevicesToStr(transfers[i].srcs).c_str(),
-                 ExeTypeStr[transfers[i].exeDevice.exeType], transfers[i].exeDevice.exeIndex,
-                 MemDevicesToStr(transfers[i].dsts).c_str());
-        }
-        printf("\n");
-      }
-    }
-  }
-}
--- a/src/client/Presets/PeerToPeer.hpp
+++ b/src/client/Presets/PeerToPeer.hpp
-/*
-Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-void PeerToPeerPreset(EnvVars&           ev,
-                      size_t      const  numBytesPerTransfer,
-                      std::string const  presetName)
-{
-  int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
-  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
-  // Collect env vars for this preset
-  int useDmaCopy     = EnvVars::GetEnvVar("USE_GPU_DMA",     0);
-  int numCpuDevices  = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
-  int numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE",      4);
-  int numGpuDevices  = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
-  int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE",      useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
-  int p2pMode        = EnvVars::GetEnvVar("P2P_MODE",        0);
-  int useFineGrain   = EnvVars::GetEnvVar("USE_FINE_GRAIN",  0);
-  int useRemoteRead  = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
-  // Display environment variables
-  ev.DisplayEnvVars();
-  if (!ev.hideEnv) {
-    int outputToCsv = ev.outputToCsv;
-    if (!outputToCsv) printf("[P2P Related]\n");
-    ev.Print("NUM_CPU_DEVICES", numCpuDevices,  "Using %d CPUs", numCpuDevices);
-    ev.Print("NUM_CPU_SE",      numCpuSubExecs, "Using %d CPU threads per Transfer", numCpuSubExecs);
-    ev.Print("NUM_GPU_DEVICES", numGpuDevices,  "Using %d GPUs", numGpuDevices);
-    ev.Print("NUM_GPU_SE",      numGpuSubExecs, "Using %d GPU subexecutors/CUs per Transfer", numGpuSubExecs);
-    ev.Print("P2P_MODE",        p2pMode,        "Running %s transfers", p2pMode == 0 ? "Uni + Bi" :
-                                                                        p2pMode == 1 ? "Unidirectional"
-                                                                                     : "Bidirectional");
-    ev.Print("USE_FINE_GRAIN",  useFineGrain,   "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
-    ev.Print("USE_GPU_DMA",     useDmaCopy,     "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
-    ev.Print("USE_REMOTE_READ", useRemoteRead,  "Using %s as executor", useRemoteRead ? "DST" : "SRC");
-    printf("\n");
-  }
-  char const separator = ev.outputToCsv ? ',' : ' ';
-  printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
-  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
-  TransferBench::TestResults results;
-  // Collect the number of available CPUs/GPUs on this machine
-  int const numDevices = numCpuDevices + numGpuDevices;
-  // Perform unidirectional / bidirectional
-  for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) {
-    if (p2pMode == 1 && isBidirectional == 1 ||
-        p2pMode == 2 && isBidirectional == 0) continue;
-    printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
-           useRemoteRead ? "Remote" : "Local",
-           useRemoteRead ? "Local" : "Remote",
-           useDmaCopy    ? "DMA"   : "GFX");
-    // Print header
-    if (isBidirectional) {
-      printf("%12s", "SRC\\DST");
-    } else {
-      if (useRemoteRead)
-        printf("%12s", "SRC\\EXE+DST");
-      else
-        printf("%12s", "SRC+EXE\\DST");
-    }
-    if (ev.outputToCsv) printf(",");
-    for (int i = 0; i < numCpuDevices; i++) {
-      printf("%7s %02d", "CPU", i);
-      if (ev.outputToCsv) printf(",");
-    }
-    if (numCpuDevices > 0) printf("   ");
-    for (int i = 0; i < numGpuDevices; i++) {
-      printf("%7s %02d", "GPU", i);
-      if (ev.outputToCsv) printf(",");
-    }
-    printf("\n");
-    double avgBwSum[2][2] = {};
-    int    avgCount[2][2] = {};
-    ExeType const gpuExeType = useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
-    // Loop over all possible src/dst pairs
-    for (int src = 0; src < numDevices; src++) {
-      MemType const srcType  = (src < numCpuDevices ? MEM_CPU : MEM_GPU);
-      int     const srcIndex = (srcType == MEM_CPU ? src : src - numCpuDevices);
-      MemType const srcTypeActual = ((useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
-                                     (useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
-                                                                               srcType);
-      std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
-      std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
-      std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
-      std::vector<std::vector<double>> stdDev(isBidirectional + 1);
-      if (src == numCpuDevices && src != 0) printf("\n");
-      for (int dst = 0; dst < numDevices; dst++) {
-        MemType const dstType  = (dst < numCpuDevices ? MEM_CPU : MEM_GPU);
-        int     const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpuDevices);
-        MemType const dstTypeActual = ((useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
-                                       (useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
-                                                                                 dstType);
-        // Prepare Transfers
-        std::vector<Transfer> transfers(isBidirectional + 1);
-        // SRC -> DST
-        transfers[0].numBytes = numBytesPerTransfer;
-        transfers[0].srcs.push_back({srcTypeActual, srcIndex});
-        transfers[0].dsts.push_back({dstTypeActual, dstIndex});
-        transfers[0].exeDevice = {IsGpuMemType(useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU,
-                                  (useRemoteRead ? dstIndex : srcIndex)};
-        transfers[0].exeSubIndex = -1;
-        transfers[0].numSubExecs = (transfers[0].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
-        // DST -> SRC
-        if (isBidirectional) {
-          transfers[1].numBytes = numBytesPerTransfer;
-          transfers[1].srcs.push_back({dstTypeActual, dstIndex});
-          transfers[1].dsts.push_back({srcTypeActual, srcIndex});
-          transfers[1].exeDevice = {IsGpuMemType(useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU,
-                                    (useRemoteRead ? srcIndex : dstIndex)};
-          transfers[1].exeSubIndex = -1;
-          transfers[1].numSubExecs = (transfers[1].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
-        }
-        bool skipTest = false;
-        // Abort if executing on NUMA node with no CPUs
-        for (int i = 0; i <= isBidirectional; i++) {
-          if (transfers[i].exeDevice.exeType == EXE_CPU &&
-              TransferBench::GetNumSubExecutors(transfers[i].exeDevice) == 0) {
-            skipTest = true;
-            break;
-          }
-#if defined(__NVCC__)
-          // NVIDIA platform cannot access GPU memory directly from CPU executors
-          if (transfers[i].exeDevice.exeType == EXE_CPU && (IsGpuMemType(srcType) || IsGpuMemType(dstType))) {
-            skipTest = true;
-            break;
-          }
-#endif
-        }
-        if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
-        if (!skipTest) {
-          if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-            for (auto const& err : results.errResults)
-              printf("%s\n", err.errMsg.c_str());
-            exit(1);
-          }
-          for (int dir = 0; dir <= isBidirectional; dir++) {
-            double const avgBw   = results.tfrResults[dir].avgBandwidthGbPerSec;
-            avgBandwidth[dir].push_back(avgBw);
-            if (!(srcType == dstType && srcIndex == dstIndex)) {
-              avgBwSum[srcType][dstType] += avgBw;
-              avgCount[srcType][dstType]++;
-            }
-            if (ev.showIterations) {
-              double minTime = results.tfrResults[dir].perIterMsec[0];
-              double maxTime = minTime;
-              double varSum  = 0;
-              for (int i = 0; i < results.tfrResults[dir].perIterMsec.size(); i++) {
-                minTime = std::min(minTime, results.tfrResults[dir].perIterMsec[i]);
-                maxTime = std::max(maxTime, results.tfrResults[dir].perIterMsec[i]);
-                double const bw  = (transfers[dir].numBytes / 1.0E9) / results.tfrResults[dir].perIterMsec[i] * 1000.0f;
-                double const delta = (avgBw - bw);
-                varSum += delta * delta;
-              }
-              double const minBw = (transfers[dir].numBytes / 1.0E9) / maxTime * 1000.0f;
-              double const maxBw = (transfers[dir].numBytes / 1.0E9) / minTime * 1000.0f;
-              double const stdev = sqrt(varSum / results.tfrResults[dir].perIterMsec.size());
-              minBandwidth[dir].push_back(minBw);
-              maxBandwidth[dir].push_back(maxBw);
-              stdDev[dir].push_back(stdev);
-            }
-          }
-        } else {
-          for (int dir = 0; dir <= isBidirectional; dir++) {
-            avgBandwidth[dir].push_back(0);
-            minBandwidth[dir].push_back(0);
-            maxBandwidth[dir].push_back(0);
-            stdDev[dir].push_back(-1.0);
-          }
-        }
-      }
-      for (int dir = 0; dir <= isBidirectional; dir++) {
-        printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
-        if (ev.outputToCsv) printf(",");
-        for (int dst = 0; dst < numDevices; dst++) {
-          if (dst == numCpuDevices && dst != 0) printf("   ");
-          double const avgBw = avgBandwidth[dir][dst];
-          if (avgBw == 0.0)
-            printf("%10s", "N/A");
-          else
-            printf("%10.2f", avgBw);
-          if (ev.outputToCsv) printf(",");
-        }
-        printf("\n");
-        if (ev.showIterations) {
-          // minBw
-          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
-          if (ev.outputToCsv) printf(",");
-          for (int i = 0; i < numDevices; i++) {
-            double const minBw = minBandwidth[dir][i];
-            if (i == numCpuDevices && i != 0) printf("   ");
-            if (minBw == 0.0)
-              printf("%10s", "N/A");
-            else
-              printf("%10.2f", minBw);
-            if (ev.outputToCsv) printf(",");
-          }
-          printf("\n");
-          // maxBw
-          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
-          if (ev.outputToCsv) printf(",");
-          for (int i = 0; i < numDevices; i++) {
-            double const maxBw = maxBandwidth[dir][i];
-            if (i == numCpuDevices && i != 0) printf("   ");
-            if (maxBw == 0.0)
-              printf("%10s", "N/A");
-            else
-              printf("%10.2f", maxBw);
-            if (ev.outputToCsv) printf(",");
-          }
-          printf("\n");
-          // stddev
-          printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
-          if (ev.outputToCsv) printf(",");
-          for (int i = 0; i < numDevices; i++) {
-            double const sd = stdDev[dir][i];
-            if (i == numCpuDevices && i != 0) printf("   ");
-            if (sd == -1.0)
-              printf("%10s", "N/A");
-            else
-              printf("%10.2f", sd);
-            if (ev.outputToCsv) printf(",");
-          }
-          printf("\n");
-        }
-        fflush(stdout);
-      }
-      if (isBidirectional) {
-        printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
-        if (ev.outputToCsv) printf(",");
-        for (int dst = 0; dst < numDevices; dst++) {
-          double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
-          if (dst == numCpuDevices && dst != 0) printf("   ");
-          if (sumBw == 0.0)
-            printf("%10s", "N/A");
-          else
-            printf("%10.2f", sumBw);
-          if (ev.outputToCsv) printf(",");
-        }
-        printf("\n");
-        if (src < numDevices - 1) printf("\n");
-      }
-    }
-    if (!ev.outputToCsv) {
-      printf("                         ");
-      for (int srcType : {MEM_CPU, MEM_GPU})
-        for (int dstType : {MEM_CPU, MEM_GPU})
-          printf("  %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
-      printf("\n");
-      printf("Averages (During %s):",  isBidirectional ? " BiDir" : "UniDir");
-      for (int srcType : {MEM_CPU, MEM_GPU})
-        for (int dstType : {MEM_CPU, MEM_GPU}) {
-          if (avgCount[srcType][dstType])
-            printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
-          else
-            printf("%10s", "N/A");
-        }
-      printf("\n\n");
-    }
-  }
-}
--- a/src/client/Presets/Scaling.hpp
+++ b/src/client/Presets/Scaling.hpp
-/*
-Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-void ScalingPreset(EnvVars&           ev,
-                   size_t      const  numBytesPerTransfer,
-                   std::string const  presetName)
-{
-  int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
-  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
-  // Collect env vars for this preset
-  int localIdx      = EnvVars::GetEnvVar("LOCAL_IDX",       0);
-  int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
-  int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
-  int sweepMax      = EnvVars::GetEnvVar("SWEEP_MAX",       32);
-  int sweepMin      = EnvVars::GetEnvVar("SWEEP_MIN",       1);
-  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN",  0);
-  // Display environment variables
-  ev.DisplayEnvVars();
-  if (!ev.hideEnv) {
-    int outputToCsv = ev.outputToCsv;
-    if (!outputToCsv) printf("[Schmoo Related]\n");
-    ev.Print("LOCAL_IDX",      localIdx,     "Local GPU index");
-    ev.Print("SWEEP_MAX",      sweepMax,     "Max number of subExecutors to use");
-    ev.Print("SWEEP_MIN",      sweepMin,     "Min number of subExecutors to use");
-    printf("\n");
-  }
-  // Validate env vars
-  if (localIdx >= numDetectedGpus) {
-    printf("[ERROR] Cannot execute scaling test with local GPU device %d\n", localIdx);
-    exit(1);
-  }
-  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
-  TransferBench::TestResults results;
-  char separator = (ev.outputToCsv ? ',' : ' ');
-  int numDevices = numCpuDevices + numGpuDevices;
-  printf("GPU-GFX Scaling benchmark:\n");
-  printf("==========================\n");
-  printf("- Copying %lu bytes from GPU %d to other devices\n", numBytesPerTransfer, localIdx);
-  printf("- All numbers reported as GB/sec\n\n");
-  printf("NumCUs");
-  for (int i = 0; i < numDevices; i++)
-    printf("%c  %s%02d     ", separator, i < numCpuDevices ? "CPU" : "GPU", i < numCpuDevices ? i : i - numCpuDevices);
-  printf("\n");
-  std::vector<std::pair<double, int>> bestResult(numDevices);
-  std::vector<Transfer> transfers(1);
-  Transfer& t   = transfers[0];
-  t.exeDevice   = {EXE_GPU_GFX, localIdx};
-  t.exeSubIndex = -1;
-  t.numBytes    = numBytesPerTransfer;
-  t.srcs        = {{MEM_GPU, localIdx}};
-  for (int numSubExec = sweepMin; numSubExec <= sweepMax; numSubExec++) {
-    t.numSubExecs = numSubExec;
-    printf("%4d  ", numSubExec);
-    for (int i = 0; i < numDevices; i++) {
-      t.dsts = {{i < numCpuDevices ? MEM_CPU : MEM_GPU,
-                 i < numCpuDevices ? i : i - numCpuDevices}};
-      if (!RunTransfers(cfg, transfers, results)) {
-        PrintErrors(results.errResults);
-        exit(1);
-      }
-      double bw = results.tfrResults[0].avgBandwidthGbPerSec;
-      printf("%c%7.2f     ", separator, bw);
-      if (bw > bestResult[i].first) {
-        bestResult[i].first  = bw;
-        bestResult[i].second = numSubExec;
-      }
-    }
-    printf("\n");
-  }
-  printf(" Best ");
-  for (int i = 0; i < numDevices; i++)
-    printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
-  printf("\n");
-}
--- a/src/client/Presets/Schmoo.hpp
+++ b/src/client/Presets/Schmoo.hpp
-/*
-Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-void SchmooPreset(EnvVars&           ev,
-                  size_t      const  numBytesPerTransfer,
-                  std::string const  presetName)
-{
-  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
-  if (numDetectedGpus < 2) {
-    printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
-    exit(1);
-  }
-  // Collect env vars for this preset
-  int localIdx     = EnvVars::GetEnvVar("LOCAL_IDX",      0);
-  int remoteIdx    = EnvVars::GetEnvVar("REMOTE_IDX",     1);
-  int sweepMax     = EnvVars::GetEnvVar("SWEEP_MAX",      32);
-  int sweepMin     = EnvVars::GetEnvVar("SWEEP_MIN",      1);
-  int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
-  // Display environment variables
-  ev.DisplayEnvVars();
-  if (!ev.hideEnv) {
-    int outputToCsv = ev.outputToCsv;
-    if (!outputToCsv) printf("[Schmoo Related]\n");
-    ev.Print("LOCAL_IDX",      localIdx,     "Local GPU index");
-    ev.Print("REMOTE_IDX",     remoteIdx,    "Remote GPU index");
-    ev.Print("SWEEP_MAX",      sweepMax,     "Max number of subExecutors to use");
-    ev.Print("SWEEP_MIN",      sweepMin,     "Min number of subExecutors to use");
-    ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
-    printf("\n");
-  }
-  // Validate env vars
-  if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
-    printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
-    exit(1);
-  }
-  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
-  TransferBench::TestResults results;
-  char memChar = useFineGrain ? 'F' : 'G';
-  printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
-  printf("       | Local Read  | Local Write | Local Copy  | Remote Read | Remote Write| Remote Copy |\n");
-  printf("  #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
-         memChar, localIdx, localIdx,
-         localIdx, memChar, localIdx,
-         memChar, localIdx, localIdx, memChar, localIdx,
-         memChar, remoteIdx, localIdx,
-         localIdx, memChar, remoteIdx,
-         memChar, localIdx, localIdx, memChar, remoteIdx);
-  printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
-  std::vector<Transfer> transfers(1);
-  Transfer& t   = transfers[0];
-  t.exeDevice   = {EXE_GPU_GFX, localIdx};
-  t.exeSubIndex = -1;
-  t.numBytes    = numBytesPerTransfer;
-  MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
-  for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) {
-    t.numSubExecs = numCUs;
-    // Local Read
-    t.srcs = {{memType, localIdx}};
-    t.dsts = {};
-    if (!RunTransfers(cfg, transfers, results)) {
-      PrintErrors(results.errResults);
-      exit(1);
-    }
-    double const localRead = results.tfrResults[0].avgBandwidthGbPerSec;
-    // Local Write
-    t.srcs = {};
-    t.dsts = {{memType, localIdx}};
-    if (!RunTransfers(cfg, transfers, results)) {
-      PrintErrors(results.errResults);
-      exit(1);
-    }
-    double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec;
-    // Local Copy
-    t.srcs = {{memType, localIdx}};
-    t.dsts = {{memType, localIdx}};
-    t.srcs = {};
-    t.dsts = {{memType, localIdx}};
-    if (!RunTransfers(cfg, transfers, results)) {
-      PrintErrors(results.errResults);
-      exit(1);
-    }
-    double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec;
-    // Remote Read
-    t.srcs = {{memType, remoteIdx}};
-    t.dsts = {};
-    if (!RunTransfers(cfg, transfers, results)) {
-      PrintErrors(results.errResults);
-      exit(1);
-    }
-    double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec;
-    // Remote Write
-    t.srcs = {};
-    t.dsts = {{memType, remoteIdx}};
-    if (!RunTransfers(cfg, transfers, results)) {
-      PrintErrors(results.errResults);
-      exit(1);
-    }
-    double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec;
-    // Remote Copy
-    t.srcs = {{memType, localIdx}};
-    t.dsts = {{memType, remoteIdx}};
-    if (!RunTransfers(cfg, transfers, results)) {
-      PrintErrors(results.errResults);
-      exit(1);
-    }
-    double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec;
-    printf("   %3d   %11.3f   %11.3f   %11.3f   %11.3f   %11.3f   %11.3f  \n",
-           numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
-  }
-}
--- a/src/client/Presets/Sweep.hpp
+++ b/src/client/Presets/Sweep.hpp
-/*
-Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
-{
-  if (fp) {
-    fprintf(fp, "# Test %d\n", testNum);
-    fprintf(fp, "%d", -1 * (int)transfers.size());
-    for (auto const& transfer : transfers)
-    {
-      fprintf(fp, " (%s->%c%d->%s %d %lu)",
-              MemDevicesToStr(transfer.srcs).c_str(),
-              ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex,
-              MemDevicesToStr(transfer.dsts).c_str(),
-              transfer.numSubExecs,
-              transfer.numBytes);
-    }
-    fprintf(fp, "\n");
-    fflush(fp);
-  }
-}
-void SweepPreset(EnvVars&           ev,
-                 size_t      const  numBytesPerTransfer,
-                 std::string const  presetName)
-{
-  bool const isRandom = (presetName == "rsweep");
-  int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
-  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
-  // Collect env vars and set defaults
-  int         continueOnErr  = EnvVars::GetEnvVar("CONTINUE_ON_ERROR"   , 0);
-  int         numCpuDevices  = EnvVars::GetEnvVar("NUM_CPU_DEVICES"     , numDetectedCpus);
-  int         numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE"          , 4);
-  int         numGpuDevices  = EnvVars::GetEnvVar("NUM_GPU_DEVICES"     , numDetectedGpus);
-  int         numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE"          , 4);
-  std::string sweepDst       = EnvVars::GetEnvVar("SWEEP_DST"           , "CG");
-  std::string sweepExe       = EnvVars::GetEnvVar("SWEEP_EXE"           , "CDG");
-  std::string sweepFile      = EnvVars::GetEnvVar("SWEEP_FILE"          , "/tmp/lastSweep.cfg");
-  int         sweepMax       = EnvVars::GetEnvVar("SWEEP_MAX"           , 24);
-  int         sweepMin       = EnvVars::GetEnvVar("SWEEP_MIN"           , 1);
-  int         sweepRandBytes = EnvVars::GetEnvVar("SWEEP_RAND_BYTES"    , 0);
-  int         sweepSeed      = EnvVars::GetEnvVar("SWEEP_SEED"          , time(NULL));
-  std::string sweepSrc       = EnvVars::GetEnvVar("SWEEP_SRC"           , "CG");
-  int         sweepTestLimit = EnvVars::GetEnvVar("SWEEP_TEST_LIMIT"    , 0);
-  int         sweepTimeLimit = EnvVars::GetEnvVar("SWEEP_TIME_LIMIT"    , 0);
-  int         sweepXgmiMin   = EnvVars::GetEnvVar("SWEEP_XGMI_MIN"      , 0);
-  int         sweepXgmiMax   = EnvVars::GetEnvVar("SWEEP_XGMI_MAX"      , -1);
-  auto generator = new std::default_random_engine(sweepSeed);
-  // Display env var settings
-  ev.DisplayEnvVars();
-  if (!ev.hideEnv) {
-    int outputToCsv = ev.outputToCsv;
-    if (!outputToCsv) printf("[Sweep Related]\n");
-    ev.Print("CONTINUE_ON_ERROR", continueOnErr,    continueOnErr ? "Continue on mismatch error" : "Stop after first error");
-    ev.Print("NUM_CPU_DEVICES",   numCpuDevices,    "Using %d CPUs", numCpuDevices);
-    ev.Print("NUM_CPU_SE",        numCpuSubExecs,   "Using %d CPU threads per CPU executed Transfer", numCpuSubExecs);
-    ev.Print("NUM_GPU_DEVICES",   numGpuDevices,    "Using %d GPUs", numGpuDevices);
-    ev.Print("NUM_GPU_SE",        numGpuSubExecs,   "Using %d subExecutors/CUs per GPU executed Transfer", numGpuSubExecs);
-    ev.Print("SWEEP_DST",         sweepDst.c_str(), "Destination Memory Types to sweep");
-    ev.Print("SWEEP_EXE",         sweepExe.c_str(), "Executor Types to sweep");
-    ev.Print("SWEEP_FILE",        sweepFile.c_str(),"File to store the executing sweep configuration");
-    ev.Print("SWEEP_MAX",         sweepMax,         "Max simultaneous transfers (0 = no limit)");
-    ev.Print("SWEEP_MIN",         sweepMin,         "Min simultaenous transfers");
-    ev.Print("SWEEP_RAND_BYTES",  sweepRandBytes,   "Using %s number of bytes per Transfer", (sweepRandBytes ? "random" : "constant"));
-    ev.Print("SWEEP_SEED",        sweepSeed,        "Random seed set to %d", sweepSeed);
-    ev.Print("SWEEP_SRC",         sweepSrc.c_str(), "Source Memory Types to sweep");
-    ev.Print("SWEEP_TEST_LIMIT",  sweepTestLimit,   "Max number of tests to run during sweep (0 = no limit)");
-    ev.Print("SWEEP_TIME_LIMIT",  sweepTimeLimit,   "Max number of seconds to run sweep for  (0 = no limit)");
-    ev.Print("SWEEP_XGMI_MAX",    sweepXgmiMax,     "Max number of XGMI hops for Transfers  (-1 = no limit)");
-    ev.Print("SWEEP_XGMI_MIN",    sweepXgmiMin,     "Min number of XGMI hops for Transfers");
-    printf("\n");
-  }
-  // Validate env vars
-  for (auto ch : sweepSrc) {
-    if (!strchr(MemTypeStr, ch)) {
-      printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
-      exit(1);
-    }
-    if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) {
-      printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
-      exit(1);
-    }
-  }
-  for (auto ch : sweepDst) {
-    if (!strchr(MemTypeStr, ch)) {
-      printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
-      exit(1);
-    }
-    if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) {
-      printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
-      exit(1);
-    }
-  }
-  for (auto ch : sweepExe) {
-    if (!strchr(ExeTypeStr, ch)) {
-      printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
-      exit(1);
-    }
-    if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) {
-      printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
-      exit(1);
-    }
-  }
-  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
-  TransferBench::TestResults results;
-  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
-  std::vector<ExeDevice> exeList;
-  for (auto exe : sweepExe) {
-    ExeType exeType;
-    CharToExeType(exe, exeType);
-    if (IsGpuExeType(exeType)) {
-      for (int exeIndex = 0; exeIndex < numGpuDevices; ++exeIndex)
-        exeList.push_back({exeType, exeIndex});
-    }
-    else if (IsCpuExeType(exeType)) {
-      for (int exeIndex = 0; exeIndex < numCpuDevices; ++exeIndex) {
-        // Skip NUMA nodes that have no CPUs (e.g. CXL)
-        if (TransferBench::GetNumSubExecutors({EXE_CPU, exeIndex}) == 0) continue;
-        exeList.push_back({exeType, exeIndex});
-      }
-    }
-  }
-  int numExes = exeList.size();
-  std::vector<MemDevice> srcList;
-  for (auto src : sweepSrc) {
-    MemType srcType;
-    CharToMemType(src, srcType);
-    int const numDevices = (srcType == MEM_NULL) ? 1 : IsGpuMemType(srcType) ? numGpuDevices : numCpuDevices;
-    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
-      srcList.push_back({srcType, srcIndex});
-  }
-  int numSrcs = srcList.size();
-  std::vector<MemDevice> dstList;
-  for (auto dst : sweepDst) {
-    MemType dstType;
-    CharToMemType(dst, dstType);
-    int const numDevices = (dstType == MEM_NULL) ? 1 : IsGpuMemType(dstType) ? numGpuDevices : numCpuDevices;
-    for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
-      dstList.push_back({dstType, dstIndex});
-  }
-  int numDsts = dstList.size();
-  // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
-  struct TransferInfo
-  {
-    MemDevice srcMem;
-    ExeDevice exeDevice;
-    MemDevice dstMem;
-  };
-  // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI
-  bool const useXgmiOnly = (sweepXgmiMin > 0 || sweepXgmiMax > 0);
-  std::vector<TransferInfo> possibleTransfers;
-  TransferInfo tinfo;
-  for (int i = 0; i < numExes; ++i) {
-    // Skip CPU executors if XGMI link must be used
-    if (useXgmiOnly && !IsGpuExeType(exeList[i].exeType)) continue;
-    tinfo.exeDevice = exeList[i];
-    bool isXgmiSrc  = false;
-    int  numHopsSrc = 0;
-    for (int j = 0; j < numSrcs; ++j) {
-      if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(srcList[j].memType)) {
-        if (exeList[i].exeIndex != srcList[j].memIndex) {
-#if defined(__NVCC__)
-          isXgmiSrc = false;
-#else
-          uint32_t exeToSrcLinkType, exeToSrcHopCount;
-          HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
-                                                srcList[j].memIndex,
-                                                &exeToSrcLinkType,
-                                                &exeToSrcHopCount));
-          isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
-          if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
-#endif
-        } else {
-          isXgmiSrc = true;
-          numHopsSrc = 0;
-        }
-        // Skip this SRC if it is not XGMI but only XGMI links may be used
-        if (useXgmiOnly && !isXgmiSrc) continue;
-        // Skip this SRC if XGMI distance is already past limit
-        if (sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > sweepXgmiMax) continue;
-      } else if (srcList[j].memType != MEM_NULL && useXgmiOnly) continue;
-      tinfo.srcMem = srcList[j];
-      bool isXgmiDst = false;
-      int  numHopsDst = 0;
-      for (int k = 0; k < numDsts; ++k) {
-        if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(dstList[k].memType)) {
-          if (exeList[i].exeIndex != dstList[k].memIndex) {
-#if defined(__NVCC__)
-            isXgmiSrc = false;
-#else
-            uint32_t exeToDstLinkType, exeToDstHopCount;
-            HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
-                                                  dstList[k].memIndex,
-                                                  &exeToDstLinkType,
-                                                  &exeToDstHopCount));
-            isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
-            if (isXgmiDst) numHopsDst = exeToDstHopCount;
-#endif
-          } else {
-            isXgmiDst = true;
-            numHopsDst = 0;
-          }
-        }
-        // Skip this DST if it is not XGMI but only XGMI links may be used
-        if (dstList[k].memType != MEM_NULL && useXgmiOnly && !isXgmiDst) continue;
-        // Skip this DST if total XGMI distance (SRC + DST) is less than min limit
-        if (sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < sweepXgmiMin)) continue;
-        // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
-        if (sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > sweepXgmiMax) continue;
-#if defined(__NVCC__)
-        // Skip CPU executors on GPU memory on NVIDIA platform
-        if (IsCpuExeType(exeList[i].exeType) && (IsGpuMemType(dstList[j].memType) || IsGpuMemType(dstList[k].memType)))
-          continue;
-#endif
-        tinfo.dstMem = dstList[k];
-        // Skip if there is no src and dst
-        if (tinfo.srcMem.memType == MEM_NULL && tinfo.dstMem.memType == MEM_NULL) continue;
-        possibleTransfers.push_back(tinfo);
-      }
-    }
-  }
-  int const numPossible = (int)possibleTransfers.size();
-  int maxParallelTransfers = (sweepMax == 0 ? numPossible : sweepMax);
-  if (sweepMin > numPossible) {
-    printf("No valid test configurations exist\n");
-    return;
-  }
-  if (ev.outputToCsv) {
-    printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
-           "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
-  }
-  int numTestsRun = 0;
-  int M = sweepMin;
-  std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
-  std::uniform_int_distribution<int> distribution(sweepMin, maxParallelTransfers);
-  // Log sweep to configuration file
-  char absPath[1024];
-  auto const res = realpath(sweepFile.c_str(), absPath);
-  FILE *fp = fopen(sweepFile.c_str(), "w");
-  if (!fp) {
-    printf("[WARN] Unable to open %s.  Skipping output of sweep configuration file\n", res ? absPath : sweepFile.c_str());
-  } else {
-    printf("Sweep configuration saved to: %s\n", res ? absPath : sweepFile.c_str());
-  }
-  // Create bitmask of numPossible triplets, of which M will be chosen
-  std::string bitmask(M, 1);  bitmask.resize(numPossible, 0);
-  auto cpuStart = std::chrono::high_resolution_clock::now();
-  while (1)  {
-    if (isRandom) {
-      // Pick random number of simultaneous transfers to execute
-      // NOTE: This currently skews distribution due to some #s having more possibilities than others
-      M = distribution(*generator);
-      // Generate a random bitmask
-      for (int i = 0; i < numPossible; i++)
-        bitmask[i] = (i < M) ? 1 : 0;
-      std::shuffle(bitmask.begin(), bitmask.end(), *generator);
-    }
-    // Convert bitmask to list of Transfers
-    std::vector<Transfer> transfers;
-    for (int value = 0; value < numPossible; ++value) {
-      if (bitmask[value]) {
-        // Convert integer value to (SRC->EXE->DST) triplet
-        Transfer transfer;
-        if (possibleTransfers[value].srcMem.memType != MEM_NULL)
-          transfer.srcs.push_back(possibleTransfers[value].srcMem);
-        transfer.exeDevice      = possibleTransfers[value].exeDevice;
-        if (possibleTransfers[value].dstMem.memType != MEM_NULL)
-          transfer.dsts.push_back(possibleTransfers[value].dstMem);
-        transfer.exeSubIndex    = -1;
-        transfer.numSubExecs    = IsGpuExeType(transfer.exeDevice.exeType) ? numGpuSubExecs : numCpuSubExecs;
-        transfer.numBytes       = sweepRandBytes ? randSize(*generator) * sizeof(float) : numBytesPerTransfer;
-        transfers.push_back(transfer);
-      }
-    }
-    LogTransfers(fp, ++numTestsRun, transfers);
-    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-      PrintErrors(results.errResults);
-      if (!continueOnErr) exit(1);
-    } else {
-      PrintResults(ev, numTestsRun, transfers, results);
-    }
-    // Check for test limit
-    if (numTestsRun == sweepTestLimit) {
-      printf("Sweep Test limit reached\n");
-      break;
-    }
-    // Check for time limit
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
-    if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) {
-      printf("Sweep Time limit exceeded\n");
-      break;
-    }
-    // Increment bitmask if not random sweep
-    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end())) {
-      M++;
-      // Check for completion
-      if (M > maxParallelTransfers) {
-        printf("Sweep complete\n");
-        break;
-      }
-      for (int i = 0; i < numPossible; i++)
-        bitmask[i] = (i < M) ? 1 : 0;
-    }
-  }
-  if (fp) fclose(fp);
-}
--- a/src/client/Topology.hpp
+++ b/src/client/Topology.hpp
-/*
-Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-#pragma once
-#include "TransferBench.hpp"
-static int RemappedCpuIndex(int origIdx)
-{
-  static std::vector<int> remappingCpu;
-  // Build CPU remapping on first use
-  // Skip numa nodes that are not configured
-  if (remappingCpu.empty()) {
-    for (int node = 0; node <= numa_max_node(); node++)
-      if (numa_bitmask_isbitset(numa_get_mems_allowed(), node))
-        remappingCpu.push_back(node);
-  }
-  return remappingCpu[origIdx];
-}
-static void PrintNicToGPUTopo(bool outputToCsv)
-{
-#ifdef NIC_EXEC_ENABLED
-  printf(" NIC | Device Name | Active | PCIe Bus ID  | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n");
-  if(!outputToCsv)
-    printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n");
-  int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
-  auto const& ibvDeviceList = GetIbvDeviceList();
-  for (int i = 0; i < ibvDeviceList.size(); i++) {
-    std::string closestGpusStr = "";
-    for (int j = 0; j < numGpus; j++) {
-      if (TransferBench::GetClosestNicToGpu(j) == i) {
-        if (closestGpusStr != "") closestGpusStr += ",";
-        closestGpusStr += std::to_string(j);
-      }
-    }
-    printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
-           i, ibvDeviceList[i].name.c_str(),
-           ibvDeviceList[i].hasActivePort ? "Yes" : "No",
-           ibvDeviceList[i].busId.c_str(),
-           ibvDeviceList[i].numaNode,
-           closestGpusStr.c_str(),
-           ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort?  std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
-           ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort?  ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
-          );
-  }
-  printf("\n");
-#endif
-}
-void DisplayTopology(bool outputToCsv)
-{
-  int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
-  int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
-  int numNics = TransferBench::GetNumExecutors(EXE_NIC);
-  char sep = (outputToCsv ? ',' : '|');
-  if (outputToCsv) {
-    printf("NumCpus,%d\n", numCpus);
-    printf("NumGpus,%d\n", numGpus);
-    printf("NumNics,%d\n", numNics);
-  } else {
-    printf("\nDetected Topology:\n");
-    printf("==================\n");
-    printf("  %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1);
-    printf("  %d GPU device(s)\n", numGpus);
-    printf("  %d Supported NIC device(s)\n", numNics);
-  }
-  // Print out detected CPU topology
-  printf("\n            %c", sep);
-  for (int j = 0; j < numCpus; j++)
-    printf("NUMA %02d%c", j, sep);
-  printf(" #Cpus %c Closest GPU(s)\n", sep);
-  if (!outputToCsv) {
-    printf("------------+");
-    for (int j = 0; j <= numCpus; j++)
-      printf("-------+");
-    printf("---------------\n");
-  }
-  for (int i = 0; i < numCpus; i++) {
-    int nodeI = RemappedCpuIndex(i);
-    printf("NUMA %02d (%02d)%c", i, nodeI, sep);
-    for (int j = 0; j < numCpus; j++) {
-      int nodeJ = RemappedCpuIndex(j);
-      int numaDist = numa_distance(nodeI, nodeJ);
-      printf(" %5d %c", numaDist, sep);
-    }
-    int numCpuCores = 0;
-    for (int j = 0; j < numa_num_configured_cpus(); j++)
-      if (numa_node_of_cpu(j) == nodeI) numCpuCores++;
-    printf(" %5d %c", numCpuCores, sep);
-    for (int j = 0; j < numGpus; j++) {
-      if (TransferBench::GetClosestCpuNumaToGpu(j) == nodeI) {
-        printf(" %d", j);
-      }
-    }
-    printf("\n");
-  }
-  printf("\n");
-  // Print out detected NIC topology
-  PrintNicToGPUTopo(outputToCsv);
-  // Print out detected GPU topology
-#if defined(__NVCC__)
-  for (int i = 0; i < numGpus; i++) {
-    hipDeviceProp_t prop;
-    HIP_CALL(hipGetDeviceProperties(&prop, i));
-    printf(" GPU %02d | %s\n", i, prop.name);
-  }
-  // No further topology detection done for NVIDIA platforms
-  return;
-#else
-  // Print headers
-  if (!outputToCsv) {
-    printf("        |");
-    for (int j = 0; j < numGpus; j++) {
-      hipDeviceProp_t prop;
-      HIP_CALL(hipGetDeviceProperties(&prop, j));
-      std::string fullName = prop.gcnArchName;
-      std::string archName = fullName.substr(0, fullName.find(':'));
-      printf(" %6s |", archName.c_str());
-    }
-    printf("\n");
-  }
-  printf("        %c", sep);
-  for (int j = 0; j < numGpus; j++)
-    printf(" GPU %02d %c", j, sep);
-  printf(" PCIe Bus ID  %c #CUs %c NUMA %c #DMA %c #XCC %c NIC\n", sep, sep, sep, sep, sep);
-  if (!outputToCsv) {
-    for (int j = 0; j <= numGpus; j++)
-      printf("--------+");
-    printf("--------------+------+------+------+------+------\n");
-  }
-  // Loop over each GPU device
-  for (int i = 0; i < numGpus; i++) {
-    printf(" GPU %02d %c", i, sep);
-    // Print off link information
-    for (int j = 0; j < numGpus; j++) {
-      if (i == j) {
-        printf("    N/A %c", sep);
-      } else {
-        uint32_t linkType, hopCount;
-        HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
-        printf(" %s-%d %c",
-               linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "  HT" :
-               linkType == HSA_AMD_LINK_INFO_TYPE_QPI            ? " QPI" :
-               linkType == HSA_AMD_LINK_INFO_TYPE_PCIE           ? "PCIE" :
-               linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND      ? "INFB" :
-               linkType == HSA_AMD_LINK_INFO_TYPE_XGMI           ? "XGMI" : "????",
-               hopCount, sep);
-      }
-    }
-    char pciBusId[20];
-    HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
-    printf(" %-11s %c %-4d %c %-4d %c %-4d %c %-4d %c %-4d\n",
-           pciBusId, sep,
-           TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}), sep,
-           TransferBench::GetClosestCpuNumaToGpu(i), sep,
-           TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}), sep,
-           TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}), sep,
-           TransferBench::GetClosestNicToGpu(i));
-  }
-#endif
-}
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
-/*
-Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-/// @cond
-#pragma once
-#include <algorithm>
-#include <cstring>
-#include <future>
-#include <map>
-#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
-#include <numaif.h>
-#include <random>
-#include <set>
-#include <sstream>
-#include <stdarg.h>
-#include <thread>
-#include <unistd.h>
-#include <vector>
-#ifdef NIC_EXEC_ENABLED
-#include <infiniband/verbs.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <arpa/inet.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <filesystem>
-#include <fstream>
-#endif
-#if defined(__NVCC__)
-#include <cuda_runtime.h>
-#else
-#include <hip/hip_ext.h>
-#include <hip/hip_runtime.h>
-#include <hsa/hsa.h>
-#include <hsa/hsa_ext_amd.h>
-#endif
-/// @endcond
-namespace TransferBench
-{
-  using std::map;
-  using std::pair;
-  using std::set;
-  using std::vector;
-  constexpr char VERSION[] = "1.62";
-  /**
-   * Enumeration of supported Executor types
-   *
-   * @note The Executor is the device used to perform a Transfer
-   */
-  enum ExeType
-  {
-    EXE_CPU          = 0,                       ///<  CPU executor              (subExecutor = CPU thread)
-    EXE_GPU_GFX      = 1,                       ///<  GPU kernel-based executor (subExecutor = threadblock/CU)
-    EXE_GPU_DMA      = 2,                       ///<  GPU SDMA executor         (subExecutor = not supported)
-    EXE_NIC          = 3,                       ///<  NIC RDMA executor         (subExecutor = queue pair)
-    EXE_NIC_NEAREST  = 4                        ///<  NIC RDMA nearest executor (subExecutor = queue pair)
-  };
-  char const ExeTypeStr[6] = "CGDIN";
-  inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; }
-  inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; }
-  inline bool IsNicExeType(ExeType e){ return e == EXE_NIC || e == EXE_NIC_NEAREST; }
-  /**
-   * A ExeDevice defines a specific Executor
-   */
-  struct ExeDevice
-  {
-    ExeType exeType;                            ///< Executor type
-    int32_t exeIndex;                           ///< Executor index
-    bool operator<(ExeDevice const& other) const {
-      return (exeType < other.exeType) || (exeType == other.exeType && exeIndex < other.exeIndex);
-    }
-  };
-  /**
-   * Enumeration of supported memory types
-   *
-   * @note These are possible types of memory to be used as sources/destinations
-   */
-  enum MemType
-  {
-    MEM_CPU          = 0,                       ///< Coarse-grained pinned CPU memory
-    MEM_GPU          = 1,                       ///< Coarse-grained global GPU memory
-    MEM_CPU_FINE     = 2,                       ///< Fine-grained pinned CPU memory
-    MEM_GPU_FINE     = 3,                       ///< Fine-grained global GPU memory
-    MEM_CPU_UNPINNED = 4,                       ///< Unpinned CPU memory
-    MEM_NULL         = 5,                       ///< NULL memory - used for empty
-    MEM_MANAGED      = 6,                       ///< Managed memory
-    MEM_CPU_CLOSEST  = 7,                       ///< Coarse-grained pinned CPU memory indexed by closest GPU
-  };
-  char const MemTypeStr[9] = "CGBFUNMP";
-  inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED || m == MEM_CPU_CLOSEST); }
-  inline bool IsGpuMemType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
-  /**
-   * A MemDevice indicates a memory type on a specific device
-   */
-  struct MemDevice
-  {
-    MemType memType;                            ///< Memory type
-    int32_t memIndex;                           ///< Device index
-    bool operator<(MemDevice const& other) const {
-      return (memType < other.memType) || (memType == other.memType && memIndex < other.memIndex);
-    }
-  };
-  /**
-   * A Transfer adds together data from zero or more sources then writes the sum to zero or more desintations
-   */
-  struct Transfer
-  {
-    size_t            numBytes    = 0;          ///< Number of bytes to Transfer
-    vector<MemDevice> srcs        = {};         ///< List of source memory devices
-    vector<MemDevice> dsts        = {};         ///< List of destination memory devices
-    ExeDevice         exeDevice   = {};         ///< Executor to use
-    int32_t           exeSubIndex = -1;         ///< Executor subindex
-    int               numSubExecs = 0;          ///< Number of subExecutors to use for this Transfer
-  };
-  /**
-   * General options
-   */
-  struct GeneralOptions
-  {
-    int numIterations      = 10;                ///< Number of timed iterations to perform. If negative, run for -numIterations seconds instead
-    int numSubIterations   = 1;                 ///< Number of sub-iterations per iteration
-    int numWarmups         = 3;                 ///< Number of un-timed warmup iterations to perform
-    int recordPerIteration = 0;                 ///< Record per-iteration timing information
-    int useInteractive     = 0;                 ///< Pause for user-input before starting transfer loop
-  };
-  /**
-   * Data options
-   */
-  struct DataOptions
-  {
-    int           alwaysValidate   = 0;         ///< Validate after each iteration instead of once at end
-    int           blockBytes       = 256;       ///< Each subexecutor works on a multiple of this many bytes
-    int           byteOffset       = 0;         ///< Byte-offset for memory allocations
-    vector<float> fillPattern      = {};        ///< Pattern of floats used to fill source data
-    int           validateDirect   = 0;         ///< Validate GPU results directly instead of copying to host
-    int           validateSource   = 0;         ///< Validate src GPU memory immediately after preparation
-  };
-  /**
-   * GFX Executor options
-   */
-  struct GfxOptions
-  {
-    int                 blockOrder     = 0;     ///< Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random)
-    int                 blockSize      = 256;   ///< Size of each threadblock (must be multiple of 64)
-    vector<uint32_t>    cuMask         = {};    ///< Bit-vector representing the CU mask
-    vector<vector<int>> prefXccTable   = {};    ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
-    int                 temporalMode   = 0;     ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
-    int                 unrollFactor   = 4;     ///< GFX-kernel unroll factor
-    int                 useHipEvents   = 1;     ///< Use HIP events for timing GFX Executor
-    int                 useMultiStream = 0;     ///< Use multiple streams for GFX
-    int                 useSingleTeam  = 0;     ///< Team all subExecutors across the data array
-    int                 waveOrder      = 0;     ///< GFX-kernel wavefront ordering
-    int                 wordSize       = 4;     ///< GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1)
-  };
-  /**
-   * DMA Executor options
-   */
-  struct DmaOptions
-  {
-    int useHipEvents = 1;                       ///< Use HIP events for timing DMA Executor
-    int useHsaCopy   = 0;                       ///< Use HSA copy instead of HIP copy to perform DMA
-  };
-  /**
-   * NIC Executor options
-   */
-  struct NicOptions
-  {
-    vector<int> closestNics     = {};           ///< Overrides the auto-detected closest NIC per GPU
-    int         ibGidIndex      = -1;           ///< GID Index for RoCE NICs (-1 is auto)
-    uint8_t     ibPort          = 1;            ///< NIC port number to be used
-    int         ipAddressFamily = 4;            ///< 4=IPv4, 6=IPv6 (used for auto GID detection)
-    int         maxRecvWorkReq  = 16;           ///< Maximum number of recv work requests per queue pair
-    int         maxSendWorkReq  = 16;           ///< Maximum number of send work requests per queue pair
-    int         queueSize       = 100;          ///< Completion queue size
-    int         roceVersion     = 2;            ///< RoCE version (used for auto GID detection)
-    int         useRelaxedOrder = 1;            ///< Use relaxed ordering
-    int         useNuma         = 0;            ///< Switch to closest numa thread for execution
-  };
-  /**
-   * Configuration options for performing Transfers
-   */
-  struct ConfigOptions
-  {
-    GeneralOptions general;                     ///< General options
-    DataOptions    data;                        ///< Data options
-    GfxOptions     gfx;                         ///< GFX executor options
-    DmaOptions     dma;                         ///< DMA executor options
-    NicOptions     nic;                         ///< NIC executor options
-  };
-  /**
-   * Enumeration of possible error types
-   */
-  enum ErrType
-  {
-    ERR_NONE  = 0,                              ///< No errors
-    ERR_WARN  = 1,                              ///< Warning - results may not be accurate
-    ERR_FATAL = 2,                              ///< Fatal error - results are invalid
-  };
-  /**
-   * Enumeration of GID priority
-   *
-   * @note These are the GID types ordered in priority from lowest (0) to highest
-   */
-  enum GidPriority
-  {
-    UNKNOWN           = -1,                      ///< Default
-    ROCEV1_LINK_LOCAL = 0,                       ///< RoCEv1 Link-local
-    ROCEV2_LINK_LOCAL = 1,                       ///< RoCEv2 Link-local fe80::/10
-    ROCEV1_IPV6       = 2,                       ///< RoCEv1 IPv6
-    ROCEV2_IPV6       = 3,                       ///< RoCEv2 IPv6
-    ROCEV1_IPV4       = 4,                       ///< RoCEv1 IPv4-mapped IPv6
-    ROCEV2_IPV4       = 5,                       ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x
-  };
-  const char* GidPriorityStr[] = {
-    "RoCEv1 Link-local",
-    "RoCEv2 Link-local",
-    "RoCEv1 IPv6",
-    "RoCEv2 IPv6",
-    "RoCEv1 IPv4-mapped IPv6",
-    "RoCEv2 IPv4-mapped IPv6"
-  };
-  /**
-   * ErrResult consists of error type and error message
-   */
-  struct ErrResult
-  {
-    ErrType     errType;                        ///< Error type
-    std::string errMsg;                         ///< Error details
-    ErrResult() = default;
-#if defined(__NVCC__)
-    ErrResult(cudaError_t  err);
-#else
-    ErrResult(hipError_t   err);
-    ErrResult(hsa_status_t err);
-#endif
-    ErrResult(ErrType      err);
-    ErrResult(ErrType      errType, const char* format, ...);
-  };
-  /**
-   * Results for a single Executor
-   */
-  struct ExeResult
-  {
-    size_t      numBytes;                       ///< Total bytes transferred by this Executor
-    double      avgDurationMsec;                ///< Averaged duration for all the Transfers for this Executor
-    double      avgBandwidthGbPerSec;           ///< Average bandwidth for this Executor
-    double      sumBandwidthGbPerSec;           ///< Naive sum of individual Transfer average bandwidths
-    vector<int> transferIdx;                    ///< Indicies of Transfers this Executor executed
-  };
-  /**
-   * Results for a single Transfer
-   */
-  struct TransferResult
-  {
-    size_t numBytes;                            ///< Number of bytes transferred by this Transfer
-    double avgDurationMsec;                     ///< Duration for this Transfer, averaged over all timed iterations
-    double avgBandwidthGbPerSec;                ///< Bandwidth for this Transfer based on averaged duration
-    // Only filled in if recordPerIteration = 1
-    vector<double> perIterMsec;                 ///< Duration for each individual iteration
-    vector<set<pair<int,int>>> perIterCUs;      ///< GFX-Executor only. XCC:CU used per iteration
-    ExeDevice exeDevice;                        ///< Tracks which executor performed this Transfer (e.g. for EXE_NIC_NEAREST)
-    ExeDevice exeDstDevice;                     ///< Tracks actual destination executor (only valid for EXE_NIC/EXE_NIC_NEAREST)
-  };
-  /**
-   * TestResults contain timing results for a set of Transfers as a group as well as per Executor and per Transfer
-   * timing information
-   */
-  struct TestResults
-  {
-    int    numTimedIterations;                  ///< Number of iterations executed
-    size_t totalBytesTransferred;               ///< Total bytes transferred per iteration
-    double avgTotalDurationMsec;                ///< Wall-time (msec) to finish all Transfers (averaged across all timed iterations)
-    double avgTotalBandwidthGbPerSec;           ///< Bandwidth based on all Transfers and average wall time
-    double overheadMsec;                        ///< Difference between total wall time and slowest executor
-    map<ExeDevice, ExeResult> exeResults;       ///< Per Executor results
-    vector<TransferResult>    tfrResults;       ///< Per Transfer results
-    vector<ErrResult>         errResults;       ///< List of any errors/warnings that occurred
-  };
-  /**
-   * Run a set of Transfers
-   *
-   * @param[in]  config     Configuration options
-   * @param[in]  transfers  Set of Transfers to execute
-   * @param[out] results    Timing results
-   * @returns true if and only if Transfers were run successfully without any fatal errors
-   */
-  bool RunTransfers(ConfigOptions    const& config,
-                    vector<Transfer> const& transfers,
-                    TestResults&            results);
-  /**
-   * Enumeration of implementation attributes
-   */
-  enum IntAttribute
-  {
-    ATR_GFX_MAX_BLOCKSIZE,                      ///< Maximum blocksize for GFX executor
-    ATR_GFX_MAX_UNROLL,                         ///< Maximum unroll factor for GFX executor
-  };
-  enum StrAttribute
-  {
-    ATR_SRC_PREP_DESCRIPTION                    ///< Description of how source memory is prepared
-  };
-  /**
-   * Query attributes (integer)
-   *
-   * @note This allows querying of implementation information such as limits
-   *
-   * @param[in] attribute   Attribute to query
-   * @returns Value of the attribute
-   */
-  int GetIntAttribute(IntAttribute attribute);
-  /**
-   * Query attributes (string)
-   *
-   * @note This allows query of implementation details such as limits
-   *
-   * @param[in] attrtibute Attribute to query
-   * @returns Value of the attribute
-   */
-  std::string GetStrAttribute(StrAttribute attribute);
-  /**
-   * Returns information about number of available available Executors
-   *
-   * @param[in] exeType    Executor type to query
-   * @returns Number of detected Executors of exeType
-   */
-  int GetNumExecutors(ExeType exeType);
-  /**
-   * Returns the number of possible Executor subindices
-   *
-   * @note For CPU, this is 0
-   * @note For GFX, this refers to the number of XCDs
-   * @note For DMA, this refers to the number of DMA engines
-   *
-   * @param[in] exeDevice The specific Executor to query
-   * @returns Number of detected executor subindices
-   */
-  int GetNumExecutorSubIndices(ExeDevice exeDevice);
-  /**
-   * Returns number of subExecutors for a given ExeDevice
-   *
-   * @param[in] exeDevice   The specific Executor to query
-   * @returns Number of detected subExecutors for the given ExePair
-   */
-  int GetNumSubExecutors(ExeDevice exeDevice);
-  /**
-   * Returns the index of the NUMA node closest to the given GPU
-   *
-   * @param[in] gpuIndex Index of the GPU to query
-   * @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect
-   */
-  int GetClosestCpuNumaToGpu(int gpuIndex);
-  /**
-   * Returns the index of the NUMA node closest to the given NIC
-   *
-   * @param[in] nicIndex Index of the NIC to query
-   * @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect
-   */
-  int GetClosestCpuNumaToNic(int nicIndex);
-  /**
-   * Returns the index of the NIC closest to the given GPU
-   *
-   * @param[in] gpuIndex Index of the GPU to query
-   * @note This function is applicable when the IBV/RDMA executor is available
-   * @returns IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect
-   */
-  int GetClosestNicToGpu(int gpuIndex);
-  /**
-   * Helper function to parse a line containing Transfers into a vector of Transfers
-   *
-   * @param[in]  str       String containing description of Transfers
-   * @param[out] transfers List of Transfers described by 'str'
-   * @returns Information about any error that may have occured
-   */
-  ErrResult ParseTransfers(std::string str,
-                           std::vector<Transfer>& transfers);
-};
-//==========================================================================================
-// End of TransferBench API
-//==========================================================================================
-// Redefinitions for CUDA compatibility
-//==========================================================================================
-#if defined(__NVCC__)
-  // ROCm specific
-  #define wall_clock64                                       clock64
-  #define gcnArchName                                        name
-  // Datatypes
-  #define hipDeviceProp_t                                    cudaDeviceProp
-  #define hipError_t                                         cudaError_t
-  #define hipEvent_t                                         cudaEvent_t
-  #define hipStream_t                                        cudaStream_t
-  // Enumerations
-  #define hipDeviceAttributeClockRate                        cudaDevAttrClockRate
-  #define hipDeviceAttributeMultiprocessorCount              cudaDevAttrMultiProcessorCount
-  #define hipErrorPeerAccessAlreadyEnabled                   cudaErrorPeerAccessAlreadyEnabled
-  #define hipFuncCachePreferShared                           cudaFuncCachePreferShared
-  #define hipMemcpyDefault                                   cudaMemcpyDefault
-  #define hipMemcpyDeviceToHost                              cudaMemcpyDeviceToHost
-  #define hipMemcpyHostToDevice                              cudaMemcpyHostToDevice
-  #define hipSuccess                                         cudaSuccess
-  // Functions
-  #define hipDeviceCanAccessPeer                             cudaDeviceCanAccessPeer
-  #define hipDeviceEnablePeerAccess                          cudaDeviceEnablePeerAccess
-  #define hipDeviceGetAttribute                              cudaDeviceGetAttribute
-  #define hipDeviceGetPCIBusId                               cudaDeviceGetPCIBusId
-  #define hipDeviceSetCacheConfig                            cudaDeviceSetCacheConfig
-  #define hipDeviceSynchronize                               cudaDeviceSynchronize
-  #define hipEventCreate                                     cudaEventCreate
-  #define hipEventDestroy                                    cudaEventDestroy
-  #define hipEventElapsedTime                                cudaEventElapsedTime
-  #define hipEventRecord                                     cudaEventRecord
-  #define hipFree                                            cudaFree
-  #define hipGetDeviceCount                                  cudaGetDeviceCount
-  #define hipGetDeviceProperties                             cudaGetDeviceProperties
-  #define hipGetErrorString                                  cudaGetErrorString
-  #define hipHostFree                                        cudaFreeHost
-  #define hipHostMalloc                                      cudaMallocHost
-  #define hipMalloc                                          cudaMalloc
-  #define hipMallocManaged                                   cudaMallocManaged
-  #define hipMemcpy                                          cudaMemcpy
-  #define hipMemcpyAsync                                     cudaMemcpyAsync
-  #define hipMemset                                          cudaMemset
-  #define hipMemsetAsync                                     cudaMemsetAsync
-  #define hipSetDevice                                       cudaSetDevice
-  #define hipStreamCreate                                    cudaStreamCreate
-  #define hipStreamDestroy                                   cudaStreamDestroy
-  #define hipStreamSynchronize                               cudaStreamSynchronize
-  // Define float2 addition operator for NVIDIA platform
-  __device__ inline float2& operator +=(float2& a, const float2& b)
-  {
-    a.x += b.x;
-    a.y += b.y;
-    return a;
-  }
-  // Define float4 addition operator for NVIDIA platform
-  __device__ inline float4& operator +=(float4& a, const float4& b)
-  {
-    a.x += b.x;
-    a.y += b.y;
-    a.z += b.z;
-    a.w += b.w;
-    return a;
-  }
-#endif
-// Helper macro functions
-//==========================================================================================
-// Macro for collecting CU/SM GFX kernel is running on
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
-#define GetHwId(hwId) hwId = 0
-#elif defined(__NVCC__)
-#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
-#else
-#define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId));
-#endif
-// Macro for collecting XCC GFX kernel is running on
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
-#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
-#else
-#define GetXccId(val) val = 0
-#endif
-// Error check macro (NOTE: This will return even for ERR_WARN)
-#define ERR_CHECK(cmd)            \
-  do {                            \
-    ErrResult err = (cmd);        \
-    if (err.errType != ERR_NONE)  \
-      return err;                 \
-  } while (0)
-// Appends warn/fatal errors to a list, return false if fatal
-#define ERR_APPEND(cmd, list)     \
-  do {                            \
-    ErrResult err = (cmd);        \
-    if (err.errType != ERR_NONE)  \
-      list.push_back(err);        \
-    if (err.errType == ERR_FATAL) \
-      return false;               \
-  } while (0)
-// Helper macros for calling RDMA functions and reporting errors
-#ifdef VERBS_DEBUG
-#define IBV_CALL(__func__, ...)                                         \
-  do {                                                                  \
-    int error = __func__(__VA_ARGS__);                                  \
-    if (error != 0) {                                                   \
-      return {ERR_FATAL, "Encountered IbVerbs error (%d) at line (%d) " \
-              "and function (%s)", (error), __LINE__, #__func__};       \
-    }                                                                   \
-  } while (0)
-#define IBV_PTR_CALL(__ptr__, __func__, ...)                               \
-  do {                                                                     \
-    __ptr__ = __func__(__VA_ARGS__);                                       \
-    if (__ptr__ == nullptr) {                                              \
-      return {ERR_FATAL, "Encountered IbVerbs nullptr error at line (%d) " \
-              "and function (%s)", __LINE__, #__func__};                   \
-    }                                                                      \
-  } while (0)
-#else
-#define IBV_CALL(__func__, ...)                                         \
-  do {                                                                  \
-    int error = __func__(__VA_ARGS__);                                  \
-    if (error != 0) {                                                   \
-      return {ERR_FATAL, "Encountered IbVerbs error (%d) in func (%s) " \
-              , error, #__func__};                                      \
-    }                                                                   \
-  } while (0)
-#define IBV_PTR_CALL(__ptr__, __func__, ...)                               \
-  do {                                                                     \
-    __ptr__ = __func__(__VA_ARGS__);                                       \
-    if (__ptr__ == nullptr) {                                              \
-      return {ERR_FATAL, "Encountered IbVerbs nullptr error in func (%s) " \
-              , #__func__};                                                \
-    }                                                                      \
-  } while (0)
-#endif
-namespace TransferBench
-{
-/// @cond
-// Helper functions ('hidden' in anonymous namespace)
-//========================================================================================
-namespace {
-// Constants
-//========================================================================================
-  int   constexpr MAX_BLOCKSIZE  = 512;                // Max threadblock size
-  int   constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
-  int   constexpr MAX_UNROLL     = 8;                  // Max unroll factor
-  int   constexpr MAX_SRCS       = 8;                  // Max srcs per Transfer
-  int   constexpr MAX_DSTS       = 8;                  // Max dsts per Transfer
-  int   constexpr MEMSET_CHAR    = 75;                 // Value to memset (char)
-  float constexpr MEMSET_VAL     = 13323083.0f;        // Value to memset (double)
-// Parsing-related functions
-//========================================================================================
-  static ErrResult CharToMemType(char const c, MemType& memType)
-  {
-    char const* val = strchr(MemTypeStr, toupper(c));
-    if (val) {
-      memType = (MemType)(val - MemTypeStr);
-      return ERR_NONE;
-    }
-    return {ERR_FATAL, "Unexpected memory type (%c)", c};
-  }
-  static ErrResult CharToExeType(char const c, ExeType& exeType)
-  {
-    char const* val = strchr(ExeTypeStr, toupper(c));
-    if (val) {
-      exeType = (ExeType)(val - ExeTypeStr);
-      return ERR_NONE;
-    }
-    return {ERR_FATAL, "Unexpected executor type (%c)", c};
-  }
-  static ErrResult ParseMemType(std::string const& token,
-                                std::vector<MemDevice>& memDevices)
-  {
-    char memTypeChar;
-    int offset = 0, memIndex, inc;
-    MemType memType;
-    bool found = false;
-    memDevices.clear();
-    while (sscanf(token.c_str() + offset, " %c %d%n", &memTypeChar, &memIndex, &inc) == 2) {
-      offset += inc;
-      ErrResult err = CharToMemType(memTypeChar, memType);
-      if (err.errType != ERR_NONE) return err;
-      if (memType != MEM_NULL)
-        memDevices.push_back({memType, memIndex});
-      found = true;
-    }
-    if (found) return ERR_NONE;
-    return {ERR_FATAL,
-            "Unable to parse memory type token %s.  Expected one of %s followed by an index",
-            token.c_str(), MemTypeStr};
-  }
-  static ErrResult ParseExeType(std::string const& token,
-                                ExeDevice& exeDevice,
-                                int& exeSubIndex)
-  {
-    char exeTypeChar;
-    exeSubIndex = -1;
-    int numTokensParsed = sscanf(token.c_str(),
-                                 " %c%d.%d", &exeTypeChar, &exeDevice.exeIndex, &exeSubIndex);
-    if (numTokensParsed < 2) {
-      return {ERR_FATAL,
-              "Unable to parse valid executor token (%s)."
-              "Expected one of %s followed by an index",
-              token.c_str(), ExeTypeStr};
-    }
-    return CharToExeType(exeTypeChar, exeDevice.exeType);
-  }
-// Memory-related functions
-//========================================================================================
-  // Enable peer access between two GPUs
-  static ErrResult EnablePeerAccess(int const deviceId, int const peerDeviceId)
-  {
-    int canAccess;
-    ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
-    if (!canAccess)
-      return {ERR_FATAL,
-              "Unable to enable peer access from GPU devices %d to %d", peerDeviceId, deviceId};
-    ERR_CHECK(hipSetDevice(deviceId));
-    hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
-    if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) {
-      return {ERR_FATAL,
-              "Unable to enable peer to peer access from %d to %d (%s)",
-              deviceId, peerDeviceId, hipGetErrorString(error)};
-    }
-    return ERR_NONE;
-  }
-  // Check that CPU memory array of numBytes has been allocated on targetId NUMA node
-  static ErrResult CheckPages(char* array, size_t numBytes, int targetId)
-  {
-    size_t const pageSize = getpagesize();
-    size_t const numPages = (numBytes + pageSize - 1) / pageSize;
-    std::vector<void *> pages(numPages);
-    std::vector<int> status(numPages);
-    pages[0] = array;
-    for (int i = 1; i < numPages; i++) {
-      pages[i] = (char*)pages[i-1] + pageSize;
-    }
-    long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
-    if (retCode)
-      return {ERR_FATAL,
-              "Unable to collect page table information for allocated memory. "
-              "Ensure NUMA library is installed properly"};
-    size_t mistakeCount = 0;
-    for (size_t i = 0; i < numPages; i++) {
-      if (status[i] < 0)
-        return {ERR_FATAL,
-                "Unexpected page status (%d) for page %llu", status[i], i};
-      if (status[i] != targetId) mistakeCount++;
-    }
-    if (mistakeCount > 0) {
-      return {ERR_FATAL,
-              "%lu out of %lu pages for memory allocation were not on NUMA node %d."
-              " This could be due to hardware memory issues, or the use of numa-rebalancing daemons such as numad",
-              mistakeCount, numPages, targetId};
-    }
-    return ERR_NONE;
-  }
-  // Allocate memory
-  static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
-  {
-    if (numBytes == 0) {
-      return {ERR_FATAL, "Unable to allocate 0 bytes"};
-    }
-    *memPtr = nullptr;
-    MemType const& memType = memDevice.memType;
-    if (IsCpuMemType(memType)) {
-      // Determine which NUMA device to use
-      int numaIdx = memDevice.memIndex;
-      if (memType == MEM_CPU_CLOSEST) {
-        numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
-      }
-      // Set NUMA policy prior to call to hipHostMalloc
-      numa_set_preferred(numaIdx);
-      // Allocate host-pinned memory (should respect NUMA mem policy)
-      if (memType == MEM_CPU_FINE) {
-#if defined (__NVCC__)
-        return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
-#else
-        ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
-#endif
-      } else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
-#if defined (__NVCC__)
-        ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, 0));
-#else
-        ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
-#endif
-      } else if (memType == MEM_CPU_UNPINNED) {
-        *memPtr = numa_alloc_onnode(numBytes, numaIdx);
-      }
-      // Check that the allocated pages are actually on the correct NUMA node
-      memset(*memPtr, 0, numBytes);
-      ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx));
-      // Reset to default numa mem policy
-      numa_set_preferred(-1);
-    } else if (IsGpuMemType(memType)) {
-      // Switch to the appropriate GPU
-      ERR_CHECK(hipSetDevice(memDevice.memIndex));
-      if (memType == MEM_GPU) {
-        // Allocate GPU memory on appropriate device
-        ERR_CHECK(hipMalloc((void**)memPtr, numBytes));
-      } else if (memType == MEM_GPU_FINE) {
-#if defined (__NVCC__)
-        return {ERR_FATAL, "Fine-grained GPU memory not supported on NVIDIA platform"};
-#else
-        int flag = hipDeviceMallocUncached;
-        ERR_CHECK(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
-#endif
-      } else if (memType == MEM_MANAGED) {
-        ERR_CHECK(hipMallocManaged((void**)memPtr, numBytes));
-      }
-      // Clear the memory
-      ERR_CHECK(hipMemset(*memPtr, 0, numBytes));
-      ERR_CHECK(hipDeviceSynchronize());
-    } else {
-      return {ERR_FATAL, "Unsupported memory type (%d)", memType};
-    }
-    return ERR_NONE;
-  }
-  // Deallocate memory
-  static ErrResult DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
-  {
-    // Avoid deallocating nullptr
-    if (memPtr == nullptr)
-      return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
-    switch (memType) {
-    case MEM_CPU: case MEM_CPU_FINE: case MEM_CPU_CLOSEST:
-    {
-      ERR_CHECK(hipHostFree(memPtr));
-      break;
-    }
-    case MEM_CPU_UNPINNED:
-    {
-      numa_free(memPtr, bytes);
-      break;
-    }
-    case MEM_GPU : case MEM_GPU_FINE: case MEM_MANAGED:
-    {
-      ERR_CHECK(hipFree(memPtr));
-      break;
-    }
-    default:
-      return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType};
-    }
-    return ERR_NONE;
-  }
-// HSA-related functions
-//========================================================================================
-#if !defined(__NVCC__)
-  // Get the hsa_agent_t associated with a ExeDevice
-  static ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent)
-  {
-    static bool isInitialized = false;
-    static std::vector<hsa_agent_t> cpuAgents;
-    static std::vector<hsa_agent_t> gpuAgents;
-    int const& exeIndex = exeDevice.exeIndex;
-    int const numCpus   = GetNumExecutors(EXE_CPU);
-    int const numGpus   = GetNumExecutors(EXE_GPU_GFX);
-    // Initialize results on first use
-    if (!isInitialized) {
-      hsa_amd_pointer_info_t info;
-      info.size = sizeof(info);
-      ErrResult err;
-      int32_t* tempBuffer;
-      // Index CPU agents
-      cpuAgents.clear();
-      for (int i = 0; i < numCpus; i++) {
-        ERR_CHECK(AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer));
-        ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
-        cpuAgents.push_back(info.agentOwner);
-        ERR_CHECK(DeallocateMemory(MEM_CPU, tempBuffer, 1024));
-      }
-      // Index GPU agents
-      gpuAgents.clear();
-      for (int i = 0; i < numGpus; i++) {
-        ERR_CHECK(AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer));
-        ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
-        gpuAgents.push_back(info.agentOwner);
-        ERR_CHECK(DeallocateMemory(MEM_GPU, tempBuffer, 1024));
-      }
-      isInitialized = true;
-    }
-    switch (exeDevice.exeType) {
-    case EXE_CPU:
-      if (exeIndex < 0 || exeIndex >= numCpus)
-        return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1};
-      agent = cpuAgents[exeDevice.exeIndex];
-      break;
-    case EXE_GPU_GFX: case EXE_GPU_DMA:
-      if (exeIndex < 0 || exeIndex >= numGpus)
-        return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
-      agent = gpuAgents[exeIndex];
-      break;
-    default:
-      return {ERR_FATAL,
-              "Attempting to get HSA agent of unknown or unsupported executor type (%d)",
-              exeDevice.exeType};
-    }
-    return ERR_NONE;
-  }
-  // Get the hsa_agent_t associated with a MemDevice
-  static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
-  {
-    if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
-    if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
-    return {ERR_FATAL,
-            "Unable to get HSA agent for memDevice (%d,%d)",
-            memDevice.memType, memDevice.memIndex};
-  }
-#endif
-// Setup validation-related functions
-//========================================================================================
-  static ErrResult GetActualExecutor(ConfigOptions const& cfg,
-                                     ExeDevice     const& origExeDevice,
-                                     ExeDevice&           actualExeDevice)
-  {
-    // By default, nothing needs to change
-    actualExeDevice = origExeDevice;
-    // When using NIC_NEAREST, remap to the closest NIC to the GPU
-    if (origExeDevice.exeType == EXE_NIC_NEAREST) {
-      actualExeDevice.exeType  = EXE_NIC;
-      if (cfg.nic.closestNics.size() > 0) {
-        if (origExeDevice.exeIndex < 0 || origExeDevice.exeIndex >= cfg.nic.closestNics.size())
-          return {ERR_FATAL, "NIC index is out of range (%d)", origExeDevice.exeIndex};
-        actualExeDevice.exeIndex = cfg.nic.closestNics[origExeDevice.exeIndex];
-      } else {
-        actualExeDevice.exeIndex = GetClosestNicToGpu(origExeDevice.exeIndex);
-      }
-    }
-    return ERR_NONE;
-  }
-  // Validate that MemDevice exists
-  static ErrResult CheckMemDevice(MemDevice const& memDevice)
-  {
-    if (memDevice.memType == MEM_NULL)
-      return ERR_NONE;
-    if (IsCpuMemType(memDevice.memType) && memDevice.memType != MEM_CPU_CLOSEST) {
-      int numCpus = GetNumExecutors(EXE_CPU);
-      if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus)
-        return {ERR_FATAL,
-                "CPU index must be between 0 and %d (instead of %d)", numCpus - 1, memDevice.memIndex};
-      return ERR_NONE;
-    }
-    if (IsGpuMemType(memDevice.memType) || memDevice.memType == MEM_CPU_CLOSEST) {
-    int numGpus = GetNumExecutors(EXE_GPU_GFX);
-      if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus)
-        return {ERR_FATAL,
-                "GPU index must be between 0 and %d (instead of %d)", numGpus - 1, memDevice.memIndex};
-      if (memDevice.memType == MEM_CPU_CLOSEST) {
-        if (GetClosestCpuNumaToGpu(memDevice.memIndex) == -1) {
-          return {ERR_FATAL, "Unable to determine closest NUMA node for GPU %d", memDevice.memIndex};
-        }
-      }
-      return ERR_NONE;
-    }
-    return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
-  }
-  // Validate configuration options - return trues if and only if an fatal error is detected
-  static bool ConfigOptionsHaveErrors(ConfigOptions const&    cfg,
-                                      std::vector<ErrResult>& errors)
-  {
-    // Check general options
-    if (cfg.general.numWarmups < 0)
-      errors.push_back({ERR_FATAL, "[general.numWarmups] must be a non-negative number"});
-    // Check data options
-    if (cfg.data.blockBytes == 0 || cfg.data.blockBytes % 4)
-      errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
-    if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float))
-      errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
-    // Check GFX options
-    if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2)
-      errors.push_back({ERR_FATAL,
-          "[gfx.blockOrder] must be 0 for sequential, 1 for interleaved, or 2 for random"});
-    if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0)
-      errors.push_back({ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"});
-    int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
-    if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize)
-      errors.push_back({ERR_FATAL,
-                        "[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
-                        gfxMaxBlockSize});
-    if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3)
-      errors.push_back({ERR_FATAL,
-                        "[gfx.temporalMode] must be non-negative and less than or equal to 3"});
-#if defined(__NVCC__)
-    if (cfg.gfx.temporalMode > 0)
-      errors.push_back({ERR_FATAL,
-          "[gfx.temporalMode] is not supported on NVIDIA hardware"});
-#endif
-    int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
-    if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll)
-      errors.push_back({ERR_FATAL,
-                        "[gfx.unrollFactor] must be non-negative and less than or equal to %d",
-                        gfxMaxUnroll});
-    if (cfg.gfx.waveOrder < 0 || cfg.gfx.waveOrder >= 6)
-      errors.push_back({ERR_FATAL,
-                        "[gfx.waveOrder] must be non-negative and less than 6"});
-    if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4))
-      errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});
-    int numGpus = GetNumExecutors(EXE_GPU_GFX);
-    int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
-    vector<vector<int>> const& table = cfg.gfx.prefXccTable;
-    if (!table.empty()) {
-      if (table.size() != numGpus) {
-        errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
-      } else {
-        for (int i = 0; i < table.size(); i++) {
-          if (table[i].size() != numGpus) {
-            errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
-            break;
-          } else {
-            for (auto x : table[i]) {
-              if (x < 0 || x >= numXccs) {
-                errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must contain values between 0 and %d",
-                    numXccs - 1});
-                break;
-              }
-            }
-          }
-        }
-      }
-    }
-    // Check NIC options
-#ifdef NIC_EXEC_ENABLED
-    int numNics = GetNumExecutors(EXE_NIC);
-    for (auto const& nic : cfg.nic.closestNics)
-      if (nic < 0 || nic >= numNics)
-        errors.push_back({ERR_FATAL, "NIC index (%d) in user-specified closest NIC list must be between 0 and %d",
-            nic, numNics - 1});
-    size_t closetNicsSize = cfg.nic.closestNics.size();
-    if (closetNicsSize > 0 && closetNicsSize < numGpus)
-      errors.push_back({ERR_FATAL, "User-specified closest NIC list must match GPU count of %d",
-          numGpus});
-#endif
-    // NVIDIA specific
-#if defined(__NVCC__)
-    if (cfg.data.validateDirect)
-      errors.push_back({ERR_FATAL, "[data.validateDirect] is not supported on NVIDIA hardware"});
-#else
-    // AMD specific
-    // Check for largeBar enablement on GPUs
-    for (int i = 0; i < numGpus; i++) {
-      int isLargeBar = 0;
-      hipError_t err = hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, i);
-      if (err != hipSuccess) {
-        errors.push_back({ERR_FATAL, "Unable to query if GPU %d has largeBAR enabled", i});
-      } else if (!isLargeBar) {
-        errors.push_back({ERR_WARN,
-                          "Large BAR is not enabled for GPU %d in BIOS. "
-                          "Large BAR is required to enable multi-gpu data access", i});
-      }
-    }
-#endif
-    // Check for fatal errors
-    for (auto const& err : errors)
-      if (err.errType == ERR_FATAL) return true;
-    return false;
-  }
-  // Validate Transfers to execute - returns true if and only if fatal error detected
-  static bool TransfersHaveErrors(ConfigOptions         const& cfg,
-                                  std::vector<Transfer> const& transfers,
-                                  std::vector<ErrResult>&      errors)
-  {
-    int numCpus = GetNumExecutors(EXE_CPU);
-    int numGpus = GetNumExecutors(EXE_GPU_GFX);
-    int numNics = GetNumExecutors(EXE_NIC);
-    std::set<ExeDevice>      executors;
-    std::map<ExeDevice, int> transferCount;
-    std::map<ExeDevice, int> useSubIndexCount;
-    std::map<ExeDevice, int> totalSubExecs;
-    // Per-Transfer checks
-    for (size_t i = 0; i < transfers.size(); i++) {
-      Transfer const& t = transfers[i];
-      if (t.numBytes == 0)
-        errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
-      if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) {
-        size_t const N               = t.numBytes / sizeof(float);
-        int    const targetMultiple  = cfg.data.blockBytes / sizeof(float);
-        int    const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
-                                                (size_t)t.numSubExecs);
-        if (maxSubExecToUse < t.numSubExecs)
-          errors.push_back({ERR_WARN,
-                            "Transfer %d data size is too small - will only use %d of %d subexecutors",
-                            i, maxSubExecToUse, t.numSubExecs});
-      }
-      // Check sources and destinations
-      if (t.srcs.empty() && t.dsts.empty())
-        errors.push_back({ERR_FATAL, "Transfer %d: Must have at least one source or destination", i});
-      for (int j = 0; j < t.srcs.size(); j++) {
-        ErrResult err = CheckMemDevice(t.srcs[j]);
-        if (err.errType != ERR_NONE)
-          errors.push_back({ERR_FATAL, "Transfer %d: SRC %d: %s", i, j, err.errMsg.c_str()});
-      }
-      for (int j = 0; j < t.dsts.size(); j++) {
-        ErrResult err = CheckMemDevice(t.dsts[j]);
-        if (err.errType != ERR_NONE)
-          errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()});
-      }
-      // Check executor
-      executors.insert(t.exeDevice);
-      transferCount[t.exeDevice]++;
-      switch (t.exeDevice.exeType) {
-      case EXE_CPU:
-        if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numCpus)
-          errors.push_back({ERR_FATAL,
-                            "Transfer %d: CPU index must be between 0 and %d (instead of %d)",
-                            i, numCpus - 1, t.exeDevice.exeIndex});
-        break;
-      case EXE_GPU_GFX:
-        if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
-          errors.push_back({ERR_FATAL,
-                            "Transfer %d: GFX index must be between 0 and %d (instead of %d)",
-                            i, numGpus - 1, t.exeDevice.exeIndex});
-        } else {
-          if (t.exeSubIndex != -1) {
-#if defined(__NVCC__)
-            errors.push_back({ERR_FATAL,
-                              "Transfer %d: GFX executor subindex not supported on NVIDIA hardware", i});
-#else
-            useSubIndexCount[t.exeDevice]++;
-            int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
-            if (t.exeSubIndex >= numSubIndices)
-              errors.push_back({ERR_FATAL,
-                                "Transfer %d: GFX subIndex (XCC) must be between 0 and %d", i, numSubIndices - 1});
-#endif
-          }
-        }
-        break;
-      case EXE_GPU_DMA:
-        if (t.srcs.size() != 1 || t.dsts.size() != 1) {
-          errors.push_back({ERR_FATAL,
-                            "Transfer %d: DMA executor must have exactly 1 source and 1 destination", i});
-        }
-        if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
-          errors.push_back({ERR_FATAL,
-                            "Transfer %d: DMA index must be between 0 and %d (instead of %d)",
-                            i, numGpus - 1, t.exeDevice.exeIndex});
-          // Cannot proceed with any further checks
-          continue;
-        }
-        if (t.exeSubIndex != -1) {
-#if defined(__NVCC__)
-          errors.push_back({ERR_FATAL,
-                            "Transfer %d: DMA executor subindex not supported on NVIDIA hardware", i});
-#else
-          useSubIndexCount[t.exeDevice]++;
-          int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
-          if (t.exeSubIndex >= numSubIndices)
-            errors.push_back({ERR_FATAL,
-                              "Transfer %d: DMA subIndex (engine) must be between 0 and %d",
-                              i, numSubIndices - 1});
-          // Check that engine Id exists between agents
-          hsa_agent_t srcAgent, dstAgent;
-          ErrResult err;
-          err = GetHsaAgent(t.srcs[0], srcAgent);
-          if (err.errType != ERR_NONE) {
-            errors.push_back(err);
-            if (err.errType == ERR_FATAL) break;
-          }
-          err = GetHsaAgent(t.dsts[0], dstAgent);
-          if (err.errType != ERR_NONE) {
-            errors.push_back(err);
-            if (err.errType == ERR_FATAL) break;
-          }
-          uint32_t engineIdMask = 0;
-          err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
-          if (err.errType != ERR_NONE) {
-            errors.push_back(err);
-            if (err.errType == ERR_FATAL) break;
-          }
-          hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
-          if (!(sdmaEngineId & engineIdMask)) {
-            errors.push_back({ERR_FATAL,
-                "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
-                i, t.exeDevice.exeIndex, t.exeSubIndex});
-          }
-#endif
-        }
-        if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
-          errors.push_back({ERR_WARN,
-              "Transfer %d: No GPU memory for source or destination.  Copy might not execute on DMA %d",
-              i, t.exeDevice.exeIndex});
-        } else {
-          // Currently HIP will use src agent if source memory is GPU, otherwise dst agent
-          if (IsGpuMemType(t.srcs[0].memType)) {
-            if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
-              errors.push_back({ERR_WARN,
-                  "Transfer %d: DMA executor will automatically switch to using the source memory device (%d) not (%d)",
-                  i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
-            }
-          } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
-            errors.push_back({ERR_WARN,
-                "Transfer %d: DMA executor will automatically switch to using the destination memory device (%d) not (%d)",
-                i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
-          }
-        }
-        break;
-      case EXE_NIC:
-#ifdef NIC_EXEC_ENABLED
-      {
-        int srcIndex = t.exeDevice.exeIndex;
-        int dstIndex = t.exeSubIndex;
-        if (srcIndex < 0 || srcIndex >= numNics)
-          errors.push_back({ERR_FATAL, "Transfer %d: src NIC executor indexes an out-of-range NIC (%d)", i, srcIndex});
-        if (dstIndex < 0 || dstIndex >= numNics)
-          errors.push_back({ERR_FATAL, "Transfer %d: dst NIC executor indexes an out-of-range NIC (%d)", i, dstIndex});
-      }
-#else
-        errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
-#endif
-        break;
-      case EXE_NIC_NEAREST:
-#ifdef NIC_EXEC_ENABLED
-      {
-        ExeDevice srcExeDevice;
-        ErrResult errSrc = GetActualExecutor(cfg, t.exeDevice, srcExeDevice);
-        if (errSrc.errType != ERR_NONE) errors.push_back(errSrc);
-        ExeDevice dstExeDevice;
-        ErrResult errDst = GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice);
-        if (errDst.errType != ERR_NONE) errors.push_back(errDst);
-      }
-#else
-        errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
-#endif
-        break;
-      }
-      // Check subexecutors
-      if (t.numSubExecs <= 0)
-        errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i});
-      else
-        totalSubExecs[t.exeDevice] += t.numSubExecs;
-    }
-    int gpuMaxHwQueues = 4;
-    if (getenv("GPU_MAX_HW_QUEUES"))
-      gpuMaxHwQueues = atoi(getenv("GPU_MAX_HW_QUEUES"));
-    // Aggregate checks
-    for (auto const& exeDevice : executors) {
-      switch (exeDevice.exeType) {
-      case EXE_CPU:
-      {
-        // Check total number of subexecutors requested
-        int numCpuSubExec = GetNumSubExecutors(exeDevice);
-        if (totalSubExecs[exeDevice] > numCpuSubExec)
-          errors.push_back({ERR_WARN,
-                            "CPU %d requests %d total cores however only %d available. "
-                            "Serialization will occur",
-                            exeDevice.exeIndex, totalSubExecs[exeDevice], numCpuSubExec});
-        break;
-      }
-      case EXE_GPU_GFX:
-      {
-        // Check total number of subexecutors requested
-        int numGpuSubExec = GetNumSubExecutors(exeDevice);
-        if (totalSubExecs[exeDevice] > numGpuSubExec)
-          errors.push_back({ERR_WARN,
-                            "GPU %d requests %d total CUs however only %d available. "
-                            "Serialization will occur",
-                            exeDevice.exeIndex, totalSubExecs[exeDevice], numGpuSubExec});
-        // Check that if executor subindices are used, all Transfers specify executor subindices
-        if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
-          errors.push_back({ERR_FATAL,
-                            "GPU %d specifies XCC on only %d of %d Transfers. "
-                            "Must either specific none or all",
-                            exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]});
-        }
-        if (cfg.gfx.useMultiStream && transferCount[exeDevice] > gpuMaxHwQueues) {
-          errors.push_back({ERR_WARN,
-                            "GPU %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d",
-                            exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues});
-        }
-        break;
-      }
-      case EXE_GPU_DMA:
-      {
-        // Check that if executor subindices are used, all Transfers specify executor subindices
-        if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
-          errors.push_back({ERR_FATAL,
-                            "DMA %d specifies engine on only %d of %d Transfers. "
-                            "Must either specific none or all",
-                            exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]});
-        }
-        if (transferCount[exeDevice] > gpuMaxHwQueues) {
-          errors.push_back({ERR_WARN,
-                           "DMA %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d",
-                           exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues});
-        }
-        char* enableSdma = getenv("HSA_ENABLE_SDMA");
-        if (enableSdma && !strcmp(enableSdma, "0"))
-          errors.push_back({ERR_WARN,
-                            "DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. "
-                            "DMA %d copies will fallback to blit (GFX) kernels", exeDevice.exeIndex});
-        break;
-      }
-      default:
-        break;
-      }
-    }
-    // Check for fatal errors
-    for (auto const& err : errors)
-      if (err.errType == ERR_FATAL) return true;
-    return false;
-  }
-// Internal data structures
-//========================================================================================
-  // Parameters for each SubExecutor
-  struct SubExecParam
-  {
-    // Inputs
-    size_t                     N;                 ///< Number of floats this subExecutor works on
-    int                        numSrcs;           ///< Number of source arrays
-    int                        numDsts;           ///< Number of destination arrays
-    float*                     src[MAX_SRCS];     ///< Source array pointers
-    float*                     dst[MAX_DSTS];     ///< Destination array pointers
-    int32_t                    preferredXccId;    ///< XCC ID to execute on (GFX only)
-    // Prepared
-    int                        teamSize;          ///< Index of this sub executor amongst team
-    int                        teamIdx;           ///< Size of team this sub executor is part of
-    // Outputs
-    long long                  startCycle;        ///< Start timestamp for in-kernel timing (GPU-GFX executor)
-    long long                  stopCycle;         ///< Stop  timestamp for in-kernel timing (GPU-GFX executor)
-    uint32_t                   hwId;              ///< Hardware ID
-    uint32_t                   xccId;             ///< XCC ID
-  };
-  // Internal resources allocated per Transfer
-  struct TransferResources
-  {
-    int                        transferIdx;       ///< The associated Transfer
-    size_t                     numBytes;          ///< Number of bytes to Transfer
-    vector<float*>             srcMem;            ///< Source memory
-    vector<float*>             dstMem;            ///< Destination memory
-    vector<SubExecParam>       subExecParamCpu;   ///< Defines subarrays for each subexecutor
-    vector<int>                subExecIdx;        ///< Indices into subExecParamGpu
-    int                        numaNode;          ///< NUMA node to use for this Transfer
-    // For GFX executor
-    SubExecParam*              subExecParamGpuPtr;
-    // For targeted-SDMA
-#if !defined(__NVCC__)
-    hsa_agent_t                dstAgent;          ///< DMA destination memory agent
-    hsa_agent_t                srcAgent;          ///< DMA source memory agent
-    hsa_signal_t               signal;            ///< HSA signal for completion
-    hsa_amd_sdma_engine_id_t   sdmaEngineId;      ///< DMA engine ID
-#endif
-// For IBV executor
-#ifdef NIC_EXEC_ENABLED
-    int                        srcNicIndex;       ///< SRC NIC index
-    int                        dstNicIndex;       ///< DST NIC index
-    ibv_context*               srcContext;        ///< Device context for SRC NIC
-    ibv_context*               dstContext;        ///< Device context for DST NIC
-    ibv_pd*                    srcProtect;        ///< Protection domain for SRC NIC
-    ibv_pd*                    dstProtect;        ///< Protection domain for DST NIC
-    ibv_cq*                    srcCompQueue;      ///< Completion queue for SRC NIC
-    ibv_cq*                    dstCompQueue;      ///< Completion queue for DST NIC
-    ibv_port_attr              srcPortAttr;       ///< Port attributes for SRC NIC
-    ibv_port_attr              dstPortAttr;       ///< Port attributes for DST NIC
-    ibv_gid                    srcGid;            ///< GID handle for SRC NIC
-    ibv_gid                    dstGid;            ///< GID handle for DST NIC
-    vector<ibv_qp*>            srcQueuePairs;     ///< Queue pairs for SRC NIC
-    vector<ibv_qp*>            dstQueuePairs;     ///< Queue pairs for DST NIC
-    ibv_mr*                    srcMemRegion;      ///< Memory region for SRC
-    ibv_mr*                    dstMemRegion;      ///< Memory region for DST
-    uint8_t                    qpCount;           ///< Number of QPs to be used for transferring data
-    vector<ibv_sge>            sgePerQueuePair;   ///< Scatter-gather elements per queue pair
-    vector<ibv_send_wr>        sendWorkRequests;  ///< Send work requests per queue pair
-#endif
-    // Counters
-    double                     totalDurationMsec; ///< Total duration for all iterations for this Transfer
-    vector<double>             perIterMsec;       ///< Duration for each individual iteration
-    vector<set<pair<int,int>>> perIterCUs;        ///< GFX-Executor only. XCC:CU used per iteration
-  };
-  // Internal resources allocated per Executor
-  struct ExeInfo
-  {
-    size_t                     totalBytes;        ///< Total bytes this executor transfers
-    double                     totalDurationMsec; ///< Total duration for all iterations for this Executor
-    int                        totalSubExecs;     ///< Total number of subExecutors to use
-    bool                       useSubIndices;     ///< Use subexecutor indicies
-    int                        numSubIndices;     ///< Number of subindices this ExeDevice has
-    vector<SubExecParam>       subExecParamCpu;   ///< Subexecutor parameters for this executor
-    vector<TransferResources>  resources;         ///< Per-Transfer resources
-    // For GPU-Executors
-    SubExecParam*              subExecParamGpu;   ///< GPU copy of subExecutor parameters
-    vector<hipStream_t>        streams;           ///< HIP streams to launch on
-    vector<hipEvent_t>         startEvents;       ///< HIP start timing event
-    vector<hipEvent_t>         stopEvents;        ///< HIP stop timing event
-    int                        wallClockRate;     ///< (GFX-only) Device wall clock rate
-  };
-  // Structure to track PCIe topology
-  struct PCIeNode
-  {
-    std::string        address;                   ///< PCIe address for this PCIe node
-    std::string        description;               ///< Description for this PCIe node
-    std::set<PCIeNode> children;                  ///< Children PCIe nodes
-    // Default constructor
-    PCIeNode() : address(""), description("") {}
-    // Constructor
-    PCIeNode(std::string const& addr) : address(addr) {}
-    // Constructor
-    PCIeNode(std::string const& addr, std::string const& desc)
-      :address(addr), description(desc) {}
-    // Comparison operator for std::set
-    bool operator<(PCIeNode const& other) const {
-      return address < other.address;
-    }
-  };
-#ifdef NIC_EXEC_ENABLED
-  // Structure to track information about IBV devices
-  struct IbvDevice
-  {
-    ibv_device* devicePtr;
-    std::string name;
-    std::string busId;
-    bool        hasActivePort;
-    int         numaNode;
-    int         gidIndex;
-    std::string gidDescriptor;
-    bool        isRoce;
-  };
-#endif
-#ifdef NIC_EXEC_ENABLED
-// Function to collect information about IBV devices
-//========================================================================================
-static bool IsConfiguredGid(union ibv_gid const& gid)
-  {
-    const struct in6_addr *a = (struct in6_addr *) gid.raw;
-    int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
-    if (((a->s6_addr32[0] | trailer) == 0UL) ||
-        ((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
-      return false;
-    }
-    return true;
-  }
-  static bool LinkLocalGid(union ibv_gid const& gid)
-  {
-    const struct in6_addr *a = (struct in6_addr *) gid.raw;
-    if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
-      return true;
-    }
-    return false;
-  }
-  static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
-                                        int const&  portNum,
-                                        int const&  gidIndex,
-                                        int&        version)
-  {
-    char const* deviceName = ibv_get_device_name(context->device);
-    char gidRoceVerStr[16]      = {};
-    char roceTypePath[PATH_MAX] = {};
-    sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
-            deviceName, portNum, gidIndex);
-    int fd = open(roceTypePath, O_RDONLY);
-    if (fd == -1)
-      return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
-    int ret = read(fd, gidRoceVerStr, 15);
-    close(fd);
-    if (ret == -1)
-      return {ERR_FATAL, "Failed while reading RoCE version"};
-    if (strlen(gidRoceVerStr)) {
-      if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
-          || strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
-        version = 1;
-      }
-      else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
-        version = 2;
-      }
-    }
-    return ERR_NONE;
-  }
-  static bool IsIPv4MappedIPv6(const union ibv_gid &gid)
-  {
-    // look for ::ffff:x.x.x.x format
-    // From Broadcom documentation
-    // https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html
-    // "The IPv4 address is really an IPv4 address mapped into the IPv6 address space.
-    // This can be identified by 80 “0” bits, followed by 16 “1” bits (“FFFF” in hexadecimal)
-    // followed by the original 32-bit IPv4 address."
-    return (gid.global.subnet_prefix == 0    &&
-            gid.raw[8]               == 0    &&
-            gid.raw[9]               == 0    &&
-            gid.raw[10]              == 0xff &&
-            gid.raw[11]              == 0xff);
-  }
-  static ErrResult GetGidIndex(struct ibv_context*          context,
-                               int const&                   gidTblLen,
-                               int const&                   portNum,
-                               std::pair<int, std::string>& gidInfo)
-  {
-    if(gidInfo.first >= 0) return ERR_NONE; // honor user choice
-    union ibv_gid gid;
-    GidPriority highestPriority = GidPriority::UNKNOWN;
-    int gidIndex = -1;
-    for (int i = 0; i < gidTblLen; ++i) {
-      IBV_CALL(ibv_query_gid, context, portNum, i, &gid);
-      if (!IsConfiguredGid(gid)) continue;
-      int gidCurrRoceVersion;
-      if(GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion).errType != ERR_NONE) continue;
-      GidPriority currPriority;
-      if (IsIPv4MappedIPv6(gid)) {
-        currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4 : GidPriority::ROCEV1_IPV4;
-      } else if (!LinkLocalGid(gid)) {
-        currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6 : GidPriority::ROCEV1_IPV6;
-      } else {
-        currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL : GidPriority::ROCEV1_LINK_LOCAL;
-      }
-      if(currPriority > highestPriority) {
-        highestPriority = currPriority;
-        gidIndex = i;
-      }
-    }
-    if (highestPriority == GidPriority::UNKNOWN) {
-      gidInfo.first = -1;
-      return {ERR_FATAL, "Failed to auto-detect a valid GID index. Try setting it manually through IB_GID_INDEX"};
-    }
-    gidInfo.first = gidIndex;
-    gidInfo.second = GidPriorityStr[highestPriority];
-    return ERR_NONE;
-  }
-  static vector<IbvDevice>& GetIbvDeviceList()
-  {
-    static bool isInitialized = false;
-    static vector<IbvDevice> ibvDeviceList = {};
-    // Build list on first use
-    if (!isInitialized) {
-      // Query the number of IBV devices
-      int numIbvDevices = 0;
-      ibv_device** deviceList = ibv_get_device_list(&numIbvDevices);
-      if (deviceList && numIbvDevices > 0) {
-        // Loop over each device to collect information
-        for (int i = 0; i < numIbvDevices; i++) {
-          IbvDevice ibvDevice;
-          ibvDevice.devicePtr = deviceList[i];
-          ibvDevice.name = deviceList[i]->name;
-          ibvDevice.hasActivePort = false;
-          {
-            struct ibv_context *context = ibv_open_device(ibvDevice.devicePtr);
-            if (context) {
-              struct ibv_device_attr deviceAttr;
-              if (!ibv_query_device(context, &deviceAttr)) {
-                int activePort;
-                ibvDevice.gidIndex = -1;
-                for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
-                  struct ibv_port_attr portAttr;
-                  if (ibv_query_port(context, port, &portAttr)) continue;
-                  if (portAttr.state == IBV_PORT_ACTIVE) {
-                    activePort = port;
-                    ibvDevice.hasActivePort = true;
-                    if(portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) {
-                      ibvDevice.isRoce = true;
-                      std::pair<int, std::string> gidInfo (-1, "");
-                      auto res = GetGidIndex(context, portAttr.gid_tbl_len, activePort, gidInfo);
-                      if (res.errType == ERR_NONE) {
-                        ibvDevice.gidIndex = gidInfo.first;
-                        ibvDevice.gidDescriptor = gidInfo.second;
-                      }
-                    }
-                    break;
-                  }
-                }
-              }
-              ibv_close_device(context);
-            }
-          }
-          ibvDevice.busId = "";
-          {
-            std::string device_path(ibvDevice.devicePtr->dev_path);
-            if (std::filesystem::exists(device_path)) {
-              std::string pciPath = std::filesystem::canonical(device_path + "/device").string();
-              std::size_t pos = pciPath.find_last_of('/');
-              if (pos != std::string::npos) {
-                ibvDevice.busId = pciPath.substr(pos + 1);
-              }
-            }
-          }
-          // Get nearest numa node for this device
-          ibvDevice.numaNode = -1;
-          std::filesystem::path devicePath = "/sys/bus/pci/devices/" + ibvDevice.busId + "/numa_node";
-          std::string canonicalPath = std::filesystem::canonical(devicePath).string();
-          if (std::filesystem::exists(canonicalPath)) {
-            std::ifstream file(canonicalPath);
-            if (file.is_open()) {
-              std::string numaNodeStr;
-              std::getline(file, numaNodeStr);
-              int numaNodeVal;
-              if (sscanf(numaNodeStr.c_str(), "%d", &numaNodeVal) == 1)
-                ibvDevice.numaNode = numaNodeVal;
-              file.close();
-            }
-          }
-          ibvDeviceList.push_back(ibvDevice);
-        }
-      }
-      ibv_free_device_list(deviceList);
-      isInitialized = true;
-    }
-    return ibvDeviceList;
-  }
-#endif // NIC_EXEC_ENABLED
-#ifdef NIC_EXEC_ENABLED
-// PCIe-related functions
-//========================================================================================
-  // Prints off PCIe tree
-  static void PrintPCIeTree(PCIeNode    const& node,
-                            std::string const& prefix = "",
-                            bool               isLast = true)
-  {
-    if (!node.address.empty()) {
-      printf("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str());
-      if (!node.description.empty()) {
-        printf("(%s)", node.description.c_str());
-      }
-      printf("\n");
-    }
-    auto const& children = node.children;
-    for (auto it = children.begin(); it != children.end(); ++it) {
-      PrintPCIeTree(*it, prefix + (isLast ? "    " : "│   "), std::next(it) == children.end());
-    }
-  }
-  // Inserts nodes along pcieAddress down a tree starting from root
-  static ErrResult InsertPCIePathToTree(std::string const& pcieAddress,
-                                        std::string const& description,
-                                        PCIeNode&          root)
-  {
-    std::filesystem::path devicePath = "/sys/bus/pci/devices/" + pcieAddress;
-    std::string canonicalPath = std::filesystem::canonical(devicePath).string();
-    if (!std::filesystem::exists(devicePath)) {
-      return {ERR_FATAL, "Device path %s does not exist", devicePath.c_str()};
-    }
-    std::istringstream iss(canonicalPath);
-    std::string token;
-    PCIeNode* currNode = &root;
-    while (std::getline(iss, token, '/')) {
-      auto it = (currNode->children.insert(PCIeNode(token))).first;
-      currNode = const_cast<PCIeNode*>(&(*it));
-    }
-    currNode->description = description;
-    return ERR_NONE;
-  }
-  // Returns root node for PCIe tree.  Constructed on first use
-  static PCIeNode* GetPCIeTreeRoot()
-  {
-    static bool isInitialized = false;
-    static PCIeNode pcieRoot;
-    // Build PCIe tree on first use
-    if (!isInitialized) {
-      // Add NICs to the tree
-      int numNics = GetNumExecutors(EXE_NIC);
-      auto const& ibvDeviceList = GetIbvDeviceList();
-      for (IbvDevice const& ibvDevice : ibvDeviceList) {
-        if (!ibvDevice.hasActivePort || ibvDevice.busId == "") continue;
-        InsertPCIePathToTree(ibvDevice.busId, ibvDevice.name, pcieRoot);
-      }
-      // Add GPUs to the tree
-      int numGpus = GetNumExecutors(EXE_GPU_GFX);
-      for (int i = 0; i < numGpus; ++i) {
-        char hipPciBusId[64];
-        if (hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i) == hipSuccess) {
-          InsertPCIePathToTree(hipPciBusId, "GPU " + std::to_string(i), pcieRoot);
-        }
-      }
-#ifdef VERBS_DEBUG
-      PrintPCIeTree(pcieRoot);
-#endif
-      isInitialized = true;
-    }
-    return &pcieRoot;
-  }
-  // Finds the lowest common ancestor in PCIe tree between two nodes
-  static PCIeNode const* GetLcaBetweenNodes(PCIeNode    const* root,
-                                            std::string const& node1Address,
-                                            std::string const& node2Address)
-  {
-    if (!root || root->address == node1Address || root->address == node2Address)
-      return root;
-    PCIeNode const* lcaFound1 = nullptr;
-    PCIeNode const* lcaFound2 = nullptr;
-    // Recursively iterate over children
-    for (auto const& child : root->children) {
-      PCIeNode const* lca = GetLcaBetweenNodes(&child, node1Address, node2Address);
-      if (!lca) continue;
-      if (!lcaFound1) {
-        // First time found
-        lcaFound1 = lca;
-      } else {
-        // Second time found
-        lcaFound2 = lca;
-        break;
-      }
-    }
-    // If two children were found, then current node is the lowest common ancestor
-    return (lcaFound1 && lcaFound2) ? root : lcaFound1;
-  }
-  // Gets the depth of an node in the PCIe tree
-  static int GetLcaDepth(std::string const&     targetBusID,
-                         PCIeNode const* const& node,
-                         int                    depth = 0)
-  {
-    if (!node) return -1;
-    if (targetBusID == node->address) return depth;
-    for (auto const& child : node->children) {
-      int distance = GetLcaDepth(targetBusID, &child, depth + 1);
-      if (distance != -1)
-        return distance;
-    }
-    return -1;
-  }
-  // Function to extract the bus number from a PCIe address (domain:bus:device.function)
-  static int ExtractBusNumber(std::string const& pcieAddress)
-  {
-    int domain, bus, device, function;
-    char delimiter;
-    std::istringstream iss(pcieAddress);
-    iss >> std::hex >> domain >> delimiter >> bus >> delimiter >> device >> delimiter >> function;
-    if (iss.fail()) {
-#ifdef VERBS_DEBUG
-      printf("Invalid PCIe address format: %s\n", pcieAddress.c_str());
-#endif
-      return -1;
-    }
-    return bus;
-  }
-  // Function to compute the distance between two bus IDs
-  static int GetBusIdDistance(std::string const& pcieAddress1,
-                              std::string const& pcieAddress2)
-  {
-    int bus1 = ExtractBusNumber(pcieAddress1);
-    int bus2 = ExtractBusNumber(pcieAddress2);
-    return (bus1 < 0 || bus2 < 0) ? -1 : std::abs(bus1 - bus2);
-  }
-  // Given a target busID and a set of candidate devices, returns a set of indices
-  // that is "closest" to the target
-  static std::set<int> GetNearestDevicesInTree(std::string              const& targetBusId,
-                                               std::vector<std::string> const& candidateBusIdList)
-  {
-    int maxDepth = -1;
-    int minDistance = std::numeric_limits<int>::max();
-    std::set<int> matches = {};
-    // Loop over the candidates to find the ones with the lowest common ancestor (LCA)
-    for (int i = 0; i < candidateBusIdList.size(); i++) {
-      std::string const& candidateBusId = candidateBusIdList[i];
-      if (candidateBusId == "") continue;
-      PCIeNode const* lca = GetLcaBetweenNodes(GetPCIeTreeRoot(), targetBusId, candidateBusId);
-      if (!lca) continue;
-      int depth = GetLcaDepth(lca->address, GetPCIeTreeRoot());
-      int currDistance = GetBusIdDistance(targetBusId, candidateBusId);
-      // When more than one LCA match is found, choose the one with smallest busId difference
-      // NOTE: currDistance could be -1, which signals problem with parsing, however still
-      //       remains a valid "closest" candidate, so is included
-      if (depth > maxDepth || (depth == maxDepth && depth >= 0 && currDistance < minDistance)) {
-        maxDepth = depth;
-        matches.clear();
-        matches.insert(i);
-        minDistance = currDistance;
-      } else if (depth == maxDepth && depth >= 0 && currDistance == minDistance) {
-        matches.insert(i);
-      }
-    }
-    return matches;
-  }
-#endif // NIC_EXEC_ENABLED
-#ifdef NIC_EXEC_ENABLED
-// IB Verbs-related functions
-//========================================================================================
-  // Create a queue pair
-  static ErrResult CreateQueuePair(ConfigOptions const& cfg,
-                                   struct ibv_pd*       pd,
-                                   struct ibv_cq*       cq,
-                                   struct ibv_qp*&      qp)
-  {
-    // Set queue pair attributes
-    struct ibv_qp_init_attr attr = {};
-    attr.qp_type          = IBV_QPT_RC;                  // Set type to reliable connection
-    attr.send_cq          = cq;                          // Send completion queue
-    attr.recv_cq          = cq;                          // Recv completion queue
-    attr.cap.max_send_wr  = cfg.nic.maxSendWorkReq;      // Max send work requests
-    attr.cap.max_recv_wr  = cfg.nic.maxRecvWorkReq;      // Max recv work requests
-    attr.cap.max_send_sge = 1;                           // Max send scatter-gather entries
-    attr.cap.max_recv_sge = 1;                           // Max recv scatter-gather entries
-    qp = ibv_create_qp(pd, &attr);
-    if (qp == NULL)
-      return {ERR_FATAL, "Error while creating QP"};
-    return ERR_NONE;
-  }
-  // Initialize a queue pair
-  static ErrResult InitQueuePair(struct ibv_qp* qp,
-                                 uint8_t        port,
-                                 unsigned       flags)
-  {
-    struct ibv_qp_attr attr = {};                        // Clear all attributes
-    attr.qp_state        = IBV_QPS_INIT;                 // Set the QP state to INIT
-    attr.pkey_index      = 0;                            // Set the partition key index to 0
-    attr.port_num        = port;                         // Set the port number to the defined IB_PORT
-    attr.qp_access_flags = flags;                        // Set the QP access flags to the provided flags
-    int ret = ibv_modify_qp(qp, &attr,
-                            IBV_QP_STATE      |          // Modify the QP state
-                            IBV_QP_PKEY_INDEX |          // Modify the partition key index
-                            IBV_QP_PORT       |          // Modify the port number
-                            IBV_QP_ACCESS_FLAGS);        // Modify the access flags
-    if (ret != 0)
-      return {ERR_FATAL, "Error during QP Init. IB Verbs Error code: %d", ret};
-    return ERR_NONE;
-  }
-  // Transition QueuePair to Ready to Receive State
-  static ErrResult TransitionQpToRtr(ibv_qp*         qp,
-                                     uint16_t const& dlid,
-                                     uint32_t const& dqpn,
-                                     ibv_gid  const& gid,
-                                     uint8_t  const& gidIndex,
-                                     uint8_t  const& port,
-                                     bool     const& isRoCE,
-                                     ibv_mtu  const& mtu)
-  {
-    // Prepare QP attributes
-    struct ibv_qp_attr attr = {};
-    attr.qp_state           = IBV_QPS_RTR;
-    attr.path_mtu           = mtu;
-    attr.rq_psn             = 0;
-    attr.max_dest_rd_atomic = 1;
-    attr.min_rnr_timer      = 12;
-    if (isRoCE) {
-      attr.ah_attr.is_global                     = 1;
-      attr.ah_attr.grh.dgid.global.subnet_prefix = gid.global.subnet_prefix;
-      attr.ah_attr.grh.dgid.global.interface_id  = gid.global.interface_id;
-      attr.ah_attr.grh.flow_label                = 0;
-      attr.ah_attr.grh.sgid_index                = gidIndex;
-      attr.ah_attr.grh.hop_limit                 = 255;
-    } else {
-      attr.ah_attr.is_global = 0;
-      attr.ah_attr.dlid      = dlid;
-    }
-    attr.ah_attr.sl            = 0;
-    attr.ah_attr.src_path_bits = 0;
-    attr.ah_attr.port_num      = port;
-    attr.dest_qp_num           = dqpn;
-    // Modify the QP
-    int ret = ibv_modify_qp(qp, &attr,
-                            IBV_QP_STATE              |
-                            IBV_QP_AV                 |
-                            IBV_QP_PATH_MTU           |
-                            IBV_QP_DEST_QPN           |
-                            IBV_QP_RQ_PSN             |
-                            IBV_QP_MAX_DEST_RD_ATOMIC |
-                            IBV_QP_MIN_RNR_TIMER);
-    if (ret != 0)
-      return {ERR_FATAL, "Error during QP RTR. IB Verbs Error code: %d", ret};
-    return ERR_NONE;
-  }
-  // Transition QueuePair to Ready to Send state
-  static ErrResult TransitionQpToRts(struct ibv_qp *qp)
-  {
-    struct ibv_qp_attr attr = {};
-    attr.qp_state           = IBV_QPS_RTS;
-    attr.sq_psn             = 0;
-    attr.timeout            = 14;
-    attr.retry_cnt          = 7;
-    attr.rnr_retry          = 7;
-    attr.max_rd_atomic      = 1;
-    int ret = ibv_modify_qp(qp, &attr,
-                            IBV_QP_STATE     |
-                            IBV_QP_TIMEOUT   |
-                            IBV_QP_RETRY_CNT |
-                            IBV_QP_RNR_RETRY |
-                            IBV_QP_SQ_PSN    |
-                            IBV_QP_MAX_QP_RD_ATOMIC);
-    if (ret != 0)
-      return {ERR_FATAL, "Error during QP RTS. IB Verbs Error code: %d", ret};
-    return ERR_NONE;
-  }
-  static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
-                                               ExeDevice     const& srcExeDevice,
-                                               Transfer      const& t,
-                                               TransferResources&   rss)
-  {
-    // Switch to the closest NUMA node to this NIC
-    int numaNode = GetIbvDeviceList()[srcExeDevice.exeIndex].numaNode;
-    if (numaNode != -1)
-      numa_run_on_node(numaNode);
-    int const port = cfg.nic.ibPort;
-    // Figure out destination NIC (Accounts for possible remap due to use of EXE_NIC_NEAREST)
-    ExeDevice dstExeDevice;
-    ERR_CHECK(GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice));
-    rss.srcNicIndex = srcExeDevice.exeIndex;
-    rss.dstNicIndex = dstExeDevice.exeIndex;
-    rss.qpCount     = t.numSubExecs;
-    // Check for valid NICs and active ports
-    int numNics = GetNumExecutors(EXE_NIC);
-    if (rss.srcNicIndex < 0 || rss.srcNicIndex >= numNics)
-      return {ERR_FATAL, "SRC NIC index is out of range (%d)", rss.srcNicIndex};
-    if (rss.dstNicIndex < 0 || rss.dstNicIndex >= numNics)
-      return {ERR_FATAL, "DST NIC index is out of range (%d)", rss.dstNicIndex};
-    if (!GetIbvDeviceList()[rss.srcNicIndex].hasActivePort)
-      return {ERR_FATAL, "SRC NIC %d is not active\n", rss.srcNicIndex};
-    if (!GetIbvDeviceList()[rss.dstNicIndex].hasActivePort)
-      return {ERR_FATAL, "DST NIC %d is not active\n", rss.dstNicIndex};
-    // Queue pair flags
-    unsigned int rdmaAccessFlags = (IBV_ACCESS_LOCAL_WRITE    |
-                                    IBV_ACCESS_REMOTE_READ    |
-                                    IBV_ACCESS_REMOTE_WRITE   |
-                                    IBV_ACCESS_REMOTE_ATOMIC);
-    unsigned int rdmaMemRegFlags = rdmaAccessFlags;
-    if (cfg.nic.useRelaxedOrder) rdmaMemRegFlags |= IBV_ACCESS_RELAXED_ORDERING;
-    // Open NIC contexts
-    IBV_PTR_CALL(rss.srcContext, ibv_open_device, GetIbvDeviceList()[rss.srcNicIndex].devicePtr);
-    IBV_PTR_CALL(rss.dstContext, ibv_open_device, GetIbvDeviceList()[rss.dstNicIndex].devicePtr);
-    // Open protection domains
-    IBV_PTR_CALL(rss.srcProtect, ibv_alloc_pd, rss.srcContext);
-    IBV_PTR_CALL(rss.dstProtect, ibv_alloc_pd, rss.dstContext);
-    // Register memory region
-    IBV_PTR_CALL(rss.srcMemRegion, ibv_reg_mr, rss.srcProtect, rss.srcMem[0], rss.numBytes, rdmaMemRegFlags);
-    IBV_PTR_CALL(rss.dstMemRegion, ibv_reg_mr, rss.dstProtect, rss.dstMem[0], rss.numBytes, rdmaMemRegFlags);
-    // Create completion queues
-    IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, cfg.nic.queueSize, NULL, NULL, 0);
-    IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0);
-    // Get port attributes
-    IBV_CALL(ibv_query_port, rss.srcContext, port, &rss.srcPortAttr);
-    IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr);
-    if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer)
-      return {ERR_FATAL, "SRC NIC (%d) and DST NIC (%d) do not have the same link layer", rss.srcNicIndex, rss.dstNicIndex};
-    // Prepare GID index
-    int srcGidIndex = cfg.nic.ibGidIndex;
-    int dstGidIndex = cfg.nic.ibGidIndex;
-    // Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately
-    bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
-    if (isRoCE) {
-      // Try to auto-detect the GID index
-      std::pair<int, std::string> srcGidInfo (srcGidIndex, "");
-      std::pair<int, std::string> dstGidInfo (dstGidIndex, "");
-      ERR_CHECK(GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, cfg.nic.ibPort, srcGidInfo));
-      ERR_CHECK(GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, cfg.nic.ibPort, dstGidInfo));
-      srcGidIndex = srcGidInfo.first;
-      dstGidIndex = dstGidInfo.first;
-      IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
-      IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
-    }
-    // Prepare queue pairs and send elements
-    rss.srcQueuePairs.resize(rss.qpCount);
-    rss.dstQueuePairs.resize(rss.qpCount);
-    rss.sgePerQueuePair.resize(rss.qpCount);
-    rss.sendWorkRequests.resize(rss.qpCount);
-    for (int i = 0; i < rss.qpCount; ++i) {
-      // Create scatter-gather element for the portion of memory assigned to this queue pair
-      ibv_sge sg = {};
-      sg.addr   = (uint64_t)rss.subExecParamCpu[i].src[0];
-      sg.length = rss.subExecParamCpu[i].N * sizeof(float);
-      sg.lkey   = rss.srcMemRegion->lkey;
-      rss.sgePerQueuePair[i] = sg;
-      // Create send work request
-      ibv_send_wr wr = {};
-      wr.wr_id                = i;
-      wr.sg_list              = &rss.sgePerQueuePair[i];
-      wr.num_sge              = 1;
-      wr.opcode               = IBV_WR_RDMA_WRITE;
-      wr.send_flags           = IBV_SEND_SIGNALED;
-      wr.wr.rdma.remote_addr  = (uint64_t)rss.subExecParamCpu[i].dst[0];
-      wr.wr.rdma.rkey         = rss.dstMemRegion->rkey;
-      rss.sendWorkRequests[i] = wr;
-      // Create SRC/DST queue pairs
-      ERR_CHECK(CreateQueuePair(cfg, rss.srcProtect, rss.srcCompQueue, rss.srcQueuePairs[i]));
-      ERR_CHECK(CreateQueuePair(cfg, rss.dstProtect, rss.dstCompQueue, rss.dstQueuePairs[i]));
-      // Initialize SRC/DST queue pairs
-      ERR_CHECK(InitQueuePair(rss.srcQueuePairs[i], port, rdmaAccessFlags));
-      ERR_CHECK(InitQueuePair(rss.dstQueuePairs[i], port, rdmaAccessFlags));
-      // Transition the SRC queue pair to ready to receive
-      ERR_CHECK(TransitionQpToRtr(rss.srcQueuePairs[i], rss.dstPortAttr.lid,
-                                  rss.dstQueuePairs[i]->qp_num, rss.dstGid,
-                                  dstGidIndex, port, isRoCE,
-                                  rss.srcPortAttr.active_mtu));
-      // Transition the SRC queue pair to ready to send
-      ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i]));
-      // Transition the DST queue pair to ready to receive
-      ERR_CHECK(TransitionQpToRtr(rss.dstQueuePairs[i], rss.srcPortAttr.lid,
-                                  rss.srcQueuePairs[i]->qp_num, rss.srcGid,
-                                  srcGidIndex, port, isRoCE,
-                                  rss.dstPortAttr.active_mtu));
-      // Transition the DST queue pair to ready to send
-      ERR_CHECK(TransitionQpToRts(rss.dstQueuePairs[i]));
-    }
-    return ERR_NONE;
-  }
-  static ErrResult TeardownNicTransferResources(TransferResources& rss)
-  {
-    // Deregister memory regions
-    IBV_CALL(ibv_dereg_mr, rss.srcMemRegion);
-    IBV_CALL(ibv_dereg_mr, rss.dstMemRegion);
-    // Destroy queue pairs
-    for (auto srcQueuePair : rss.srcQueuePairs)
-      IBV_CALL(ibv_destroy_qp, srcQueuePair);
-    rss.srcQueuePairs.clear();
-    for (auto dstQueuePair : rss.dstQueuePairs)
-      IBV_CALL(ibv_destroy_qp, dstQueuePair);
-    rss.dstQueuePairs.clear();
-    // Destroy completion queues
-    IBV_CALL(ibv_destroy_cq, rss.srcCompQueue);
-    IBV_CALL(ibv_destroy_cq, rss.dstCompQueue);
-    // Deallocate protection domains
-    IBV_CALL(ibv_dealloc_pd, rss.srcProtect);
-    IBV_CALL(ibv_dealloc_pd, rss.dstProtect);
-    // Destroy context
-    IBV_CALL(ibv_close_device, rss.srcContext);
-    IBV_CALL(ibv_close_device, rss.dstContext);
-    return ERR_NONE;
-  }
-#endif // NIC_EXEC_ENABLED
-// Data validation-related functions
-//========================================================================================
-  // Pseudo-random formula for each element in array
-  static __host__ float PrepSrcValue(int srcBufferIdx, size_t idx)
-  {
-    return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
-  }
-  // Fills a pre-sized buffer with the pattern, based on which src index buffer
-  // Note: Can also generate expected dst buffer
-  static void PrepareReference(ConfigOptions const& cfg, std::vector<float>& cpuBuffer, int bufferIdx)
-  {
-    size_t N = cpuBuffer.size();
-    // Source buffer
-    if (bufferIdx >= 0) {
-      // Use fill pattern if specified
-      size_t patternLen = cfg.data.fillPattern.size();
-      if (patternLen > 0) {
-        size_t copies   = N / patternLen;
-        size_t leftOver = N % patternLen;
-        float* cpuBufferPtr = cpuBuffer.data();
-        for (int i = 0; i < copies; i++) {
-          memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), patternLen * sizeof(float));
-          cpuBufferPtr += patternLen;
-        }
-        if (leftOver)
-          memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
-      } else {
-        for (size_t i = 0; i < N; ++i)
-          cpuBuffer[i] = PrepSrcValue(bufferIdx, i);
-      }
-    } else { // Destination buffer
-      int numSrcs = -bufferIdx - 1;
-      if (numSrcs == 0) {
-        // Note: 0x75757575 = 13323083.0
-        memset(cpuBuffer.data(), MEMSET_CHAR, N * sizeof(float));
-      } else {
-        PrepareReference(cfg, cpuBuffer, 0);
-        if (numSrcs > 1) {
-          std::vector<float> temp(N);
-          for (int i = 1; i < numSrcs; i++) {
-            PrepareReference(cfg, temp, i);
-            for (int j = 0; j < N; j++) {
-              cpuBuffer[i] += temp[i];
-            }
-          }
-        }
-      }
-    }
-  }
-  // Checks that destination buffers match expected values
-  static ErrResult ValidateAllTransfers(ConfigOptions              const& cfg,
-                                        vector<Transfer>           const& transfers,
-                                        vector<TransferResources*> const& transferResources,
-                                        vector<vector<float>>      const& dstReference,
-                                        vector<float>&                    outputBuffer)
-  {
-    float* output;
-    size_t initOffset = cfg.data.byteOffset / sizeof(float);
-    for (auto rss : transferResources) {
-      int transferIdx = rss->transferIdx;
-      Transfer const& t = transfers[transferIdx];
-      size_t N = t.numBytes / sizeof(float);
-      float const* expected = dstReference[t.srcs.size()].data();
-      for (int dstIdx = 0; dstIdx < rss->dstMem.size(); dstIdx++) {
-        if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) {
-          output = (rss->dstMem[dstIdx]) + initOffset;
-        } else {
-          ERR_CHECK(hipMemcpy(outputBuffer.data(), (rss->dstMem[dstIdx]) + initOffset, t.numBytes, hipMemcpyDefault));
-          ERR_CHECK(hipDeviceSynchronize());
-          output = outputBuffer.data();
-        }
-        if (memcmp(output, expected, t.numBytes)) {
-          // Difference found - find first error
-          for (size_t i = 0; i < N; i++) {
-            if (output[i] != expected[i]) {
-              return {ERR_FATAL, "Transfer %d: Unexpected mismatch at index %lu of destination %d: Expected %10.5f Actual: %10.5f",
-                transferIdx, i, dstIdx, expected[i], output[i]};
-            }
-          }
-          return {ERR_FATAL, "Transfer %d: Unexpected output mismatch for destination %d", transferIdx, dstIdx};
-        }
-      }
-    }
-    return ERR_NONE;
-  }
-// Preparation-related functions
-//========================================================================================
-  // Prepares input parameters for each subexecutor
-  // Determines how sub-executors will split up the work
-  // Initializes counters
-  static ErrResult PrepareSubExecParams(ConfigOptions const& cfg,
-                                        Transfer      const& transfer,
-                                        TransferResources&   rss)
-  {
-    // Each subExecutor needs to know src/dst pointers and how many elements to transfer
-    // Figure out the sub-array each subExecutor works on for this Transfer
-    // - Partition N as evenly as possible, but try to keep subarray sizes as multiples of data.blockBytes
-    //   except the very last one, for alignment reasons
-    size_t const N              = transfer.numBytes / sizeof(float);
-    int    const initOffset     = cfg.data.byteOffset / sizeof(float);
-    int    const targetMultiple = cfg.data.blockBytes / sizeof(float);
-    // In some cases, there may not be enough data for all subExectors
-    int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
-                                         (size_t)transfer.numSubExecs);
-    vector<SubExecParam>& subExecParam = rss.subExecParamCpu;
-    subExecParam.clear();
-    subExecParam.resize(transfer.numSubExecs);
-    size_t assigned = 0;
-    for (int i = 0; i < transfer.numSubExecs; ++i) {
-      SubExecParam& p  = subExecParam[i];
-      p.numSrcs        = rss.srcMem.size();
-      p.numDsts        = rss.dstMem.size();
-      p.startCycle     = 0;
-      p.stopCycle      = 0;
-      p.hwId           = 0;
-      p.xccId          = 0;
-      // In single team mode, subexecutors stripe across the entire array
-      if (cfg.gfx.useSingleTeam && transfer.exeDevice.exeType == EXE_GPU_GFX) {
-        p.N        = N;
-        p.teamSize = transfer.numSubExecs;
-        p.teamIdx  = i;
-        for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = rss.srcMem[iSrc] + initOffset;
-        for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = rss.dstMem[iDst] + initOffset;
-      } else {
-        // Otherwise, each subexecutor works on separate subarrays
-        int    const subExecLeft = std::max(0, maxSubExecToUse - i);
-        size_t const leftover    = N - assigned;
-        size_t const roundedN    = (leftover + targetMultiple - 1) / targetMultiple;
-        p.N        = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
-        p.teamSize = 1;
-        p.teamIdx  = 0;
-        for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = rss.srcMem[iSrc] + initOffset + assigned;
-        for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = rss.dstMem[iDst] + initOffset + assigned;
-        assigned += p.N;
-      }
-      p.preferredXccId = transfer.exeSubIndex;
-      // Override if XCC table has been specified
-      vector<vector<int>> const& table = cfg.gfx.prefXccTable;
-      if (transfer.exeDevice.exeType == EXE_GPU_GFX && transfer.exeSubIndex == -1 && !table.empty() &&
-          transfer.dsts.size() == 1 && IsGpuMemType(transfer.dsts[0].memType)) {
-        if (table.size() <= transfer.exeDevice.exeIndex ||
-            table[transfer.exeDevice.exeIndex].size() <= transfer.dsts[0].memIndex) {
-          return {ERR_FATAL, "[gfx.xccPrefTable] is too small"};
-        }
-        p.preferredXccId = table[transfer.exeDevice.exeIndex][transfer.dsts[0].memIndex];
-        if (p.preferredXccId < 0 || p.preferredXccId >= GetNumExecutorSubIndices(transfer.exeDevice)) {
-          return {ERR_FATAL, "[gfx.xccPrefTable] defines out-of-bound XCC index %d", p.preferredXccId};
-        }
-      }
-    }
-    // Clear counters
-    rss.totalDurationMsec = 0.0;
-    return ERR_NONE;
-  }
-  // Prepare each executor
-  // Allocates memory for src/dst, prepares subexecutors, executor-specific data structures
-  static ErrResult PrepareExecutor(ConfigOptions    const& cfg,
-                                   vector<Transfer> const& transfers,
-                                   ExeDevice        const& exeDevice,
-                                   ExeInfo&                exeInfo)
-  {
-    exeInfo.totalDurationMsec = 0.0;
-    // Loop over each transfer this executor is involved in
-    for (auto& rss : exeInfo.resources) {
-      Transfer const& t = transfers[rss.transferIdx];
-      rss.numBytes = t.numBytes;
-      // Allocate source memory
-      rss.srcMem.resize(t.srcs.size());
-      for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
-        MemDevice const& srcMemDevice = t.srcs[iSrc];
-        // Ensure executing GPU can access source memory
-        if (IsGpuExeType(exeDevice.exeType) && IsGpuMemType(srcMemDevice.memType) &&
-            srcMemDevice.memIndex != exeDevice.exeIndex) {
-          ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex));
-        }
-        ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc]));
-      }
-      // Allocate destination memory
-      rss.dstMem.resize(t.dsts.size());
-      for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
-        MemDevice const& dstMemDevice = t.dsts[iDst];
-        // Ensure executing GPU can access destination memory
-        if (IsGpuExeType(exeDevice.exeType) && IsGpuMemType(dstMemDevice.memType) &&
-            dstMemDevice.memIndex != exeDevice.exeIndex) {
-          ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex));
-        }
-        ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst]));
-      }
-      if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
-#if !defined(__NVCC__)
-        // Collect HSA agent information
-        hsa_amd_pointer_info_t info;
-        info.size = sizeof(info);
-        ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL));
-        rss.dstAgent = info.agentOwner;
-        ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
-        rss.srcAgent = info.agentOwner;
-        // Create HSA completion signal
-        ERR_CHECK(hsa_signal_create(1, 0, NULL, &rss.signal));
-        if (t.exeSubIndex != -1)
-          rss.sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
-#endif
-      }
-      // Prepare subexecutor parameters
-      ERR_CHECK(PrepareSubExecParams(cfg, t, rss));
-    }
-    // Prepare additional requirements for GPU-based executors
-    if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
-      ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
-      // Determine how many streams to use
-      int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA ||
-                                   (exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream))
-        ? exeInfo.resources.size() : 1;
-      exeInfo.streams.resize(numStreamsToUse);
-      // Create streams
-      for (int i = 0; i < numStreamsToUse; ++i) {
-        if (cfg.gfx.cuMask.size()) {
-#if !defined(__NVCC__)
-          ERR_CHECK(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], cfg.gfx.cuMask.size(),
-                                                 cfg.gfx.cuMask.data()));
-#else
-          return {ERR_FATAL, "CU Masking in not supported on NVIDIA hardware"};
-#endif
-        } else {
-          ERR_CHECK(hipStreamCreate(&exeInfo.streams[i]));
-        }
-      }
-      if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
-        exeInfo.startEvents.resize(numStreamsToUse);
-        exeInfo.stopEvents.resize(numStreamsToUse);
-        for (int i = 0; i < numStreamsToUse; ++i) {
-          ERR_CHECK(hipEventCreate(&exeInfo.startEvents[i]));
-          ERR_CHECK(hipEventCreate(&exeInfo.stopEvents[i]));
-        }
-      }
-    }
-    // Prepare for GPU GFX executor
-    if (exeDevice.exeType == EXE_GPU_GFX) {
-      // Allocate one contiguous chunk of GPU memory for threadblock parameters
-      // This allows support for executing one transfer per stream, or all transfers in a single stream
-#if !defined(__NVCC__)
-      MemType memType = MEM_GPU;      // AMD hardware can directly access GPU memory from host
-#else
-      MemType memType = MEM_MANAGED;  // NVIDIA hardware requires managed memory to access from host
-#endif
-      ERR_CHECK(AllocateMemory({memType, exeDevice.exeIndex}, exeInfo.totalSubExecs * sizeof(SubExecParam),
-                               (void**)&exeInfo.subExecParamGpu));
-      // Create subexecutor parameter array for entire executor
-      exeInfo.subExecParamCpu.clear();
-      exeInfo.numSubIndices = GetNumExecutorSubIndices(exeDevice);
-#if defined(__NVCC__)
-      exeInfo.wallClockRate = 1000000;
-#else
-      ERR_CHECK(hipDeviceGetAttribute(&exeInfo.wallClockRate, hipDeviceAttributeWallClockRate,
-                                      exeDevice.exeIndex));
-#endif
-      int transferOffset = 0;
-      if (cfg.gfx.useMultiStream || cfg.gfx.blockOrder == 0) {
-        // Threadblocks are ordered sequentially one transfer at a time
-        for (auto& rss : exeInfo.resources) {
-          Transfer const& t = transfers[rss.transferIdx];
-          rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
-          for (auto p : rss.subExecParamCpu) {
-            rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
-            exeInfo.subExecParamCpu.push_back(p);
-            transferOffset++;
-          }
-        }
-      } else if (cfg.gfx.blockOrder == 1) {
-        // Interleave threadblocks of different Transfers
-        for (int subExecIdx = 0; exeInfo.subExecParamCpu.size() < exeInfo.totalSubExecs; ++subExecIdx) {
-          for (auto& rss : exeInfo.resources) {
-            Transfer const& t = transfers[rss.transferIdx];
-            if (subExecIdx < t.numSubExecs) {
-              rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
-              exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[subExecIdx]);
-            }
-          }
-        }
-      } else if (cfg.gfx.blockOrder == 2) {
-        // Build randomized threadblock list
-        std::vector<std::pair<int,int>> indices;
-        for (int i = 0; i < exeInfo.resources.size(); i++) {
-          auto const& rss = exeInfo.resources[i];
-          Transfer const& t = transfers[rss.transferIdx];
-          for (int j = 0; j < t.numSubExecs; j++)
-            indices.push_back(std::make_pair(i,j));
-        }
-        std::random_device rd;
-        std::default_random_engine gen(rd());
-        std::shuffle(indices.begin(), indices.end(), gen);
-        // Build randomized threadblock list
-        for (auto p : indices) {
-          auto& rss = exeInfo.resources[p.first];
-          rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
-          exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[p.second]);
-        }
-      }
-      // Copy sub executor parameters to GPU
-      ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
-      ERR_CHECK(hipMemcpy(exeInfo.subExecParamGpu,
-                          exeInfo.subExecParamCpu.data(),
-                          exeInfo.totalSubExecs * sizeof(SubExecParam),
-                          hipMemcpyHostToDevice));
-      ERR_CHECK(hipDeviceSynchronize());
-    }
-    // Prepare for NIC-based executors
-    if (IsNicExeType(exeDevice.exeType)) {
-#ifdef NIC_EXEC_ENABLED
-      for (auto& rss : exeInfo.resources) {
-        Transfer const& t = transfers[rss.transferIdx];
-        ERR_CHECK(PrepareNicTransferResources(cfg, exeDevice, t, rss));
-      }
-#else
-      return {ERR_FATAL, "RDMA executor is not supported"};
-#endif
-    }
-    return ERR_NONE;
-  }
-// Teardown-related functions
-//========================================================================================
-  // Clean up all resources
-  static ErrResult TeardownExecutor(ConfigOptions    const& cfg,
-                                    ExeDevice        const& exeDevice,
-                                    vector<Transfer> const& transfers,
-                                    ExeInfo&                exeInfo)
-  {
-    // Loop over each transfer this executor is involved in
-    for (auto& rss : exeInfo.resources) {
-      Transfer const& t = transfers[rss.transferIdx];
-      // Deallocate source memory
-      for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
-        ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, rss.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
-      }
-      // Deallocate destination memory
-      for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
-        ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, rss.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
-      }
-      // Destroy HSA signal for DMA executor
-#if !defined(__NVCC__)
-      if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
-        ERR_CHECK(hsa_signal_destroy(rss.signal));
-      }
-#endif
-      // Destroy NIC related resources
-#ifdef NIC_EXEC_ENABLED
-      if (IsNicExeType(exeDevice.exeType)) {
-        ERR_CHECK(TeardownNicTransferResources(rss));
-      }
-#endif
-    }
-    // Teardown additional requirements for GPU-based executors
-    if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
-      for (auto stream : exeInfo.streams)
-        ERR_CHECK(hipStreamDestroy(stream));
-      if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
-        for (auto event : exeInfo.startEvents)
-          ERR_CHECK(hipEventDestroy(event));
-        for (auto event : exeInfo.stopEvents)
-          ERR_CHECK(hipEventDestroy(event));
-      }
-    }
-    if (exeDevice.exeType == EXE_GPU_GFX) {
-#if !defined(__NVCC__)
-      MemType memType = MEM_GPU;
-#else
-      MemType memType = MEM_MANAGED;
-#endif
-      ERR_CHECK(DeallocateMemory(memType, exeInfo.subExecParamGpu, exeInfo.totalSubExecs * sizeof(SubExecParam)));
-    }
-    return ERR_NONE;
-  }
-// CPU Executor-related functions
-//========================================================================================
-  // Kernel for CPU execution (run by a single subexecutor)
-  static void CpuReduceKernel(SubExecParam const& p, int numSubIterations)
-  {
-    if (p.N == 0) return;
-    int subIteration = 0;
-    do {
-      int const& numSrcs = p.numSrcs;
-      int const& numDsts = p.numDsts;
-      if (numSrcs == 0) {
-        for (int i = 0; i < numDsts; ++i) {
-          memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
-          //for (int j = 0; j < p.N; j++) p.dst[i][j] = MEMSET_VAL;
-        }
-      } else if (numSrcs == 1) {
-        float const* __restrict__ src = p.src[0];
-        if (numDsts == 0) {
-          float sum = 0.0;
-          for (int j = 0; j < p.N; j++)
-            sum += p.src[0][j];
-          // Add a dummy check to ensure the read is not optimized out
-          if (sum != sum) {
-            printf("[ERROR] Nan detected\n");
-          }
-        } else {
-          for (int i = 0; i < numDsts; ++i)
-            memcpy(p.dst[i], src, p.N * sizeof(float));
-        }
-      } else {
-        float sum = 0.0f;
-        for (int j = 0; j < p.N; j++) {
-          sum = p.src[0][j];
-          for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
-          for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
-        }
-      }
-    } while (++subIteration != numSubIterations);
-  }
-  // Execution of a single CPU Transfers
-  static void ExecuteCpuTransfer(int           const  iteration,
-                                 ConfigOptions const& cfg,
-                                 int           const  exeIndex,
-                                 TransferResources&   rss)
-  {
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-    vector<std::thread> childThreads;
-    for (auto const& subExecParam : rss.subExecParamCpu)
-      childThreads.emplace_back(std::thread(CpuReduceKernel, std::cref(subExecParam), cfg.general.numSubIterations));
-    for (auto& subExecThread : childThreads)
-      subExecThread.join();
-    childThreads.clear();
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-    if (iteration >= 0) {
-      rss.totalDurationMsec += deltaMsec;
-      if (cfg.general.recordPerIteration)
-        rss.perIterMsec.push_back(deltaMsec);
-    }
-  }
-  // Execution of a single CPU executor
-  static ErrResult RunCpuExecutor(int           const  iteration,
-                                  ConfigOptions const& cfg,
-                                  int           const  exeIndex,
-                                  ExeInfo&             exeInfo)
-  {
-    numa_run_on_node(exeIndex);
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-    vector<std::thread> asyncTransfers;
-    for (auto& rss : exeInfo.resources) {
-      asyncTransfers.emplace_back(std::thread(ExecuteCpuTransfer,
-                                              iteration,
-                                              std::cref(cfg),
-                                              exeIndex,
-                                              std::ref(rss)));
-    }
-    for (auto& asyncTransfer : asyncTransfers)
-      asyncTransfer.join();
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-    if (iteration >= 0)
-      exeInfo.totalDurationMsec += deltaMsec;
-    return ERR_NONE;
-  }
-#ifdef NIC_EXEC_ENABLED
-  // Execution of a single NIC Transfer
-  static ErrResult ExecuteNicTransfer(int           const  iteration,
-                                      ConfigOptions const& cfg,
-                                      int           const  exeIndex,
-                                      TransferResources&   rss)
-  {
-    // Loop over each of the queue pairs and post the send
-    ibv_send_wr* badWorkReq;
-    for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
-      int error = ibv_post_send(rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
-      if (error)
-        return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n",
-          rss.transferIdx, qpIndex, error};
-    }
-    return ERR_NONE;
-  }
-  // Execution of a single NIC executor
-  static ErrResult RunNicExecutor(int           const  iteration,
-                                  ConfigOptions const& cfg,
-                                  int           const  exeIndex,
-                                  ExeInfo&             exeInfo)
-  {
-    // Switch to the closest NUMA node to this NIC
-    if (cfg.nic.useNuma) {
-      int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
-      if (numaNode != -1)
-        numa_run_on_node(numaNode);
-    }
-    int subIterations = 0;
-    do {
-      auto cpuStart = std::chrono::high_resolution_clock::now();
-      size_t completedTransfers = 0;
-      auto transferCount = exeInfo.resources.size();
-      std::vector<uint8_t> receivedQPs(transferCount);
-      std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
-      // post the sends
-      for (auto i = 0; i < transferCount; i++) {
-        transferTimers[i] = std::chrono::high_resolution_clock::now();
-        ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
-      }
-      // poll for completions
-      do {
-        for (auto i = 0; i < transferCount; i++) {
-          if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
-            auto& rss = exeInfo.resources[i];
-            // Poll the completion queue until all queue pairs are complete
-            // The order of completion doesn't matter because this completion queue is dedicated to this Transfer
-            ibv_wc wc;
-            int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
-            if (nc > 0) {
-              receivedQPs[i]++;
-              if (wc.status != IBV_WC_SUCCESS) {
-                return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx};
-              }
-            } else if (nc < 0) {
-              return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
-            }
-            if(receivedQPs[i] == rss.qpCount) {
-              auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
-              double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-              if (iteration >= 0) {
-                rss.totalDurationMsec += deltaMsec;
-                if (cfg.general.recordPerIteration)
-                  rss.perIterMsec.push_back(deltaMsec);
-              }
-              completedTransfers++;
-            }
-          }
-        }
-      } while(completedTransfers < transferCount);
-      auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-      double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-      if (iteration >= 0)
-        exeInfo.totalDurationMsec += deltaMsec;
-    } while(++subIterations < cfg.general.numSubIterations);
-    return ERR_NONE;
-  }
-#endif
-// GFX Executor-related functions
-//========================================================================================
-  // Converts register value to a CU/SM index
-  static uint32_t GetId(uint32_t hwId)
-  {
-#if defined(__NVCC_)
-    return hwId;
-#else
-    // Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
-    int const shId = (hwId >> 12) &  1;
-    int const cuId = (hwId >>  8) & 15;
-    int const seId = (hwId >> 13) &  3;
-    return (shId << 5) + (cuId << 2) + seId;
-#endif
-  }
-  // Device level timestamp function
-  __device__ int64_t GetTimestamp()
-  {
-#if defined(__NVCC__)
-    int64_t result;
-    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
-    return result;
-#else
-    return wall_clock64();
-#endif
-  }
-  // Helper function for memset
-  template <typename T> __device__ __forceinline__ T      MemsetVal();
-  template <>           __device__ __forceinline__ float  MemsetVal(){ return MEMSET_VAL; };
-  template <>           __device__ __forceinline__ float2 MemsetVal(){ return make_float2(MEMSET_VAL,
-                                                                                          MEMSET_VAL); };
-  template <>           __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL,
-                                                                                          MEMSET_VAL,
-                                                                                          MEMSET_VAL,
-                                                                                          MEMSET_VAL); }
-  // Helper function for temporal/non-temporal reads / writes
-  #define TEMPORAL_NONE  0
-  #define TEMPORAL_LOAD  1
-  #define TEMPORAL_STORE 2
-  #define TEMPORAL_BOTH  3
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Load(float const* src, float& dst) {
-    if (TEMPORAL_MODE & TEMPORAL_LOAD) {
-#if !defined(__NVCC__)
-      dst = __builtin_nontemporal_load(src);
-#endif
-    } else {
-      dst = *src;
-    }
-  }
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Load(float2 const* src, float2& dst) {
-    if (TEMPORAL_MODE & TEMPORAL_LOAD) {
-#if !defined(__NVCC__)
-      dst.x = __builtin_nontemporal_load(&(src->x));
-      dst.y = __builtin_nontemporal_load(&(src->y));
-#endif
-    } else {
-      dst = *src;
-    }
-  }
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Load(float4 const* src, float4& dst) {
-    if (TEMPORAL_MODE & TEMPORAL_LOAD) {
-#if !defined(__NVCC__)
-      dst.x = __builtin_nontemporal_load(&(src->x));
-      dst.y = __builtin_nontemporal_load(&(src->y));
-      dst.z = __builtin_nontemporal_load(&(src->z));
-      dst.w = __builtin_nontemporal_load(&(src->w));
-#endif
-    } else {
-      dst = *src;
-    }
-  }
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Store(float const& src, float* dst) {
-    if (TEMPORAL_MODE & TEMPORAL_STORE) {
-#if !defined(__NVCC__)
-      __builtin_nontemporal_store(src, dst);
-#endif
-    } else {
-      *dst = src;
-    }
-  }
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Store(float2 const& src, float2* dst) {
-    if (TEMPORAL_MODE & TEMPORAL_STORE) {
-#if !defined(__NVCC__)
-      __builtin_nontemporal_store(src.x, &(dst->x));
-      __builtin_nontemporal_store(src.y, &(dst->y));
-#endif
-    } else {
-      *dst = src;
-    }
-  }
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Store(float4 const& src, float4* dst) {
-    if (TEMPORAL_MODE & TEMPORAL_STORE) {
-#if !defined(__NVCC__)
-      __builtin_nontemporal_store(src.x, &(dst->x));
-      __builtin_nontemporal_store(src.y, &(dst->y));
-      __builtin_nontemporal_store(src.z, &(dst->z));
-      __builtin_nontemporal_store(src.w, &(dst->w));
-#endif
-    } else {
-      *dst = src;
-    }
-  }
-  // Kernel for GFX execution
-  template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
-  __global__ void __launch_bounds__(BLOCKSIZE)
-    GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
-  {
-    int64_t startCycle;
-    if (threadIdx.x == 0) startCycle = GetTimestamp();
-    SubExecParam& p = params[blockIdx.y];
-    // Filter by XCC
-#if !defined(__NVCC__)
-    int32_t xccId;
-    GetXccId(xccId);
-    if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
-#endif
-    // Collect data information
-    int32_t const  numSrcs  = p.numSrcs;
-    int32_t const  numDsts  = p.numDsts;
-    PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
-    PACKED_FLOAT*       __restrict__ dstFloatPacked[MAX_DSTS];
-    for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
-    for (int i = 0; i < numDsts; i++) dstFloatPacked[i] = (PACKED_FLOAT*)p.dst[i];
-    // Operate on wavefront granularity
-    int32_t const nTeams   = p.teamSize;             // Number of threadblocks working together on this subarray
-    int32_t const teamIdx  = p.teamIdx;              // Index of this threadblock within the team
-    int32_t const nWaves   = BLOCKSIZE   / warpSize; // Number of wavefronts within this threadblock
-    int32_t const waveIdx  = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
-    int32_t const tIdx     = threadIdx.x % warpSize; // Thread index within wavefront
-    size_t  const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
-    int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
-    switch (waveOrder) {
-    case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves;  teamStride2 = nWaves; waveStride2 = 1     ; break;
-    case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams;  teamStride2 = 1;      waveStride2 = nTeams; break;
-    case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL;  teamStride2 = nWaves; waveStride2 = 1     ; break;
-    case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams;  teamStride2 = nWaves; waveStride2 = 1     ; break;
-    case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL;  teamStride2 = 1;      waveStride2 = nTeams; break;
-    case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves;  teamStride2 = 1;      waveStride2 = nTeams; break;
-    }
-    int subIterations = 0;
-    while (1) {
-      // First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
-      size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
-      size_t const loop1Limit  = numPackedFloat / loop1Stride * loop1Stride;
-      {
-        PACKED_FLOAT val[UNROLL];
-        PACKED_FLOAT tmp[UNROLL];
-        if (numSrcs == 0) {
-          #pragma unroll
-          for (int u = 0; u < UNROLL; u++)
-            val[u] = MemsetVal<PACKED_FLOAT>();
-        }
-        for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
-          // Read sources into memory and accumulate in registers
-          if (numSrcs) {
-            #pragma unroll
-            for (int u = 0; u < UNROLL; u++)
-              Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx + u * unrlStride * warpSize], val[u]);
-            for (int s = 1; s < numSrcs; s++) {
-              #pragma unroll
-              for (int u = 0; u < UNROLL; u++)
-                Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx + u * unrlStride * warpSize], tmp[u]);
-              #pragma unroll
-              for (int u = 0; u < UNROLL; u++)
-                val[u] += tmp[u];
-            }
-          }
-          // Write accumulation to all outputs
-          for (int d = 0; d < numDsts; d++) {
-            #pragma unroll
-            for (int u = 0; u < UNROLL; u++)
-              Store<TEMPORAL_MODE>(val[u], &dstFloatPacked[d][idx + u * unrlStride * warpSize]);
-          }
-        }
-      }
-      // Second loop: Deal with remaining PACKED_FLOAT
-      {
-        if (loop1Limit < numPackedFloat) {
-          PACKED_FLOAT val, tmp;
-          if (numSrcs == 0) val = MemsetVal<PACKED_FLOAT>();
-          size_t const loop2Stride = nTeams * nWaves * warpSize;
-          for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
-               idx < numPackedFloat; idx += loop2Stride) {
-            if (numSrcs) {
-              Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx], val);
-              for (int s = 1; s < numSrcs; s++) {
-                Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx], tmp);
-                val += tmp;
-              }
-            }
-            for (int d = 0; d < numDsts; d++)
-              Store<TEMPORAL_MODE>(val, &dstFloatPacked[d][idx]);
-          }
-        }
-      }
-      // Third loop; Deal with remaining floats
-      {
-        if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
-          float val, tmp;
-          if (numSrcs == 0) val = MemsetVal<float>();
-          size_t const loop3Stride = nTeams * nWaves * warpSize;
-          for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
-            if (numSrcs) {
-              Load<TEMPORAL_MODE>(&p.src[0][idx], val);
-              for (int s = 1; s < numSrcs; s++) {
-                Load<TEMPORAL_MODE>(&p.src[s][idx], tmp);
-                val += tmp;
-              }
-            }
-            for (int d = 0; d < numDsts; d++)
-              Store<TEMPORAL_MODE>(val, &p.dst[d][idx]);
-          }
-        }
-      }
-      if (++subIterations == numSubIterations) break;
-    }
-    // Wait for all threads to finish
-    __syncthreads();
-    if (threadIdx.x == 0) {
-      __threadfence_system();
-      p.stopCycle  = GetTimestamp();
-      p.startCycle = startCycle;
-      GetHwId(p.hwId);
-      GetXccId(p.xccId);
-    }
-  }
-#define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD)    \
-  {GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>,  \
-   GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>,  \
-   GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_STORE>, \
-   GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_BOTH>}
-#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL)        \
-  {GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float),  \
-   GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float2), \
-   GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float4)}
-#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)    \
-  {GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1),      \
-   GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 2),      \
-   GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 3),      \
-   GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 4),      \
-   GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 5),      \
-   GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 6),      \
-   GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7),      \
-   GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)}
-  // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
-  typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
-  GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] =
-  {
-    GPU_KERNEL_UNROLL_DECL(64),
-    GPU_KERNEL_UNROLL_DECL(128),
-    GPU_KERNEL_UNROLL_DECL(192),
-    GPU_KERNEL_UNROLL_DECL(256),
-    GPU_KERNEL_UNROLL_DECL(320),
-    GPU_KERNEL_UNROLL_DECL(384),
-    GPU_KERNEL_UNROLL_DECL(448),
-    GPU_KERNEL_UNROLL_DECL(512)
-  };
-  #undef GPU_KERNEL_UNROLL_DECL
-  #undef GPU_KERNEL_DWORD_DECL
-  #undef GPU_KERNEL_TEMPORAL_DECL
-  // Execute a single GPU Transfer (when using 1 stream per Transfer)
-  static ErrResult ExecuteGpuTransfer(int           const  iteration,
-                                      hipStream_t   const  stream,
-                                      hipEvent_t    const  startEvent,
-                                      hipEvent_t    const  stopEvent,
-                                      int           const  xccDim,
-                                      ConfigOptions const& cfg,
-                                      TransferResources&   rss)
-  {
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-    int numSubExecs = rss.subExecParamCpu.size();
-    dim3 const gridSize(xccDim, numSubExecs, 1);
-    dim3 const blockSize(cfg.gfx.blockSize, 1);
-    int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
-                      cfg.gfx.wordSize == 2 ? 1 :
-                                              2;
-    auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
-#if defined(__NVCC__)
-    if (startEvent != NULL)
-      ERR_CHECK(hipEventRecord(startEvent, stream));
-    gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-    if (stopEvent != NULL)
-      ERR_CHECK(hipEventRecord(stopEvent, stream));
-#else
-    hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
-                          0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-#endif
-    ERR_CHECK(hipStreamSynchronize(stream));
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-    if (iteration >= 0) {
-      double deltaMsec = cpuDeltaMsec;
-      if (startEvent != NULL) {
-        float gpuDeltaMsec;
-        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
-        deltaMsec = gpuDeltaMsec;
-      }
-      rss.totalDurationMsec += deltaMsec;
-      if (cfg.general.recordPerIteration) {
-        rss.perIterMsec.push_back(deltaMsec);
-        std::set<std::pair<int,int>> CUs;
-        for (int i = 0; i < numSubExecs; i++) {
-          CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
-                                    GetId(rss.subExecParamGpuPtr[i].hwId)));
-        }
-        rss.perIterCUs.push_back(CUs);
-      }
-    }
-    return ERR_NONE;
-  }
-  // Execute a single GPU executor
-  static ErrResult RunGpuExecutor(int           const  iteration,
-                                  ConfigOptions const& cfg,
-                                  int           const  exeIndex,
-                                  ExeInfo&             exeInfo)
-  {
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-    ERR_CHECK(hipSetDevice(exeIndex));
-    int xccDim = exeInfo.useSubIndices ? exeInfo.numSubIndices : 1;
-    if (cfg.gfx.useMultiStream) {
-      // Launch each Transfer separately in its own stream
-      vector<std::future<ErrResult>> asyncTransfers;
-      for (int i = 0; i < exeInfo.streams.size(); i++) {
-        asyncTransfers.emplace_back(std::async(std::launch::async,
-                                               ExecuteGpuTransfer,
-                                               iteration,
-                                               exeInfo.streams[i],
-                                               cfg.gfx.useHipEvents ? exeInfo.startEvents[i] : NULL,
-                                               cfg.gfx.useHipEvents ? exeInfo.stopEvents[i] : NULL,
-                                               xccDim,
-                                               std::cref(cfg),
-                                               std::ref(exeInfo.resources[i])));
-      }
-      for (auto& asyncTransfer : asyncTransfers)
-        ERR_CHECK(asyncTransfer.get());
-    } else {
-      // Combine all the Transfers into a single kernel launch
-      int numSubExecs = exeInfo.totalSubExecs;
-      dim3 const gridSize(xccDim, numSubExecs, 1);
-      dim3 const blockSize(cfg.gfx.blockSize, 1);
-      hipStream_t stream = exeInfo.streams[0];
-      int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
-                        cfg.gfx.wordSize == 2 ? 1 :
-                                                2;
-      auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
-#if defined(__NVCC__)
-      if (cfg.gfx.useHipEvents)
-        ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
-      gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-      if (cfg.gfx.useHipEvents)
-        ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
-#else
-      hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
-                            cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
-                            cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
-                            exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-#endif
-      ERR_CHECK(hipStreamSynchronize(stream));
-    }
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-    if (iteration >= 0) {
-      if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
-        float gpuDeltaMsec;
-        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
-        exeInfo.totalDurationMsec += gpuDeltaMsec;
-      } else {
-        exeInfo.totalDurationMsec += cpuDeltaMsec;
-      }
-      // Determine timing for each of the individual transfers that were part of this launch
-      if (!cfg.gfx.useMultiStream) {
-        for (int i = 0; i < exeInfo.resources.size(); i++) {
-          TransferResources& rss = exeInfo.resources[i];
-          long long minStartCycle = std::numeric_limits<long long>::max();
-          long long maxStopCycle  = std::numeric_limits<long long>::min();
-          std::set<std::pair<int, int>> CUs;
-          for (auto subExecIdx : rss.subExecIdx) {
-            minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
-            maxStopCycle  = std::max(maxStopCycle,  exeInfo.subExecParamGpu[subExecIdx].stopCycle);
-            if (cfg.general.recordPerIteration) {
-              CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
-                                        GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
-            }
-          }
-          double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
-          rss.totalDurationMsec += deltaMsec;
-          if (cfg.general.recordPerIteration) {
-            rss.perIterMsec.push_back(deltaMsec);
-            rss.perIterCUs.push_back(CUs);
-          }
-        }
-      }
-    }
-    return ERR_NONE;
-  }
-// DMA Executor-related functions
-//========================================================================================
-  // Execute a single DMA Transfer
-  static ErrResult ExecuteDmaTransfer(int           const  iteration,
-                                      bool          const  useSubIndices,
-                                      hipStream_t   const  stream,
-                                      hipEvent_t    const  startEvent,
-                                      hipEvent_t    const  stopEvent,
-                                      ConfigOptions const& cfg,
-                                      TransferResources&   resources)
-  {
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-    int subIterations = 0;
-    if (!useSubIndices && !cfg.dma.useHsaCopy) {
-      if (cfg.dma.useHipEvents)
-        ERR_CHECK(hipEventRecord(startEvent, stream));
-      // Use hipMemcpy
-      do {
-        ERR_CHECK(hipMemcpyAsync(resources.dstMem[0], resources.srcMem[0], resources.numBytes,
-                                 hipMemcpyDefault, stream));
-      } while (++subIterations != cfg.general.numSubIterations);
-      if (cfg.dma.useHipEvents)
-        ERR_CHECK(hipEventRecord(stopEvent, stream));
-      ERR_CHECK(hipStreamSynchronize(stream));
-    } else {
-#if defined(__NVCC__)
-      return {ERR_FATAL, "HSA copy not supported on NVIDIA hardware"};
-#else
-      // Use HSA async copy
-      do {
-        hsa_signal_store_screlease(resources.signal, 1);
-        if (!useSubIndices) {
-          ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0], resources.dstAgent,
-                                              resources.srcMem[0], resources.srcAgent,
-                                              resources.numBytes, 0, NULL,
-                                              resources.signal));
-        } else {
-          HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0], resources.dstAgent,
-                                                       resources.srcMem[0], resources.srcAgent,
-                                                       resources.numBytes, 0, NULL,
-                                                       resources.signal,
-                                                       resources.sdmaEngineId, true));
-        }
-        // Wait for SDMA transfer to complete
-        while(hsa_signal_wait_scacquire(resources.signal,
-                                        HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
-                                        HSA_WAIT_STATE_ACTIVE) >= 1);
-      } while (++subIterations != cfg.general.numSubIterations);
-#endif
-    }
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-    if (iteration >= 0) {
-      double deltaMsec = cpuDeltaMsec;
-      if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
-        float gpuDeltaMsec;
-        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
-        deltaMsec = gpuDeltaMsec;
-      }
-      resources.totalDurationMsec += deltaMsec;
-      if (cfg.general.recordPerIteration)
-        resources.perIterMsec.push_back(deltaMsec);
-    }
-    return ERR_NONE;
-  }
-  // Execute a single DMA executor
-  static ErrResult RunDmaExecutor(int           const  iteration,
-                                  ConfigOptions const& cfg,
-                                  int           const  exeIndex,
-                                  ExeInfo&             exeInfo)
-  {
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-    ERR_CHECK(hipSetDevice(exeIndex));
-    vector<std::future<ErrResult>> asyncTransfers;
-    for (int i = 0; i < exeInfo.resources.size(); i++) {
-      asyncTransfers.emplace_back(std::async(std::launch::async,
-                                             ExecuteDmaTransfer,
-                                             iteration,
-                                             exeInfo.useSubIndices,
-                                             exeInfo.streams[i],
-                                             cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
-                                             cfg.dma.useHipEvents ? exeInfo.stopEvents[i]  : NULL,
-                                             std::cref(cfg),
-                                             std::ref(exeInfo.resources[i])));
-    }
-    for (auto& asyncTransfer : asyncTransfers)
-      ERR_CHECK(asyncTransfer.get());
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
-    if (iteration >= 0)
-      exeInfo.totalDurationMsec += deltaMsec;
-    return ERR_NONE;
-  }
-// Executor-related functions
-//========================================================================================
-  static ErrResult RunExecutor(int           const  iteration,
-                               ConfigOptions const& cfg,
-                               ExeDevice     const& exeDevice,
-                               ExeInfo&             exeInfo)
-  {
-    switch (exeDevice.exeType) {
-    case EXE_CPU:     return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-    case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-    case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-#ifdef NIC_EXEC_ENABLED
-    case EXE_NIC:     return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-#endif
-    default:          return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
-    }
-  }
-} // End of anonymous namespace
-//========================================================================================
-/// @endcond
-  ErrResult::ErrResult(ErrType err) : errType(err), errMsg("") {};
-  ErrResult::ErrResult(hipError_t err)
-  {
-    if (err == hipSuccess) {
-      this->errType = ERR_NONE;
-      this->errMsg  = "";
-    } else {
-      this->errType = ERR_FATAL;
-      this->errMsg  = std::string("HIP Error: ") + hipGetErrorString(err);
-    }
-  }
-#if !defined(__NVCC__)
-  ErrResult::ErrResult(hsa_status_t err)
-  {
-    if (err == HSA_STATUS_SUCCESS) {
-      this->errType = ERR_NONE;
-      this->errMsg  = "";
-    } else {
-      const char *errString = NULL;
-      hsa_status_string(err, &errString);
-      this->errType = ERR_FATAL;
-      this->errMsg  = std::string("HSA Error: ") + errString;
-    }
-  }
-#endif
-  ErrResult::ErrResult(ErrType errType, const char* format, ...)
-  {
-    this->errType = errType;
-    va_list args, args_temp;
-    va_start(args, format);
-    va_copy(args_temp, args);
-    int len = vsnprintf(nullptr, 0, format, args);
-    if (len < 0) {
-      va_end(args_temp);
-      va_end(args);
-    } else {
-      this->errMsg.resize(len);
-      vsnprintf(this->errMsg.data(), len+1, format, args_temp);
-    }
-    va_end(args_temp);
-    va_end(args);
-  }
-  bool RunTransfers(ConfigOptions         const& cfg,
-                    std::vector<Transfer> const& transfers,
-                    TestResults&                 results)
-  {
-    // Clear all errors;
-    auto& errResults = results.errResults;
-    errResults.clear();
-    // Check for valid configuration
-    if (ConfigOptionsHaveErrors(cfg, errResults)) return false;
-    // Check for valid transfers
-    if (TransfersHaveErrors(cfg, transfers, errResults)) return false;
-    // Collect up transfers by executor
-    int minNumSrcs = MAX_SRCS + 1;
-    int maxNumSrcs = 0;
-    size_t maxNumBytes = 0;
-    std::map<ExeDevice, ExeInfo> executorMap;
-    for (int i = 0; i < transfers.size(); i++) {
-      Transfer const& t = transfers[i];
-      ExeDevice exeDevice;
-      ERR_APPEND(GetActualExecutor(cfg, t.exeDevice, exeDevice), errResults);
-      TransferResources resource = {};
-      resource.transferIdx = i;
-      ExeInfo& exeInfo = executorMap[exeDevice];
-      exeInfo.totalBytes    += t.numBytes;
-      exeInfo.totalSubExecs += t.numSubExecs;
-      exeInfo.useSubIndices |= (t.exeSubIndex != -1 || (t.exeDevice.exeType == EXE_GPU_GFX && !cfg.gfx.prefXccTable.empty()));
-      exeInfo.resources.push_back(resource);
-      minNumSrcs  = std::min(minNumSrcs, (int)t.srcs.size());
-      maxNumSrcs  = std::max(maxNumSrcs, (int)t.srcs.size());
-      maxNumBytes = std::max(maxNumBytes, t.numBytes);
-    }
-    // Loop over each executor and prepare
-    // - Allocates memory for each Transfer
-    // - Set up work for subexecutors
-    vector<TransferResources*> transferResources;
-    for (auto& exeInfoPair : executorMap) {
-      ExeDevice const& exeDevice = exeInfoPair.first;
-      ExeInfo&         exeInfo   = exeInfoPair.second;
-      ERR_APPEND(PrepareExecutor(cfg, transfers, exeDevice, exeInfo), errResults);
-      for (auto& resource : exeInfo.resources) {
-        transferResources.push_back(&resource);
-      }
-    }
-    // Prepare reference src/dst arrays - only once for largest size
-    size_t maxN = maxNumBytes / sizeof(float);
-    vector<float> outputBuffer(maxN);
-    vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
-    {
-      vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
-      memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes);
-      for (int numSrcs = 0; numSrcs < maxNumSrcs; numSrcs++) {
-        PrepareReference(cfg, srcReference[numSrcs], numSrcs);
-        for (int i = 0; i < maxN; i++) {
-          dstReference[numSrcs+1][i] = (numSrcs == 0 ? 0 : dstReference[numSrcs][i]) + srcReference[numSrcs][i];
-        }
-      }
-      // Release un-used partial sums
-      for (int numSrcs = 0; numSrcs < minNumSrcs; numSrcs++)
-        dstReference[numSrcs].clear();
-      // Initialize all src memory buffers
-      for (auto resource : transferResources) {
-        for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
-          ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx], srcReference[srcIdx].data(), resource->numBytes,
-                               hipMemcpyDefault), errResults);
-        }
-      }
-    }
-    // Pause before starting when running in iteractive mode
-    if (cfg.general.useInteractive) {
-      printf("Memory prepared:\n");
-      for (int i = 0; i < transfers.size(); i++) {
-        ExeInfo const& exeInfo = executorMap[transfers[i].exeDevice];
-        printf("Transfer %03d:\n", i);
-        for (int iSrc = 0; iSrc < transfers[i].srcs.size(); ++iSrc)
-          printf("  SRC %0d: %p\n", iSrc, transferResources[i]->srcMem[iSrc]);
-        for (int iDst = 0; iDst < transfers[i].dsts.size(); ++iDst)
-          printf("  DST %0d: %p\n", iDst, transferResources[i]->dstMem[iDst]);
-      }
-      printf("Hit <Enter> to continue: ");
-      if (scanf("%*c") != 0) {
-        printf("[ERROR] Unexpected input\n");
-        exit(1);
-      }
-      printf("\n");
-    }
-    // Perform iterations
-    size_t numTimedIterations = 0;
-    double totalCpuTimeSec = 0.0;
-    for (int iteration = -cfg.general.numWarmups; ; iteration++) {
-      // Stop if number of iterations/seconds has reached limit
-      if (cfg.general.numIterations > 0 && iteration >= cfg.general.numIterations) break;
-      if (cfg.general.numIterations < 0 && totalCpuTimeSec > -cfg.general.numIterations) break;
-      // Start CPU timing for this iteration
-      auto cpuStart = std::chrono::high_resolution_clock::now();
-      // Execute all Transfers in parallel
-      std::vector<std::future<ErrResult>> asyncExecutors;
-      for (auto& exeInfoPair : executorMap) {
-        asyncExecutors.emplace_back(std::async(std::launch::async, RunExecutor,
-                                               iteration,
-                                               std::cref(cfg),
-                                               std::cref(exeInfoPair.first),
-                                               std::ref(exeInfoPair.second)));
-      }
-      // Wait for all threads to finish
-      for (auto& asyncExecutor : asyncExecutors) {
-        ERR_APPEND(asyncExecutor.get(), errResults);
-      }
-       // Stop CPU timing for this iteration
-      auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-      double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
-      if (cfg.data.alwaysValidate) {
-        ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
-                   errResults);
-      }
-      if (iteration >= 0) {
-        ++numTimedIterations;
-        totalCpuTimeSec += deltaSec;
-      }
-    }
-    // Pause for interactive mode
-    if (cfg.general.useInteractive) {
-      printf("Transfers complete. Hit <Enter> to continue: ");
-      if (scanf("%*c") != 0)  {
-        printf("[ERROR] Unexpected input\n");
-        exit(1);
-      }
-      printf("\n");
-    }
-    // Validate results
-    if (!cfg.data.alwaysValidate) {
-      ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
-                 errResults);
-    }
-    // Prepare results
-    results.exeResults.clear();
-    results.tfrResults.clear();
-    results.tfrResults.resize(transfers.size());
-    results.numTimedIterations = numTimedIterations;
-    results.totalBytesTransferred = 0;
-    results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / (numTimedIterations * cfg.general.numSubIterations);
-    results.overheadMsec = results.avgTotalDurationMsec;
-    for (auto& exeInfoPair : executorMap) {
-      ExeDevice const& exeDevice = exeInfoPair.first;
-      ExeInfo&         exeInfo   = exeInfoPair.second;
-      // Copy over executor results
-      ExeResult& exeResult = results.exeResults[exeDevice];
-      exeResult.numBytes = exeInfo.totalBytes;
-      exeResult.avgDurationMsec = exeInfo.totalDurationMsec / (numTimedIterations * cfg.general.numSubIterations);
-      exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) /  exeResult.avgDurationMsec;
-      exeResult.sumBandwidthGbPerSec = 0.0;
-      exeResult.transferIdx.clear();
-      results.totalBytesTransferred += exeInfo.totalBytes;
-      results.overheadMsec = std::min(results.overheadMsec, (results.avgTotalDurationMsec -
-                                                             exeResult.avgDurationMsec));
-      // Copy over transfer results
-      for (auto const& rss : exeInfo.resources) {
-        int const transferIdx = rss.transferIdx;
-        exeResult.transferIdx.push_back(transferIdx);
-        TransferResult& tfrResult      = results.tfrResults[transferIdx];
-        tfrResult.exeDevice            = exeDevice;
-#ifdef NIC_EXEC_ENABLED
-        tfrResult.exeDstDevice         = {exeDevice.exeType, rss.dstNicIndex};
-#else
-        tfrResult.exeDstDevice         = exeDevice;
-#endif
-        tfrResult.numBytes             = rss.numBytes;
-        tfrResult.avgDurationMsec      = rss.totalDurationMsec / numTimedIterations;
-        tfrResult.avgBandwidthGbPerSec = (rss.numBytes / 1.0e6) / tfrResult.avgDurationMsec;
-        if (cfg.general.recordPerIteration) {
-          tfrResult.perIterMsec = rss.perIterMsec;
-          tfrResult.perIterCUs  = rss.perIterCUs;
-        }
-        exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec;
-      }
-    }
-    results.avgTotalBandwidthGbPerSec = (results.totalBytesTransferred / 1.0e6) / results.avgTotalDurationMsec;
-    // Teardown executors
-    for (auto& exeInfoPair : executorMap) {
-      ExeDevice const& exeDevice = exeInfoPair.first;
-      ExeInfo&         exeInfo   = exeInfoPair.second;
-      ERR_APPEND(TeardownExecutor(cfg, exeDevice, transfers, exeInfo), errResults);
-    }
-    return true;
-  }
-  int GetIntAttribute(IntAttribute attribute)
-  {
-    switch (attribute) {
-    case ATR_GFX_MAX_BLOCKSIZE: return MAX_BLOCKSIZE;
-    case ATR_GFX_MAX_UNROLL:    return MAX_UNROLL;
-    default:                    return -1;
-    }
-  }
-  std::string GetStrAttribute(StrAttribute attribute)
-  {
-    switch (attribute) {
-    case ATR_SRC_PREP_DESCRIPTION:
-      return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
-    default:
-      return "";
-    }
-  }
-  ErrResult ParseTransfers(std::string            line,
-                           std::vector<Transfer>& transfers)
-  {
-    // Replace any round brackets or '->' with spaces,
-    for (int i = 1; line[i]; i++)
-      if (line[i] == '(' || line[i] == ')' || line[i] == '-'  || line[i] == ':' || line[i] == '>' ) line[i] = ' ';
-    transfers.clear();
-    // Read in number of transfers
-    int numTransfers = 0;
-    std::istringstream iss(line);
-    iss >> numTransfers;
-    if (iss.fail()) return ERR_NONE;
-    // If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
-    // otherwise read triples (srcMem, exeMem, dstMem)
-    bool const advancedMode = (numTransfers < 0);
-    numTransfers = abs(numTransfers);
-    int numSubExecs;
-    std::string srcStr, exeStr, dstStr, numBytesToken;
-    if (!advancedMode) {
-      iss >> numSubExecs;
-      if (numSubExecs < 0 || iss.fail()) {
-        return {ERR_FATAL,
-                "Parsing error: Number of blocks to use (%d) must be non-negative", numSubExecs};
-      }
-    }
-    for (int i = 0; i < numTransfers; i++) {
-      Transfer transfer;
-      if (!advancedMode) {
-        iss >> srcStr >> exeStr >> dstStr;
-        transfer.numSubExecs = numSubExecs;
-        if (iss.fail()) {
-          return {ERR_FATAL,
-            "Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet", i+1};
-        }
-        transfer.numBytes = 0;
-      } else {
-        iss >> srcStr >> exeStr >> dstStr >> transfer.numSubExecs >> numBytesToken;
-        if (iss.fail()) {
-          return {ERR_FATAL,
-            "Parsing error: Unable to read valid Transfer %d (SRC EXE DST $CU #Bytes) tuple", i+1};
-        }
-        if (sscanf(numBytesToken.c_str(), "%lu", &transfer.numBytes) != 1) {
-          return {ERR_FATAL,
-            "Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1};
-        }
-        char units = numBytesToken.back();
-        switch (toupper(units)) {
-        case 'G': transfer.numBytes *= 1024;
-        case 'M': transfer.numBytes *= 1024;
-        case 'K': transfer.numBytes *= 1024;
-        }
-      }
-      ERR_CHECK(ParseMemType(srcStr, transfer.srcs));
-      ERR_CHECK(ParseMemType(dstStr, transfer.dsts));
-      ERR_CHECK(ParseExeType(exeStr, transfer.exeDevice, transfer.exeSubIndex));
-      transfers.push_back(transfer);
-    }
-    return ERR_NONE;
-  }
-  int GetNumExecutors(ExeType exeType)
-  {
-    switch (exeType) {
-    case EXE_CPU:
-      return numa_num_configured_nodes();
-    case EXE_GPU_GFX: case EXE_GPU_DMA:
-    {
-      int numDetectedGpus = 0;
-      hipError_t status = hipGetDeviceCount(&numDetectedGpus);
-      if (status != hipSuccess) numDetectedGpus = 0;
-      return numDetectedGpus;
-    }
-#ifdef NIC_EXEC_ENABLED
-    case EXE_NIC: case EXE_NIC_NEAREST:
-    {
-      return GetIbvDeviceList().size();
-    }
-#endif
-    default:
-      return 0;
-    }
-  }
-  int GetNumSubExecutors(ExeDevice exeDevice)
-  {
-    int const& exeIndex = exeDevice.exeIndex;
-    switch(exeDevice.exeType) {
-    case EXE_CPU:
-    {
-      int numCores = 0;
-      for (int i = 0; i < numa_num_configured_cpus(); i++)
-        if (numa_node_of_cpu(i) == exeIndex) numCores++;
-      return numCores;
-    }
-    case EXE_GPU_GFX:
-    {
-      int numGpus = GetNumExecutors(EXE_GPU_GFX);
-      if (exeIndex < 0 || numGpus <= exeIndex) return 0;
-      int numDeviceCUs = 0;
-      hipError_t status = hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex);
-      if (status != hipSuccess) numDeviceCUs = 0;
-      return numDeviceCUs;
-    }
-    case EXE_GPU_DMA:
-    {
-      return 1;
-    }
-    default:
-      return 0;
-    }
-  }
-  int GetNumExecutorSubIndices(ExeDevice exeDevice)
-  {
-    // Executor subindices are not supported on NVIDIA hardware
-#if defined(__NVCC__)
-    return 0;
-#else
-    int const& exeIndex = exeDevice.exeIndex;
-    switch(exeDevice.exeType) {
-    case EXE_CPU: return 0;
-    case EXE_GPU_GFX:
-    {
-      hsa_agent_t agent;
-      ErrResult err = GetHsaAgent(exeDevice, agent);
-      if (err.errType != ERR_NONE) return 0;
-      int numXccs = 1;
-      if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &numXccs) != HSA_STATUS_SUCCESS)
-        return 1;
-      return numXccs;
-    }
-    case EXE_GPU_DMA:
-    {
-      std::set<int> engineIds;
-      ErrResult err;
-      // Get HSA agent for this GPU
-      hsa_agent_t agent;
-      err = GetHsaAgent(exeDevice, agent);
-      if (err.errType != ERR_NONE) return 0;
-      int numTotalEngines = 0, numEnginesA = 0, numEnginesB = 0;
-      if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_ENG, &numEnginesA)
-          == HSA_STATUS_SUCCESS)
-        numTotalEngines += numEnginesA;
-      if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG, &numEnginesB)
-          == HSA_STATUS_SUCCESS)
-        numTotalEngines += numEnginesB;
-      return numTotalEngines;
-    }
-    default:
-      return 0;
-    }
-#endif
-  }
-  int GetClosestCpuNumaToGpu(int gpuIndex)
-  {
-    // Closest NUMA is not supported on NVIDIA hardware at this time
-#if defined(__NVCC__)
-    return -1;
-#else
-    hsa_agent_t gpuAgent;
-    ErrResult err = GetHsaAgent({EXE_GPU_GFX, gpuIndex}, gpuAgent);
-    if (err.errType != ERR_NONE) return -1;
-    hsa_agent_t closestCpuAgent;
-    if (hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, &closestCpuAgent)
-        == HSA_STATUS_SUCCESS) {
-      int numCpus = GetNumExecutors(EXE_CPU);
-      for (int i = 0; i < numCpus; i++) {
-        hsa_agent_t cpuAgent;
-        err = GetHsaAgent({EXE_CPU, i}, cpuAgent);
-        if (err.errType != ERR_NONE) return -1;
-        if (cpuAgent.handle == closestCpuAgent.handle) return i;
-      }
-    }
-    return -1;
-#endif
-  }
-  int GetClosestCpuNumaToNic(int nicIndex)
-  {
-#ifdef NIC_EXEC_ENABLED
-    int numNics = GetNumExecutors(EXE_NIC);
-    if (nicIndex < 0 || nicIndex >= numNics) return -1;
-    return GetIbvDeviceList()[nicIndex].numaNode;
-#else
-    return -1;
-#endif
-  }
-  int GetClosestNicToGpu(int gpuIndex)
-  {
-#ifdef NIC_EXEC_ENABLED
-    static bool isInitialized = false;
-    static std::vector<int> closestNicId;
-    int numGpus = GetNumExecutors(EXE_GPU_GFX);
-    if (gpuIndex < 0 || gpuIndex >= numGpus) return -1;
-    // Build closest NICs per GPU on first use
-    if (!isInitialized) {
-      closestNicId.resize(numGpus, -1);
-      // Build up list of NIC bus addresses
-      std::vector<std::string> ibvAddressList;
-      auto const& ibvDeviceList = GetIbvDeviceList();
-      for (auto const& ibvDevice : ibvDeviceList)
-        ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : "");
-      // Track how many times a device has been assigned as "closest"
-      // This allows distributed work across devices using multiple ports (sharing the same busID)
-      // NOTE: This isn't necessarily optimal, but likely to work in most cases involving multi-port
-      // Counter example:
-      //
-      //  G0 prefers (N0,N1), picks N0
-      //  G1 prefers (N1,N2), picks N1
-      //  G2 prefers N0,      picks N0
-      //
-      //  instead of G0->N1, G1->N2, G2->N0
-      std::vector<int> assignedCount(ibvDeviceList.size(), 0);
-      // Loop over each GPU to find the closest NIC(s) based on PCIe address
-      for (int i = 0; i < numGpus; i++) {
-        // Collect PCIe address for the GPU
-        char hipPciBusId[64];
-        hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i);
-        if (err != hipSuccess) {
-#ifdef VERBS_DEBUG
-          printf("Failed to get PCI Bus ID for HIP device %d: %s\n", i, hipGetErrorString(err));
-#endif
-          closestNicId[i] = -1;
-          continue;
-        }
-        // Find closest NICs
-        std::set<int> closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList);
-        // Pick the least-used NIC to assign as closest
-        int closestIdx = -1;
-        for (auto idx : closestNicIdxs) {
-          if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx])
-            closestIdx = idx;
-        }
-        // The following will only use distance between bus IDs
-        // to determine the closest NIC to GPU if the PCIe tree approach fails
-        if (closestIdx < 0) {
-#ifdef VERBS_DEBUG
-          printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
-#endif
-          int minDistance = std::numeric_limits<int>::max();
-          for (int j = 0; j < ibvDeviceList.size(); j++) {
-            if (ibvDeviceList[j].busId != "") {
-              int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[j].busId);
-              if (distance < minDistance && distance >= 0) {
-                minDistance = distance;
-                closestIdx = j;
-              }
-            }
-          }
-        }
-        closestNicId[i] = closestIdx;
-        if (closestIdx != -1) assignedCount[closestIdx]++;
-      }
-      isInitialized = true;
-    }
-    return closestNicId[gpuIndex];
-#else
-    return -1;
-#endif
-  }
-// Undefine CUDA compatibility macros
-#if defined(__NVCC__)
-// ROCm specific
-#undef wall_clock64
-#undef gcnArchName
-// Datatypes
-#undef hipDeviceProp_t
-#undef hipError_t
-#undef hipEvent_t
-#undef hipStream_t
-// Enumerations
-#undef hipDeviceAttributeClockRate
-#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
-#undef hipDeviceAttributeMultiprocessorCount
-#undef hipErrorPeerAccessAlreadyEnabled
-#undef hipFuncCachePreferShared
-#undef hipMemcpyDefault
-#undef hipMemcpyDeviceToHost
-#undef hipMemcpyHostToDevice
-#undef hipSuccess
-// Functions
-#undef hipDeviceCanAccessPeer
-#undef hipDeviceEnablePeerAccess
-#undef hipDeviceGetAttribute
-#undef hipDeviceGetPCIBusId
-#undef hipDeviceSetCacheConfig
-#undef hipDeviceSynchronize
-#undef hipEventCreate
-#undef hipEventDestroy
-#undef hipEventElapsedTime
-#undef hipEventRecord
-#undef hipFree
-#undef hipGetDeviceCount
-#undef hipGetDeviceProperties
-#undef hipGetErrorString
-#undef hipHostFree
-#undef hipHostMalloc
-#undef hipMalloc
-#undef hipMallocManaged
-#undef hipMemcpy
-#undef hipMemcpyAsync
-#undef hipMemset
-#undef hipMemsetAsync
-#undef hipSetDevice
-#undef hipStreamCreate
-#undef hipStreamDestroy
-#undef hipStreamSynchronize
-#endif
-// Kernel macros
-#undef GetHwId
-#undef GetXccId
-// Undefine helper macros
-#undef ERR_CHECK
-#undef ERR_APPEND
-}