Commit 9658305f authored by gilbertlee-amd, committed by GitHub

Header-only TransferBench library refactor (#134)

parent b56d4817
/*
Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once

#include <cstdlib>
#include <iostream>

// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
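// Usage sketch (illustrative; the buffer size is arbitrary): wrapping every
// HIP runtime call so failures abort with file/line context, e.g.
//   float* devBuf;
//   HIP_CALL(hipMalloc((void**)&devBuf, 1024 * sizeof(float)));
//   HIP_CALL(hipFree(devBuf));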
#if defined(__NVCC__)
#include <cuda_runtime.h>
// ROCm-specific symbols mapped to CUDA equivalents
#define wall_clock64 clock64
#define gcnArchName name
// Datatypes
#define hipDeviceProp_t cudaDeviceProp
#define hipError_t cudaError_t
#define hipEvent_t cudaEvent_t
#define hipStream_t cudaStream_t
// Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMaxSharedMemoryPerMultiprocessor cudaDevAttrMaxSharedMemoryPerMultiprocessor
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault
#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
#define hipSuccess cudaSuccess
// Functions
#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer
#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess
#define hipDeviceGetAttribute cudaDeviceGetAttribute
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig
#define hipDeviceSynchronize cudaDeviceSynchronize
#define hipEventCreate cudaEventCreate
#define hipEventDestroy cudaEventDestroy
#define hipEventElapsedTime cudaEventElapsedTime
#define hipEventRecord cudaEventRecord
#define hipFree cudaFree
#define hipGetDeviceCount cudaGetDeviceCount
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipGetErrorString cudaGetErrorString
#define hipHostFree cudaFreeHost
#define hipHostMalloc cudaMallocHost
#define hipMalloc cudaMalloc
#define hipMallocManaged cudaMallocManaged
#define hipMemcpy cudaMemcpy
#define hipMemcpyAsync cudaMemcpyAsync
#define hipMemset cudaMemset
#define hipMemsetAsync cudaMemsetAsync
#define hipSetDevice cudaSetDevice
#define hipStreamCreate cudaStreamCreate
#define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
a.w += b.w;
return a;
}
#else
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
#endif
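// With the aliases above, TransferBench is written once against the HIP API
// and compiles under NVCC as well; a minimal sketch (stream name illustrative):
//   hipStream_t stream;                      // cudaStream_t under __NVCC__
//   HIP_CALL(hipStreamCreate(&stream));      // cudaStreamCreate under __NVCC__
//   HIP_CALL(hipStreamSynchronize(stream));
//   HIP_CALL(hipStreamDestroy(stream));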
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ENVVARS_HPP
#define ENVVARS_HPP
#include <algorithm>
#include <cstring>
#include <numa.h>
#include <random>
#include <set>
#include <string>
#include <time.h>
#include <vector>
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.53"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
enum ConfigModeEnum
{
CFG_FILE = 0,
CFG_P2P = 1,
CFG_SWEEP = 2,
CFG_SCALE = 3,
CFG_A2A = 4,
CFG_SCHMOO = 5,
CFG_RWRITE = 6
};
enum BlockOrderEnum
{
ORDER_SEQUENTIAL = 0,
ORDER_INTERLEAVED = 1,
ORDER_RANDOM = 2
};
// This class manages environment variables that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1;
// Peer-to-peer Benchmark preset defaults
int const DEFAULT_P2P_NUM_CPU_SE = 4;
// Sweep-preset defaults
std::string const DEFAULT_SWEEP_SRC = "CG";
std::string const DEFAULT_SWEEP_EXE = "CDG";
std::string const DEFAULT_SWEEP_DST = "CG";
int const DEFAULT_SWEEP_MIN = 1;
int const DEFAULT_SWEEP_MAX = 24;
int const DEFAULT_SWEEP_TEST_LIMIT = 0;
int const DEFAULT_SWEEP_TIME_LIMIT = 0;
// Environment variables
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int blockOrder; // How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
int byteOffset; // Byte-offset for memory allocations
int continueOnError; // Continue tests even after mismatch detected
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxUnroll; // GFX-kernel unroll factor
int gfxWaveOrder; // GFX-kernel wavefront ordering
int hideEnv; // Skip printing environment variables
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int sharedMemBytes; // Amount of shared memory to use per threadblock
int showIterations; // Show per-iteration timing info
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
int useInteractive; // Pause for user-input before starting transfer loop
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of hipMemcpy (cannot be used with fillPattern)
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int useXccFilter; // Use XCC filtering (experimental)
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
std::vector<float> fillPattern; // Pattern of floats used to fill source data
std::vector<uint32_t> cuMask; // Bit-vector representing the CU mask
std::vector<std::vector<int>> prefXccTable;
// Environment variables only for P2P preset
int numCpuSubExecs; // Number of CPU subexecutors to use
int numGpuSubExecs; // Number of GPU subexecutors to use
int p2pMode; // Both = 0, Unidirectional = 1, Bidirectional = 2
int useDmaCopy; // Use DMA copy instead of GPU copy
int useRemoteRead; // Use destination memory type as executor instead of source memory type
int useFineGrain; // Use fine-grained memory
// Environment variables only for Sweep-preset
int sweepMin; // Min number of simultaneous Transfers to be executed per test
int sweepMax; // Max number of simultaneous Transfers to be executed per test
int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit)
int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit)
int sweepXgmiMin; // Min number of XGMI hops for Transfers
int sweepXgmiMax; // Max number of XGMI hops for Transfers (-1 = no limit)
int sweepSeed; // Random seed to use
int sweepRandBytes; // Whether or not to use random number of bytes per Transfer
std::string sweepSrc; // Set of src memory types to be swept
std::string sweepExe; // Set of executors to be swept
std::string sweepDst; // Set of dst memory types to be swept
// Environment variables only for A2A preset
int a2aDirect; // Only execute on links that are directly connected
int a2aMode; // Perform 0=copy, 1=read-only, 2=write-only
// Developer features
int enableDebug; // Enable debug output
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Used to track current configuration mode
ConfigModeEnum configMode;
// Random generator
std::default_random_engine *generator;
// Track how many CPUs are available per NUMA node
std::vector<int> numCpusPerNuma;
std::vector<int> wallClockPerDeviceMhz;
std::vector<std::set<int>> xccIdsPerDevice;
// Constructor that collects values
EnvVars()
{
int maxSharedMemBytes = 0;
HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0));
#if !defined(__NVCC__)
int defaultSharedMemBytes = maxSharedMemBytes / 2 + 1;
#else
int defaultSharedMemBytes = 0;
#endif
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0));
int numDetectedCpus = numa_num_configured_nodes();
int numDetectedGpus;
HIP_CALL(hipGetDeviceCount(&numDetectedGpus));
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, 0));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
// Different hardware picks different GPU kernels
// This performance difference is generally only noticeable when executing on fewer CUs
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
blockOrder = GetEnvVar("BLOCK_ORDER" , 0);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 1);
useXccFilter = GetEnvVar("USE_XCC_FILTER" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug = GetEnvVar("DEBUG" , 0);
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// P2P Benchmark related
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); // Needed for numGpuSubExec
numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
p2pMode = GetEnvVar("P2P_MODE" , 0);
useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0);
useFineGrain = GetEnvVar("USE_FINE_GRAIN" , 0);
// Sweep related
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
sweepMax = GetEnvVar("SWEEP_MAX" , DEFAULT_SWEEP_MAX);
sweepSrc = GetEnvVar("SWEEP_SRC" , DEFAULT_SWEEP_SRC);
sweepExe = GetEnvVar("SWEEP_EXE" , DEFAULT_SWEEP_EXE);
sweepDst = GetEnvVar("SWEEP_DST" , DEFAULT_SWEEP_DST);
sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT" , DEFAULT_SWEEP_TEST_LIMIT);
sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT" , DEFAULT_SWEEP_TIME_LIMIT);
sweepXgmiMin = GetEnvVar("SWEEP_XGMI_MIN" , 0);
sweepXgmiMax = GetEnvVar("SWEEP_XGMI_MAX" , -1);
sweepRandBytes = GetEnvVar("SWEEP_RAND_BYTES" , 0);
// A2A Benchmark related
a2aDirect = GetEnvVar("A2A_DIRECT" , 1);
a2aMode = GetEnvVar("A2A_MODE" , 0);
// Determine random seed
char *sweepSeedStr = getenv("SWEEP_SEED");
sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
generator = new std::default_random_engine(sweepSeed);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL)
{
if (usePrepSrcKernel)
{
printf("[ERROR] Unable to use FILL_PATTERN and USE_PREP_KERNEL together\n");
exit(1);
}
int patternLen = strlen(pattern);
if (patternLen % 2)
{
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n");
exit(1);
}
// Read in bytes
std::vector<unsigned char> bytes;
unsigned char val = 0;
for (int i = 0; i < patternLen; i++)
{
if ('0' <= pattern[i] && pattern[i] <= '9')
val += (pattern[i] - '0');
else if ('A' <= pattern[i] && pattern[i] <= 'F')
val += (pattern[i] - 'A' + 10);
else if ('a' <= pattern[i] && pattern[i] <= 'f')
val += (pattern[i] - 'a' + 10);
else
{
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]);
exit(1);
}
if (i % 2 == 0)
val <<= 4;
else
{
bytes.push_back(val);
val = 0;
}
}
// Reverse bytes (input is assumed to be given in big-endian)
std::reverse(bytes.begin(), bytes.end());
// Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
int copies;
switch (patternLen % 8)
{
case 0: copies = 1; break;
case 4: copies = 2; break;
default: copies = 4; break;
}
// Fill floats
int numFloats = copies * patternLen / 8;
fillPattern.resize(numFloats);
unsigned char* rawData = (unsigned char*) fillPattern.data();
for (int i = 0; i < numFloats * 4; i++)
rawData[i] = bytes[i % bytes.size()];
}
else fillPattern.clear();
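// Worked example (illustrative): FILL_PATTERN=DEADBEEF parses to bytes
// {0xDE,0xAD,0xBE,0xEF}, reversed to {0xEF,0xBE,0xAD,0xDE}, so the raw bits of
// fillPattern[0] read back as 0xDEADBEEF on a little-endian host. A 2-byte
// pattern (e.g. ABCD) is repeated twice and an odd-byte-count pattern
// (e.g. FF or ABCDEF) four times, so the pattern tiles whole 4-byte floats.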
// Figure out number of xccs per device
int maxNumXccs = 64;
xccIdsPerDevice.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
int* data;
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
HIP_CALL(hipDeviceSynchronize());
xccIdsPerDevice[i].clear();
for (int j = 0; j < maxNumXccs; j++)
xccIdsPerDevice[i].insert(data[j]);
HIP_CALL(hipHostFree(data));
}
// Check for CU mask
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL)
{
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token)
{
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2)
{
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
}
else if (sscanf(token, "%d", &start) == 1)
{
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
}
else
{
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(2 * numXccs, 0);
for (auto range : ranges)
{
for (int i = range.first; i <= range.second; i++)
{
for (int x = 0; x < numXccs; x++)
{
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
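// Worked example (illustrative): on a device with numXccs=2, CU_MASK=0-1
// selects CUs 0 and 1 on both XCCs, i.e. target bits {0,1,2,3}, leaving
// cuMask[0] == 0xF and all other mask words zero.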
// Parse preferred XCC table (if provided)
prefXccTable.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
prefXccTable[i].resize(numGpuDevices, -1);
}
char* prefXccStr = getenv("XCC_PREF_TABLE");
if (prefXccStr)
{
char* token = strtok(prefXccStr, ",");
int tokenCount = 0;
while (token)
{
int xccId;
if (sscanf(token, "%d", &xccId) == 1)
{
int src = tokenCount / numGpuDevices;
int dst = tokenCount % numGpuDevices;
if (xccIdsPerDevice[src].count(xccId) == 0)
{
printf("[ERROR] GPU %d does not contain XCC %d\n", src, xccId);
exit(1);
}
prefXccTable[src][dst] = xccId;
tokenCount++;
if (tokenCount == (numGpuDevices * numGpuDevices)) break;
}
else
{
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
}
// Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
exit(1);
}
if (numGpuDevices > numDetectedGpus)
{
printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
exit(1);
}
if (gfxBlockSize % 64)
{
printf("[ERROR] GFX_BLOCK_SIZE (%d) must be a multiple of 64\n", gfxBlockSize);
exit(1);
}
if (gfxBlockSize > MAX_BLOCKSIZE)
{
printf("[ERROR] BLOCK_SIZE (%d) must be less than %d\n", gfxBlockSize, MAX_BLOCKSIZE);
exit(1);
}
if (byteOffset % sizeof(float))
{
printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
exit(1);
}
if (blockOrder < 0 || blockOrder > 2)
{
printf("[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)\n");
exit(1);
}
if (minNumVarSubExec < 1)
{
printf("[ERROR] Minimum number of subexecutors for variable subexector transfers must be at least 1\n");
exit(1);
}
if (numWarmups < 0)
{
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
exit(1);
}
if (samplingFactor < 1)
{
printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
exit(1);
}
if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
{
printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes);
exit(1);
}
if (blockBytes <= 0 || blockBytes % 4)
{
printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
exit(1);
}
if (numGpuSubExecs <= 0)
{
printf("[ERROR] NUM_GPU_SE must be greater than 0\n");
exit(1);
}
if (numCpuSubExecs <= 0)
{
printf("[ERROR] NUM_CPU_SE must be greater than 0\n");
exit(1);
}
for (auto ch : sweepSrc)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
exit(1);
}
if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
exit(1);
}
}
for (auto ch : sweepDst)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
}
for (auto ch : sweepExe)
{
if (!strchr(ExeTypeStr, ch))
{
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
{
printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
}
if (a2aMode < 0 || a2aMode > 2)
{
printf("[ERROR] a2aMode must be between 0 and 2\n");
exit(1);
}
if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL)
{
printf("[ERROR] GFX kernel unroll factor must be between 1 and %d (Not %d)\n", MAX_UNROLL, gfxUnroll);
exit(1);
}
if (gfxWaveOrder < 0 || gfxWaveOrder >= 6)
{
printf("[ERROR] GFX wave order must be between 0 and 5\n");
exit(1);
}
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA nodes without CPUs)
numCpusPerNuma.resize(numDetectedCpus);
int const totalCpus = numa_num_configured_cpus();
for (int i = 0; i < totalCpus; i++) {
int node = numa_node_of_cpu(i);
if (node >= 0) numCpusPerNuma[node]++;
}
// Build array of wall clock rates per GPU device
wallClockPerDeviceMhz.resize(numDetectedGpus);
for (int i = 0; i < numDetectedGpus; i++)
{
#if defined(__NVCC__)
wallClockPerDeviceMhz[i] = 1000000;
#else
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (archName == "gfx940" || archName == "gfx941" || archName == "gfx942")
wallClockPerDeviceMhz[i] = 100000;
else
wallClockPerDeviceMhz[i] = 25000;
#endif
}
// Check for deprecated env vars
if (getenv("USE_HIP_CALL"))
{
printf("[WARN] USE_HIP_CALL has been deprecated. Please use DMA executor 'D' or set USE_GPU_DMA for P2P-Benchmark preset\n");
exit(1);
}
if (getenv("GPU_KERNEL"))
{
printf("[WARN] GPU_KERNEL has been deprecated and replaced by GFX_KERNEL and GFX_UNROLL\n");
exit(1);
}
char* enableSdma = getenv("HSA_ENABLE_SDMA");
if (enableSdma && !strcmp(enableSdma, "0"))
{
printf("[WARN] DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. Copies will fallback to blit kernels\n");
}
}
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64). Defaults to 256\n");
printf(" BLOCK_BYTES - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BLOCK_ORDER - Threadblock ordering in single-stream mode (0=Serial, 1=Interleaved, 2=Random)\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
printf(" CU_MASK - CU mask for streams specified in hex digits (0-0,a-f,A-F)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", MAX_UNROLL);
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on individual disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_SUBITERATIONS=S - Perform S sub-iteration(s) per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" USE_XCC_FILTER - Use XCC filtering (experimental)\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
}
// Helper macro to switch between CSV and terminal output
#define PRINT_EV(NAME, VALUE, DESCRIPTION) \
printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
#define PRINT_ES(NAME, VALUE, DESCRIPTION) \
printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
// Display env var settings
void DisplayEnvVars() const
{
if (!outputToCsv)
{
printf("TransferBench v%s\n", TB_VERSION);
printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
if (hideEnv) return;
PRINT_EV("ALWAYS_VALIDATE", alwaysValidate,
std::string("Validating after ") + (alwaysValidate ? "each iteration" : "all iterations"));
PRINT_EV("BLOCK_BYTES", blockBytes,
std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
PRINT_EV("BLOCK_ORDER", blockOrder,
std::string("Transfer blocks order: " + std::string((blockOrder == 0 ? "Sequential" :
blockOrder == 1 ? "Interleaved" :
"Random"))));
PRINT_EV("BYTE_OFFSET", byteOffset,
std::string("Using byte offset of " + std::to_string(byteOffset)));
PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error"));
PRINT_EV("CU_MASK", getenv("CU_MASK") ? 1 : 0,
(cuMask.size() ? GetCuMaskDesc() : "All"));
PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
(fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
PRINT_EV("GFX_BLOCK_SIZE", gfxBlockSize,
std::string("Threadblock size of " + std::to_string(gfxBlockSize)));
PRINT_EV("GFX_SINGLE_TEAM", gfxSingleTeam,
(gfxSingleTeam ? std::string("Combining CUs to work across entire data array") :
std::string("Each CUs operates on its own disjoint subarray")));
PRINT_EV("GFX_UNROLL", gfxUnroll,
std::string("Using GFX unroll factor of ") + std::to_string(gfxUnroll));
PRINT_EV("GFX_WAVE_ORDER", gfxWaveOrder,
std::string("Using GFX wave ordering of ") + std::string((gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")));
PRINT_EV("MIN_VAR_SUBEXEC", minNumVarSubExec,
std::string("Using at least ") + std::to_string(minNumVarSubExec) + " subexecutor(s) for variable subExec tranfers");
PRINT_EV("MAX_VAR_SUBEXEC", maxNumVarSubExec,
maxNumVarSubExec ?
std::string("Using at most ") + std::to_string(maxNumVarSubExec) + " subexecutor(s) for variable subExec tranfers" :
"Using up to maximum device subexecutors for variable subExec tranfers");
PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
std::string("Using ") + std::to_string(numGpuDevices) + " GPU devices");
PRINT_EV("NUM_ITERATIONS", numIterations,
std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " "
+ (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
PRINT_EV("NUM_SUBITERATIONS", numSubIterations,
std::string("Running ") + (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)) + " subiterations");
PRINT_EV("NUM_WARMUPS", numWarmups,
std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
PRINT_EV("SHOW_ITERATIONS", showIterations,
std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
PRINT_EV("USE_HSA_DMA", useHsaDma,
std::string("Using ") + (useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync") + " for DMA execution");
PRINT_EV("USE_INTERACTIVE", useInteractive,
std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
std::string("Use ") + (usePcieIndexing ? "PCIe" : "HIP") + " GPU device indexing");
PRINT_EV("USE_PREP_KERNEL", usePrepSrcKernel,
std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data");
PRINT_EV("USE_SINGLE_STREAM", useSingleStream,
std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer"));
PRINT_EV("USE_XCC_FILTER", useXccFilter,
std::string("XCC filtering ") + (useXccFilter ? "enabled" : "disabled"));
if (useXccFilter)
{
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", "");
for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
for (int i = 0; i < numGpuDevices; i++)
{
printf("%36s: GPU %3d ", "", i);
for (int j = 0; j < numGpuDevices; j++)
printf(" %3d", prefXccTable[i][j]);
printf(" %3lu\n", xccIdsPerDevice[i].size());
}
}
PRINT_EV("VALIDATE_DIRECT", validateDirect,
std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
printf("\n");
if (blockOrder != ORDER_SEQUENTIAL && !useSingleStream)
printf("[WARN] BLOCK_ORDER is ignored if USE_SINGLE_STREAM is not enabled\n");
};
// Display env var for P2P Benchmark preset
void DisplayP2PBenchmarkEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[P2P Related]\n");
PRINT_EV("NUM_CPU_SE", numCpuSubExecs,
std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors");
PRINT_EV("NUM_GPU_SE", numGpuSubExecs,
std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors");
PRINT_EV("P2P_MODE", p2pMode,
std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
p2pMode == 2 ? "Bidirectional" :
"Unidirectional + Bidirectional"));
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
printf("\n");
}
// Display env var settings
void DisplaySweepEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Sweep Related]\n");
PRINT_ES("SWEEP_DST", sweepDst.c_str(),
std::string("Destination Memory Types to sweep"));
PRINT_ES("SWEEP_EXE", sweepExe.c_str(),
std::string("Executor Types to sweep"));
PRINT_EV("SWEEP_MAX", sweepMax,
std::string("Max simultaneous transfers (0 = no limit)"));
PRINT_EV("SWEEP_MIN", sweepMin,
std::string("Min simultaenous transfers"));
PRINT_EV("SWEEP_RAND_BYTES", sweepRandBytes,
std::string("Using ") + (sweepRandBytes ? "random" : "constant") + " number of bytes per Transfer");
PRINT_EV("SWEEP_SEED", sweepSeed,
std::string("Random seed set to ") + std::to_string(sweepSeed));
PRINT_ES("SWEEP_SRC", sweepSrc.c_str(),
std::string("Source Memory Types to sweep"));
PRINT_EV("SWEEP_TEST_LIMIT", sweepTestLimit,
std::string("Max number of tests to run during sweep (0 = no limit)"));
PRINT_EV("SWEEP_TIME_LIMIT", sweepTimeLimit,
std::string("Max number of seconds to run sweep for (0 = no limit)"));
PRINT_EV("SWEEP_XGMI_MAX", sweepXgmiMax,
std::string("Max number of XGMI hops for Transfers (-1 = no limit)"));
PRINT_EV("SWEEP_XGMI_MIN", sweepXgmiMin,
std::string("Min number of XGMI hops for Transfers"));
printf("\n");
}
void DisplayA2AEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[AllToAll Related]\n");
PRINT_EV("A2A_DIRECT", a2aDirect,
std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
PRINT_EV("A2A_MODE", a2aMode,
std::string(a2aMode == 0 ? "Perform copy" :
a2aMode == 1 ? "Perform read-only" :
"Perform write-only"));
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
printf("\n");
}
void DisplaySchmooEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Schmoo Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
}
void DisplayRemoteWriteEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Remote-Write Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
std::string("Performing remote ") + (useRemoteRead ? "reads" : "writes"));
printf("\n");
}
void DisplayParallelCopyEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Parallel-copy Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
printf("\n");
}
// Helper functions that parse an environment variable or fall back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
}
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
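// Usage sketch (illustrative variable names): values fall back to the default
// when the variable is unset; note that atoi yields 0 for non-numeric strings
// rather than reporting an error.
//   int numWidgets = GetEnvVar("NUM_WIDGETS", 7);
//   std::string mode = GetEnvVar("WIDGET_MODE", std::string("auto"));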
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun)
{
curr.second = (cuMask.size() * 32) / numXccs - 1;
runs.push_back(curr);
}
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
{
if (i) result += ",";
if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
}
return result;
}
};
#endif
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Helper macro for checking HSA calls
#define HSA_CHECK(cmd) \
do { \
hsa_status_t error = (cmd); \
if (error != HSA_STATUS_SUCCESS) { \
const char* errString = NULL; \
hsa_status_string(error, &errString); \
std::cerr << "Encountered HSA error (" << errString << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
// Structure to hold HSA agent information
#if !defined(__NVCC__)
struct AgentData
{
bool isInitialized;
std::vector<hsa_agent_t> cpuAgents;
std::vector<hsa_agent_t> gpuAgents;
std::vector<int> closestNumaNode;
};
// Simple callback function to return any memory pool for an agent
hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
{
hsa_amd_memory_pool_t* poolData = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
// Check memory pool flags
uint32_t poolFlags;
HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &poolFlags));
// Only consider coarse-grained pools
if (!(poolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) return HSA_STATUS_SUCCESS;
*poolData = pool;
return HSA_STATUS_SUCCESS;
}
// Callback function to gather HSA agent information
hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data)
{
AgentData* agentData = reinterpret_cast<AgentData*>(data);
// Get the device type
hsa_device_type_t deviceType;
HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType));
if (deviceType == HSA_DEVICE_TYPE_CPU)
agentData->cpuAgents.push_back(agent);
if (deviceType == HSA_DEVICE_TYPE_GPU)
{
agentData->gpuAgents.push_back(agent);
agentData->closestNumaNode.push_back(0);
}
return HSA_STATUS_SUCCESS;
}
AgentData& GetAgentData()
{
static AgentData agentData = {};
if (!agentData.isInitialized) {
agentData.isInitialized = true;
// Add all detected agents to the list
HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
// Loop over each GPU
for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++) {
// Collect memory pool
hsa_amd_memory_pool_t pool;
HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
// Loop over each CPU agent and check distance
agentData.closestNumaNode[i] = 0;
int bestDistance = -1;
for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) {
// Determine number of hops from GPU memory pool to CPU agent
uint32_t hops = 0;
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
&hops));
// Gather link info
if (hops) {
hsa_amd_memory_pool_link_info_t* link_info =
(hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t));
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
link_info));
int numaDist = 0;
for (int k = 0; k < hops; k++)
numaDist += link_info[k].numa_distance;
if (bestDistance == -1 || numaDist < bestDistance) {
agentData.closestNumaNode[i] = j;
bestDistance = numaDist;
}
free(link_info);
}
}
}
}
return agentData;
}
#endif
// Returns closest CPU NUMA node to provided GPU
// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing
int GetClosestNumaNode(int gpuIdx)
{
#if defined(__NVCC__)
return -1;
#else
AgentData& agentData = GetAgentData();
if (gpuIdx < 0 || gpuIdx >= agentData.closestNumaNode.size())
{
printf("[ERROR] GPU index out is out of bounds\n");
exit(1);
}
return agentData.closestNumaNode[gpuIdx];
#endif
}
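// Usage sketch (illustrative): bias host allocations that feed GPU 0 toward
// its nearest NUMA node via libnuma's numa_set_preferred:
//   int node = GetClosestNumaNode(0);
//   if (node >= 0) numa_set_preferred(node);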
/*
Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once

#include <cstdio>
#include <cstring>
#include <string>

#define PackedFloat_t float4
#define MAX_BLOCKSIZE 512
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
#if defined(__NVCC__)
#define warpSize 32
#endif
#define MAX_WAVEGROUPS (MAX_BLOCKSIZE / warpSize)
#define MAX_UNROLL 8
#define NUM_WAVEORDERS 6
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
struct SubExecParam
{
// Inputs
size_t N; // Number of floats this subExecutor works on
int numSrcs; // Number of source arrays
int numDsts; // Number of destination arrays
float* src[MAX_SRCS]; // Source array pointers
float* dst[MAX_DSTS]; // Destination array pointers
int32_t preferredXccId; // XCC ID to execute on
// Prepared
int teamSize; // Size of the team this subExecutor is part of
int teamIdx; // Index of this subExecutor within the team
// Outputs
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; // Hardware ID
uint32_t xccId; // XCC ID
};
// Macro for collecting HW_REG_HW_ID
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
#define GetHwId(hwId) \
hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) \
asm("mov.u32 %0, %smid;" : "=r"(hwId) )
#else
#define GetHwId(hwId) \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId));
#endif
// Macro for collecting HW_REG_XCC_ID
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define GetXccId(val) \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else
#define GetXccId(val) \
val = 0
#endif
void CpuReduceKernel(SubExecParam const& p)
{
int const& numSrcs = p.numSrcs;
int const& numDsts = p.numDsts;
if (numSrcs == 0)
{
for (int i = 0; i < numDsts; ++i)
memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
}
else if (numSrcs == 1)
{
float const* __restrict__ src = p.src[0];
if (numDsts == 0)
{
float sum = 0.0;
for (int j = 0; j < p.N; j++)
sum += p.src[0][j];
// Add a dummy check to ensure the read is not optimized out
if (sum != sum)
{
printf("[ERROR] Nan detected\n");
}
}
else
{
for (int i = 0; i < numDsts; ++i)
{
memcpy(p.dst[i], src, p.N * sizeof(float));
}
}
}
else
{
float sum = 0.0f;
for (int j = 0; j < p.N; j++)
{
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
}
}
}
std::string PrepSrcValueString()
{
return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
}
__host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
{
return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
}
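// Host-side sketch (hypothetical helper, not part of TransferBench): a
// destination filled from a single source can be validated against the
// generator above:
//   bool CheckDst(float const* dst, size_t N, int srcBufferIdx) {
//     for (size_t i = 0; i < N; i++)
//       if (dst[i] != PrepSrcValue(srcBufferIdx, i)) return false;
//     return true;
//   }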
__global__ void CollectXccIdsKernel(int* xccIds)
{
int xccId;
GetXccId(xccId);
xccIds[blockIdx.x] = xccId;
}
// GPU kernel to prepare src buffer data
__global__ void
PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
{
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += blockDim.x * gridDim.x)
{
ptr[idx] = PrepSrcValue(srcBufferIdx, idx);
}
}
__device__ int64_t GetTimestamp()
{
#if defined(__NVCC__)
int64_t result;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
return result;
#else
return wall_clock64();
#endif
}
// Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
template <int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = GetTimestamp();
SubExecParam& p = params[blockIdx.y];
// (Experimental) Filter by XCC if desired
#if !defined(__NVCC__)
int32_t xccId;
GetXccId(xccId);
if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
#endif
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
float4 const* __restrict__ srcFloat4[MAX_SRCS];
float4* __restrict__ dstFloat4[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
// Operate on wavefront granularity
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team
int32_t const nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numFloat4 = p.N / 4;
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder)
{
case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams; teamStride2 = 1; waveStride2 = nTeams; break;
case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL; teamStride2 = 1; waveStride2 = nTeams; break;
case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1; waveStride2 = nTeams; break;
}
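// Worked example (illustrative): waveOrder 0 ("U,W,C") with UNROLL=4,
// nWaves=2, nTeams=2 gives unrlStride=1, waveStride=4, teamStride=8: the four
// unrolled accesses of a wavefront touch consecutive warp-wide float4 groups,
// wavefronts sit UNROLL groups apart, and teams UNROLL*nWaves groups apart.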
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
float4 val[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
// Read sources into memory and accumulate in registers
if (numSrcs)
{
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
{
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
}
}
}
// Second loop: Deal with remaining float4s
{
if (loop1Limit < numFloat4)
{
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
{
if (numSrcs)
{
val = srcFloat4[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
}
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
}
}
}
// Third loop: Deal with remaining floats
{
if (numFloat4 * 4 < p.N)
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
{
if (numSrcs)
{
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
}
}
}
if (++subIterations == numSubIterations) break;
}
// Wait for all threads to finish
__syncthreads();
if (threadIdx.x == 0)
{
__threadfence_system();
p.stopCycle = GetTimestamp();
p.startCycle = startCycle;
GetHwId(p.hwId);
GetXccId(p.xccId);
}
}
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
GpuReduceKernel<BLOCKSIZE, 2>, \
GpuReduceKernel<BLOCKSIZE, 3>, \
GpuReduceKernel<BLOCKSIZE, 4>, \
GpuReduceKernel<BLOCKSIZE, 5>, \
GpuReduceKernel<BLOCKSIZE, 6>, \
GpuReduceKernel<BLOCKSIZE, 7>, \
GpuReduceKernel<BLOCKSIZE, 8>}
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
{
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
GPU_KERNEL_UNROLL_DECL(192),
GPU_KERNEL_UNROLL_DECL(256),
GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512)
};
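// Selection sketch (illustrative): GFX_BLOCK_SIZE is validated to be a
// multiple of 64 up to MAX_BLOCKSIZE and GFX_UNROLL to lie in [1, MAX_UNROLL],
// so a launcher can pick a kernel variant as:
//   GpuKernelFuncPtr kernel = GpuKernelTable[gfxBlockSize / 64 - 1][gfxUnroll - 1];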
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <vector>
#include <sstream>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <set>
#include <unistd.h>
#include <map>
#include <iostream>
#include <cstring>
#include "Compatibility.hpp"
#include "EnvVars.hpp"
// Simple configuration parameters
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer
#define MAX_LINE_LEN 32768
// Different src/dst memory types supported
typedef enum
{
MEM_CPU = 0, // Coarse-grained pinned CPU memory
MEM_GPU = 1, // Coarse-grained global GPU memory
MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory
MEM_GPU_FINE = 3, // Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
MEM_NULL = 5, // NULL memory - used for empty transfers
MEM_MANAGED = 6 // Managed (unified) memory
} MemType;
typedef enum
{
EXE_CPU = 0, // CPU executor (subExecutor = CPU thread)
EXE_GPU_GFX = 1, // GPU kernel-based executor (subExecutor = threadblock/CU)
EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams)
} ExeType;
bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
bool IsCpuType(ExeType e) { return (e == EXE_CPU); };
char const MemTypeStr[8] = "CGBFUNM";
char const ExeTypeStr[4] = "CGD";
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
MemType inline CharToMemType(char const c)
{
char const* val = strchr(MemTypeStr, toupper(c));
if (val) return (MemType)(val - MemTypeStr);
printf("[ERROR] Unexpected memory type (%c)\n", c);
exit(1);
}
ExeType inline CharToExeType(char const c)
{
char const* val = strchr(ExeTypeStr, toupper(c));
if (val) return (ExeType)(val - ExeTypeStr);
printf("[ERROR] Unexpected executor type (%c)\n", c);
exit(1);
}
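// Usage sketch (illustrative): Transfer tokens elsewhere in TransferBench pair
// one of these characters with a device index, so parsing source "G0" and
// executor "D1" reduces to:
//   MemType srcType = CharToMemType('G');  // MEM_GPU
//   ExeType exeType = CharToExeType('D');  // EXE_GPU_DMA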
// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified)
// then writes the summation to each of the specified destination memory location(s)
struct Transfer
{
// Inputs
ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
int exeSubIndex; // Executor subindex
int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fall back to default)
int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indices
int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory types
std::vector<int> dstIndex; // Destination device indices
// Outputs
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds for this transfer
double transferBandwidth; // Transfer bandwidth (GB/s)
double executorBandwidth; // Executor bandwidth (GB/s)
std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage
// Internal
int transferIndex; // Transfer identifier (within a Test)
std::vector<float*> srcMem; // Source memory
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<int> subExecIdx; // Indices into subExecParamGpu
#if !defined(__NVCC__)
// For targeted-SDMA
hsa_agent_t dstAgent; // DMA destination memory agent
hsa_agent_t srcAgent; // DMA source memory agent
hsa_signal_t signal; // HSA signal for completion
hsa_amd_sdma_engine_id_t sdmaEngineId; // DMA engine ID
#endif
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
// Prepare source arrays with input data
bool PrepareSrc(EnvVars const& ev);
// Validate that destination data contains expected results
void ValidateDst(EnvVars const& ev);
// Prepare reference buffers
void PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx);
// String representation functions
std::string SrcToStr() const;
std::string DstToStr() const;
};
struct ExecutorInfo
{
std::vector<Transfer*> transfers; // Transfers to execute
size_t totalBytes; // Total bytes this executor transfers
int totalSubExecs; // Total number of subExecutors to use
// For GPU-Executors
SubExecParam* subExecParamGpu; // GPU copy of subExecutor parameters
std::vector<hipStream_t> streams;
std::vector<hipEvent_t> startEvents;
std::vector<hipEvent_t> stopEvents;
// Results
double totalTime;
};
struct ExeResult
{
double bandwidthGbs;
double durationMsec;
double sumBandwidthGbs;
size_t totalBytes;
std::vector<int> transferIdx;
};
struct TestResults
{
size_t numTimedIterations;
size_t totalBytesTransferred;
double totalBandwidthCpu;
double totalDurationMsec;
double overheadMsec;
std::map<std::pair<ExeType, int>, ExeResult> exeResults;
};
typedef std::pair<ExeType, int> Executor;
typedef std::map<Executor, ExecutorInfo> TransferMap;
// Display usage instructions
void DisplayUsage(char const* cmdName);
// Display detected GPU topology / CPU numa nodes
void DisplayTopology(bool const outputToCsv);
// Build array of test sizes based on sampling factor
void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor,
std::vector<size_t>& valuesofN);
void ParseMemType(EnvVars const& ev, std::string const& token, std::vector<MemType>& memType, std::vector<int>& memIndex);
void ParseExeType(EnvVars const& ev, std::string const& token, ExeType& exeType, int& exeIndex, int& exeSubIndex);
void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transfers);
void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
std::vector<Transfer>& transfers, bool verbose = true,
double* totalBandwidthCpu = nullptr);
TestResults ExecuteTransfersImpl(EnvVars const& ev, std::vector<Transfer>& transfers);
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results);
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunHealthCheck(EnvVars ev);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
int RemappedIndex(int const origIdx, bool const isCpuType);
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);