Commit 09f4f11b authored by Oliveira, Daniel

impr: Library/Client build organization



Change code organization and build options

Code changes related to the following:
  * Build files
    * Options to build client, shared, and static libraries (see the configure example below)
  * Source code directories
  * Modern C++20 changes
  * Based on TB 1.6.4
  * Formatting
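
Example configure step exercising the new build options (a sketch; the
option names are the cache variables referenced in the CMake changes
below):

  cmake -B build -DTRANSFERBENCH_ENGINE_SHARED=ON \
        -DTRANSFERBENCH_ENGINE_STATIC=OFF \
        -DTRANSFERBENCH_ENGINE_HEADER_ONLY=OFF
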
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
parent 2d0ecaae
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "TransferBench.hpp"
#include <vector>
static int RemappedCpuIndex(int origIdx)
{
static std::vector<int> remappingCpu;
// Build CPU remapping on first use
// Skip numa nodes that are not configured
if (remappingCpu.empty()) {
for (int node = 0; node <= numa_max_node(); node++) {
if (numa_bitmask_isbitset(numa_get_mems_allowed(), node)) {
remappingCpu.push_back(node);
}
}
}
return remappingCpu[origIdx];
}
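// Example (hypothetical topology): if only NUMA nodes {0, 2, 3} are
// configured, remappingCpu is built as [0, 2, 3], so RemappedCpuIndex(1)
// returns physical node 2.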
static void PrintNicToGPUTopo([[maybe_unused]] bool outputToCsv)
{
#ifdef NIC_EXEC_ENABLED
printf(
" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s) | GID Index | GID "
"Descriptor\n");
if (!outputToCsv) {
printf(
"-----+-------------+--------+--------------+------+----------------+-----------+------"
"-------------\n");
}
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
auto const& ibvDeviceList = GetIbvDeviceList();
for (int i = 0; i < static_cast<int>(ibvDeviceList.size()); i++) {
std::string closestGpusStr = "";
for (int j = 0; j < numGpus; j++) {
if (TransferBench::GetClosestNicToGpu(j) == i) {
if (closestGpusStr != "") { closestGpusStr += ","; }
closestGpusStr += std::to_string(j);
}
}
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
i,
ibvDeviceList[i].name.c_str(),
ibvDeviceList[i].hasActivePort ? "Yes" : "No",
ibvDeviceList[i].busId.c_str(),
ibvDeviceList[i].numaNode,
closestGpusStr.c_str(),
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort
? std::to_string(ibvDeviceList[i].gidIndex).c_str()
: "N/A",
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort
? ibvDeviceList[i].gidDescriptor.c_str()
: "N/A");
}
printf("\n");
#endif
}
void DisplayTopology(bool outputToCsv)
{
int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
char sep = (outputToCsv ? ',' : '|');
if (outputToCsv) {
printf("NumCpus,%d\n", numCpus);
printf("NumGpus,%d\n", numGpus);
printf("NumNics,%d\n", numNics);
} else {
printf("\nDetected Topology:\n");
printf("==================\n");
printf(" %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1);
printf(" %d GPU device(s)\n", numGpus);
printf(" %d Supported NIC device(s)\n", numNics);
}
// Print out detected CPU topology
printf("\n %c", sep);
for (int j = 0; j < numCpus; j++) { printf("NUMA %02d%c", j, sep); }
printf(" #Cpus %c Closest GPU(s)\n", sep);
if (!outputToCsv) {
printf("------------+");
for (int j = 0; j <= numCpus; j++) { printf("-------+"); }
printf("---------------\n");
}
for (int i = 0; i < numCpus; i++) {
int nodeI = RemappedCpuIndex(i);
printf("NUMA %02d (%02d)%c", i, nodeI, sep);
for (int j = 0; j < numCpus; j++) {
int nodeJ = RemappedCpuIndex(j);
int numaDist = numa_distance(nodeI, nodeJ);
printf(" %5d %c", numaDist, sep);
}
int numCpuCores = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++) {
if (numa_node_of_cpu(j) == nodeI) { numCpuCores++; }
}
printf(" %5d %c", numCpuCores, sep);
for (int j = 0; j < numGpus; j++) {
if (TransferBench::GetClosestCpuNumaToGpu(j) == nodeI) { printf(" %d", j); }
}
printf("\n");
}
printf("\n");
// Print out detected NIC topology
PrintNicToGPUTopo(outputToCsv);
// Print out detected GPU topology
#if defined(__NVCC__)
for (int i = 0; i < numGpus; i++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#else
// Print headers
if (!outputToCsv) {
printf(" |");
for (int j = 0; j < numGpus; j++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
}
printf("\n");
}
printf(" %c", sep);
for (int j = 0; j < numGpus; j++) { printf(" GPU %02d %c", j, sep); }
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC %c NIC\n", sep, sep, sep, sep, sep);
if (!outputToCsv) {
for (int j = 0; j <= numGpus; j++) { printf("--------+"); }
printf("--------------+------+------+------+------+------\n");
}
// Loop over each GPU device
for (int i = 0; i < numGpus; i++) {
printf(" GPU %02d %c", i, sep);
// Print off link information
for (int j = 0; j < numGpus; j++) {
if (i == j) {
printf(" N/A %c", sep);
} else {
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
printf(" %s-%d %c",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT"
: linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI"
: linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE"
: linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB"
: linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI"
: "????",
hopCount,
sep);
}
}
char pciBusId[20];
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %-11s %c %-4d %c %-4d %c %-4d %c %-4d %c %-4d\n",
pciBusId,
sep,
TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}),
sep,
TransferBench::GetClosestCpuNumaToGpu(i),
sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}),
sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}),
sep,
TransferBench::GetClosestNicToGpu(i));
}
#endif
}
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "EnvVars.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
int main(int argc, char** argv)
{
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2]) - 1];
// Intentional fall-through: each case scales by a further factor of 1024
switch (units) {
case 'G':
case 'g': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'M':
case 'm': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'K':
case 'k': numBytesPerTransfer *= 1024;
}
}
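// Example: "64M" parses as 64, then is scaled twice (M, then K), giving
// 64 * 1024 * 1024 bytes; "2G" is scaled three times.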
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) { exit(0); }
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++) { line += std::string(argv[i]) + " "; }
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line)) { lines.push_back(line); }
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv) {
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const& line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') { printf("%s\n", line.c_str()); }
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) { continue; }
// Check for variable sub-executors Transfers
auto numVariableTransfers = std::size_t(0);
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf(
"[ERROR] Variable number of subexecutors is only supported on GFX "
"executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = std::max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != transfers.size()) {
printf(
"[ERROR] All or none of the Transfers in the Test must use variable number of "
"Subexecutors\n");
exit(1);
}
}
// Track which transfers have already numBytes specified
std::vector<bool> bytesSpecified(transfers.size());
bool hasUnspecified = false;
for (auto i = std::size_t(0); i < transfers.size(); i++) {
bytesSpecified[i] = (transfers[i].numBytes != 0);
if (transfers[i].numBytes == 0) { hasUnspecified = true; }
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1 << 10); bytes <= (1 << 29); bytes *= 2) {
size_t deltaBytes = std::max<size_t>(1, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto i = std::size_t(0); i < transfers.size(); i++) {
if (!bytesSpecified[i]) { transfers[i].numBytes = currBytes; }
}
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) /
maxVarCount;
}
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec;
numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) { t.numSubExecs = numSubExecs; }
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec >
bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0 || !hasUnspecified) { break; }
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0 || !hasUnspecified) { break; }
}
}
}
void DisplayUsage(char const* cmdName)
{
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
printf("TransferBench v%s.(%s)[%s]\n",
TransferBench::GetTransferBenchVersion().c_str(),
GetClientVersion().c_str(),
nicSupport.c_str());
printf("========================================\n");
if (numa_available() == -1) {
printf(
"[ERROR] NUMA library not supported. Check to see if libnuma has been installed on "
"this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(
" - Filename of configFile containing Transfers to execute (see example.cfg for "
"format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
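// Example invocations (a sketch; the Transfer syntax itself is documented in
// example.cfg):
//   ./TransferBench example.cfg 64M             Run Transfers from a config file
//   ./TransferBench cmdline 64M <Transfers...>  Parse Transfers from the command line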
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices)
{
if (memDevices.empty()) { return "N"; }
std::stringstream ss;
for (auto const& m : memDevices) { ss << TransferBench::MemTypeStr[m.memType] << m.memIndex; }
return ss.str();
}
void PrintResults(EnvVars const& ev,
int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
if (!ev.outputToCsv) { printf("Test %d:\n", testNum); }
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(
" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType],
exeIndex,
sep,
exeResult.avgBandwidthGbPerSec,
sep,
exeResult.avgDurationMsec,
sep,
exeResult.numBytes,
sep,
exeResult.sumBandwidthGbPerSec);
// Loop over each Transfer handled by this executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1) { sprintf(exeSubIndexStr, ".%d", t.exeSubIndex); }
printf(
" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> "
"%c%03d%s:%03d -> %s\n",
idx,
sep,
r.avgBandwidthGbPerSec,
sep,
r.avgDurationMsec,
sep,
r.numBytes,
sep,
MemDevicesToStr(t.srcs).c_str(),
TransferBench::ExeTypeStr[t.exeDevice.exeType],
t.exeDevice.exeIndex,
exeSubIndexStr,
t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
// Check that per-iteration information exists
if (r.perIterMsec.size() != numTimedIterations) {
printf(
"[ERROR] Per iteration timing data unavailable: Expected %lu data points, "
"but have %lu\n",
numTimedIterations,
r.perIterMsec.size());
exit(1);
}
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (auto i = std::size_t(0); i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i + 1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c",
time.second,
sep,
iterBandwidthGbs,
sep,
iterDurationMsec,
sep);
std::set<int> usedXccs;
if ((time.second - 1) < static_cast<std::int32_t>(r.perIterCUs.size())) {
printf(" CUs:");
for (auto x : r.perIterCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs) { printf(" %02d", x); }
printf("\n");
}
printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n",
sep,
stdDevBw,
sep,
stdDevTime,
sep);
}
}
}
printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep,
results.avgTotalBandwidthGbPerSec,
sep,
results.avgTotalDurationMsec,
sep,
results.totalBytesTransferred,
sep,
results.overheadMsec);
}
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN: printf("[WARN] %s\n", error.errMsg.c_str()); return;
case ERR_FATAL: printf("[ERROR] %s\n", error.errMsg.c_str()); exit(1);
default: break;
}
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) { exit(1); }
}
auto GetClientVersion() -> const std::string
{
static constexpr auto TB_UNKNOWN_CLIENT_VERSION = std::string_view("Unknown");
auto tb_client_version = std::string(TRANSFERBENCH_CLIENT_VERSION);
if (tb_client_version.empty()) { tb_client_version = std::string(TB_UNKNOWN_CLIENT_VERSION); }
return tb_client_version;
}
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
# Required Includes
include(FetchContent)
#
# Note: All function definitions go here
function(get_rocm_install_path rocm_install_base_path)
if(NOT DEFINED ROCM_INSTALL_PATH_FOR_BUILD)
message(STATUS ">> Checking ROCm install path settings...")
set(TMP_ROCM_INSTALL_PATH "")
if(DEFINED ENV{ROCM_PATH} OR DEFINED ROCM_PATH)
if(DEFINED ENV{ROCM_PATH})
message(STATUS " >> Environment variable ROCM_PATH: '$ENV{ROCM_PATH}'")
set(TMP_ROCM_INSTALL_PATH "$ENV{ROCM_PATH}")
endif()
if(DEFINED ROCM_PATH)
message(STATUS " >> CMake variable ROCM_PATH: '${ROCM_PATH}'")
set(TMP_ROCM_INSTALL_PATH "${ROCM_PATH}")
endif()
elseif(DEFINED ENV{ROCM_INSTALL_PATH} OR DEFINED ROCM_INSTALL_PATH)
if(DEFINED ENV{ROCM_INSTALL_PATH})
message(STATUS " >> Environment variable ROCM_INSTALL_PATH: '$ENV{ROCM_INSTALL_PATH}'")
set(TMP_ROCM_INSTALL_PATH "$ENV{ROCM_PATH}")
endif()
if(DEFINED ROCM_INSTALL_PATH)
message(STATUS " >> CMake variable ROCM_INSTALL_PATH: '${ROCM_INSTALL_PATH}'")
set(TMP_ROCM_INSTALL_PATH "${ROCM_INSTALL_PATH}")
endif()
else()
set(TMP_ROCM_INSTALL_PATH "/opt/rocm")
message(STATUS " >> Using default ROCm install path: '${TMP_ROCM_INSTALL_PATH}'")
endif()
set(ROCM_INSTALL_PATH_FOR_BUILD "${TMP_ROCM_INSTALL_PATH}" CACHE STRING "ROCm install directory for build" FORCE)
endif()
# Always hand the cached result back to the caller
set(${rocm_install_base_path} "${ROCM_INSTALL_PATH_FOR_BUILD}" PARENT_SCOPE)
endfunction()
function(setup_build_version version_num version_text)
set(TARGET_VERSION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/VERSION")
if(NOT EXISTS "${TARGET_VERSION_FILE}")
message(FATAL_ERROR " >> VERSION file not found at: '${TARGET_VERSION_FILE}' ...")
endif()
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${TARGET_VERSION_FILE})
file(READ "${TARGET_VERSION_FILE}" file_version)
string(STRIP "${file_version}" file_version)
string(REPLACE ".wip" "" file_version_text "${file_version}")
string(REPLACE ".WIP" "" file_version_text "${file_version_text}")
set(${version_num} ${file_version} PARENT_SCOPE)
set(${version_text} ${file_version_text} PARENT_SCOPE)
endfunction()
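# Example (sketch): a VERSION file containing "1.60.00.wip" yields
# version_num = "1.60.00.wip" and version_text = "1.60.00".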
function(setup_rocm_requirements)
message(STATUS ">> Checking ROCm environment...")
get_rocm_install_path(ROCM_BASE_PATH)
#
#find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_BASE_PATH})
find_package(ROCmCMakeBuildTools REQUIRED PATHS ${ROCM_BASE_PATH})
find_package(ROCM REQUIRED PATHS ${ROCM_BASE_PATH})
find_package(HSA-RUNTIME64 REQUIRED PATHS ${ROCM_BASE_PATH})
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "ROCM_WARN_TOOLCHAIN warnings disabled: 'OFF'")
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "ROCMCHECKS_WARN_TOOLCHAIN_VAR warnings disabled: 'OFF'")
endfunction()
function(add_include_from_library target_name library_name)
get_target_property(LIBRARY_INCLUDE_DIRECTORIES ${library_name} INTERFACE_INCLUDE_DIRECTORIES)
target_include_directories(${target_name} PRIVATE ${LIBRARY_INCLUDE_DIRECTORIES})
endfunction()
function(add_source_definitions target_name definition_text)
set_property(SOURCE ${target_name} APPEND PROPERTY COMPILE_DEFINITIONS "${definition_text}")
endfunction()
function(build_transferbench_engine)
include(ROCMInstallTargets)
include(ROCMCreatePackage)
endfunction()
function(has_build_debug_mode debug_mode_result)
if(NOT DEFINED IS_BUILD_DEBUG_MSG_MODE_ENABLED)
if(AMD_APP_DEBUG_BUILD_INFO OR
("$ENV{AMD_APP_DEBUG_BUILD_INFO}" STREQUAL "ON") OR
("$ENV{AMD_APP_DEBUG_BUILD_INFO}" STREQUAL "1") OR
(DEFINED BUILD_DEBUG_MSG_MODE AND (BUILD_DEBUG_MSG_MODE STREQUAL "ON")))
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED TRUE)
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED TRUE PARENT_SCOPE)
set(${debug_mode_result} TRUE PARENT_SCOPE)
else()
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED FALSE)
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED FALSE PARENT_SCOPE)
set(${debug_mode_result} FALSE PARENT_SCOPE)
endif()
else()
if(IS_BUILD_DEBUG_MSG_MODE_ENABLED)
set(${debug_mode_result} TRUE PARENT_SCOPE)
else()
set(${debug_mode_result} FALSE PARENT_SCOPE)
endif()
endif()
endfunction()
function(get_target target_name target_type)
get_target_property(IMPORTED_TARGET ${target_name} IMPORTED)
if(IMPORTED_TARGET)
set(${target_type} INTERFACE PARENT_SCOPE)
else()
set(${target_type} PRIVATE PARENT_SCOPE)
endif()
endfunction()
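# Example (sketch): for an IMPORTED target such as hsa-runtime64::hsa-runtime64,
# get_target() reports INTERFACE, so flags are attached as usage requirements
# rather than as private compile options.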
function(add_c_flag)
if (ARGC EQUAL 1)
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARGV0}>)
elseif(ARGC EQUAL 2)
get_target(${ARGV1} TYPE)
target_compile_options(${ARGV1} ${TYPE} $<$<COMPILE_LANGUAGE:C>:${ARGV0}>)
endif()
endfunction()
function(add_cxx_flag)
if (ARGC EQUAL 1)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARGV0}>)
elseif(ARGC EQUAL 2)
get_target(${ARGV1} TYPE)
target_compile_options(${ARGV1} ${TYPE} $<$<COMPILE_LANGUAGE:CXX>:${ARGV0}>)
endif()
endfunction()
function(add_linker_flag)
if (ARGC EQUAL 1)
add_link_options(${ARGV0})
elseif(ARGC EQUAL 2)
get_target(${ARGV1} TYPE)
target_link_options(${ARGV1} ${TYPE} ${ARGV0})
endif()
endfunction()
function(add_c_cxx_flag)
add_c_flag(${ARGV0} ${ARGV1})
add_cxx_flag(${ARGV0} ${ARGV1})
endfunction()
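# Note: add_common_flag below is intentionally identical to add_c_cxx_flag;
# both names are used at call sites further down.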
function(add_common_flag)
add_c_flag(${ARGV0} ${ARGV1})
add_cxx_flag(${ARGV0} ${ARGV1})
endfunction()
function(add_cppcheck target_name)
if(NOT TRANSFERBENCH_ENABLE_CPPCHECK_WARNINGS)
return()
endif()
# Not REQUIRED: fall through to the graceful skip below when cppcheck is missing
find_program(CPPCHECK_EXECUTABLE NAMES cppcheck)
if(NOT CPPCHECK_EXECUTABLE)
message(WARNING ">> Skipping 'cppcheck' target for: ${target_name}. Could not find 'Cppcheck' ...")
return()
endif()
set(CPPCHECK_CONFIG_FILE "cppcheck_static_supp.config")
set(CPPCHECK_REPORT_FILE "cppcheck_report.txt")
set(TARGET_BUILD_DIRECTORY $<TARGET_FILE_DIR:${target_name}>)
set(CPPCHECK_OPTION_LIST
--enable=all
--quiet
--std=c++${CMAKE_CXX_STANDARD}
--inline-suppr
--check-level=exhaustive
--error-exitcode=10
--suppressions-list=${CMAKE_SOURCE_DIR}/dist/${CPPCHECK_CONFIG_FILE}
--checkers-report=${TARGET_BUILD_DIRECTORY}/${CPPCHECK_REPORT_FILE}
)
set_target_properties(${target_name}
PROPERTIES
CXX_CPPCHECK "${CPPCHECK_EXECUTABLE};${CPPCHECK_OPTION_LIST}"
)
has_build_debug_mode(HAS_DEBUG_MODE_ENABLED)
if(HAS_DEBUG_MODE_ENABLED)
developer_status_message("DEVEL" ">> CppCheck settings for: '${target_name}' ...")
developer_status_message("DEVEL" " >> Target Build Directory: '${TARGET_BUILD_DIRECTORY}' ")
developer_status_message("DEVEL" " >> Cpp std: 'c++${CMAKE_CXX_STANDARD}' ")
developer_status_message("DEVEL" " >> suppressions-list: '${CMAKE_SOURCE_DIR}/dist/${CPPCHECK_CONFIG_FILE}' ")
developer_status_message("DEVEL" " >> checkers-report: ${TARGET_BUILD_DIRECTORY}/${CPPCHECK_REPORT_FILE}' ")
developer_status_message("DEVEL" " >> CppCheck located at: '${CPPCHECK_EXECUTABLE}' ")
developer_status_message("DEVEL" " >> CppCheck options: '${CPPCHECK_OPTION_LIST}' ")
endif()
endfunction()
function(check_compiler_requirements component_name)
## We need to make sure we have C++ enabled, or we get errors like:
## 'check_compiler_flag: CXX: needs to be enabled before use'
get_property(project_enabled_languages GLOBAL PROPERTY ENABLED_LANGUAGES)
if(NOT project_enabled_languages OR NOT "CXX" IN_LIST project_enabled_languages)
enable_language(CXX)
endif()
## Check if we are able to use Lightning (Clang++) as default compiler
## Note: If this condition is met, we used rocm_clang_toolchain.cmake and the toolchain was already
## checked and set up.
if(NOT IS_LIGHTNING_CLANG_DEFAULT_COMPILER AND NOT ROCM_CLANG_TOOLCHAIN_USED)
message(FATAL_ERROR ">> ROCm 'Lightning Clang++' Toolchain: was not set (rocm_clang_toolchain.cmake) ...")
endif()
## Check if the compiler is compatible with the C++ standard.
## Note: Minimum required is ${CMAKE_CXX_STANDARD} = 20, but we check for 23, 20, and 17.
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX23_STANDARD OR NOT DEFINED IS_COMPILER_SUPPORTS_CXX20_STANDARD OR NOT DEFINED IS_COMPILER_SUPPORTS_CXX17_STANDARD)
include(CheckCXXCompilerFlag)
message(STATUS ">> Checking Compiler: '${CMAKE_CXX_COMPILER}' for C++ standard ...")
## Just to have independent checks/variables
set(CHECK_CMAKE_CXX_STANDARD 23)
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX23_STANDARD)
set(IS_COMPILER_SUPPORTS_CHECK "IS_COMPILER_SUPPORTS_CXX${CHECK_CMAKE_CXX_STANDARD}_STANDARD")
check_cxx_compiler_flag("-std=c++${CHECK_CMAKE_CXX_STANDARD}" COMPILER_SUPPORTS_CXX23_STANDARD)
if(COMPILER_SUPPORTS_CXX23_STANDARD)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE PARENT_SCOPE)
developer_status_message("DEVEL" " >> Compiler: ${CMAKE_CXX_COMPILER} supports CXX Standard '${CHECK_CMAKE_CXX_STANDARD}' ...")
else()
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE)
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE PARENT_SCOPE)
endif()
endif()
set(CHECK_CMAKE_CXX_STANDARD 20)
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX20_STANDARD)
set(IS_COMPILER_SUPPORTS_CHECK "IS_COMPILER_SUPPORTS_CXX${CHECK_CMAKE_CXX_STANDARD}_STANDARD")
check_cxx_compiler_flag("-std=c++${CHECK_CMAKE_CXX_STANDARD}" COMPILER_SUPPORTS_CXX20_STANDARD)
if(COMPILER_SUPPORTS_CXX20_STANDARD)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE PARENT_SCOPE)
developer_status_message("DEVEL" " >> Compiler: ${CMAKE_CXX_COMPILER} supports CXX Standard '${CHECK_CMAKE_CXX_STANDARD}' ...")
else()
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE)
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE PARENT_SCOPE)
endif()
endif()
set(CHECK_CMAKE_CXX_STANDARD 17)
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX17_STANDARD)
set(IS_COMPILER_SUPPORTS_CHECK "IS_COMPILER_SUPPORTS_CXX${CHECK_CMAKE_CXX_STANDARD}_STANDARD")
check_cxx_compiler_flag("-std=c++${CHECK_CMAKE_CXX_STANDARD}" COMPILER_SUPPORTS_CXX17_STANDARD)
if(COMPILER_SUPPORTS_CXX17_STANDARD)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE PARENT_SCOPE)
developer_status_message("DEVEL" " >> Compiler: ${CMAKE_CXX_COMPILER} supports CXX Standard '${CHECK_CMAKE_CXX_STANDARD}' ...")
else()
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE)
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE PARENT_SCOPE)
endif()
endif()
endif()
## Does it support the project C++ standard, ${CMAKE_CXX_STANDARD} = 20?
set(IS_COMPILER_SUPPORTS_MIN_STANDARD "${IS_COMPILER_SUPPORTS_CXX${CMAKE_CXX_STANDARD}_STANDARD}")
if(NOT IS_COMPILER_SUPPORTS_MIN_STANDARD)
message(FATAL_ERROR ">> Compiler: '${CMAKE_CXX_COMPILER}' v'${CMAKE_CXX_COMPILER_VERSION}' doesn't support CXX Standard '${CMAKE_CXX_STANDARD}'! \n"
" >> Project: '${${component_name}}' can't be built ...")
else()
message(STATUS ">> Compiler: '${CMAKE_CXX_COMPILER}' v'${CMAKE_CXX_COMPILER_VERSION}' supports the required CXX Standard '${CMAKE_CXX_STANDARD}' ...")
endif()
endfunction()
#
# Note: All macro definitions here
macro(set_variable_in_parent variable value)
get_directory_property(has_parent PARENT_DIRECTORY)
if(has_parent)
set(${variable} "${value}" PARENT_SCOPE)
else()
set(${variable} "${value}")
endif()
endmacro()
macro(setup_cmake target_name target_version)
message(STATUS ">> Building ${${target_name}} v${${target_version}} ...")
# If building shared libraries or linking static libraries into shared ones
if(TRANSFERBENCH_ENGINE_SHARED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "Set position independent code for all targets ..." FORCE)
endif()
message(STATUS ">> Configuring CMake to use the following build tools...")
check_compiler_requirements(${target_name})
#
find_program(CCACHE_PATH ccache)
find_program(NINJA_PATH ninja)
find_program(LD_LLD_PATH ld.lld)
find_program(LD_MOLD_PATH ld.mold)
if(NOT IS_LIGHTNING_CLANG_DEFAULT_COMPILER)
if(CCACHE_PATH)
set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PATH})
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PATH})
else()
message(WARNING ">> CCache was not found!")
endif()
endif()
if(NINJA_PATH)
set(CMAKE_GENERATOR Ninja)
else()
message(WARNING ">> Ninja was not found! Using default generator.")
endif()
# Let's give priority to the MOLD linker
set(AMD_PROJECT_LINKER_OPTION "")
if(LD_MOLD_PATH AND TRANSFERBENCH_LINKER_TRY_MOLD)
set(CMAKE_LINKER ${LD_MOLD_PATH} CACHE STRING "Linker to use: ${LD_MOLD_PATH}")
set(AMD_PROJECT_LINKER_OPTION "-fuse-ld=mold")
# Then LLD linker
elseif(LD_LLD_PATH)
set(CMAKE_LINKER ${LD_LLD_PATH} CACHE STRING "Linker to use: ${LD_LLD_PATH}")
set(AMD_PROJECT_LINKER_OPTION "-fuse-ld=lld")
else()
message(WARNING ">> Neither mold nor lld was found! Using the system default linker.")
endif()
if((LD_MOLD_PATH OR LD_LLD_PATH) AND AMD_PROJECT_LINKER_OPTION)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${AMD_PROJECT_LINKER_OPTION}")
message(STATUS ">> Using linker: '${CMAKE_LINKER}' with options: '${AMD_PROJECT_LINKER_OPTION}'")
endif()
# CMake policies for the project
foreach(_policy
CMP0028 CMP0046 CMP0048 CMP0051 CMP0054
CMP0056 CMP0063 CMP0065 CMP0074 CMP0075
CMP0077 CMP0082 CMP0093 CMP0127 CMP0135)
if(POLICY ${_policy})
cmake_policy(SET ${_policy} NEW)
endif()
endforeach()
set(CMAKE_WARN_DEPRECATED OFF CACHE BOOL "Disable deprecated warning messages" FORCE)
endmacro()
macro(add_build_definitions)
if(NOT PROJECT_TARGET_VERSION)
message(FATAL_ERROR ">> Project: 'PROJECT_TARGET_VERSION' was not defined!")
endif()
message(STATUS ">> Project: '${PROJECT_NAME}' v${${PROJECT_NAME}_VERSION} ...")
# Note: assumes AMD_PROJECT_VERSION_PATCH is defined alongside MAJOR/MINOR
set(CMAKE_RC_FLAGS "${CMAKE_RC_FLAGS} -DAMD_PROJECT_VERSION_MAJOR=${AMD_PROJECT_VERSION_MAJOR}
-DAMD_PROJECT_VERSION_MINOR=${AMD_PROJECT_VERSION_MINOR}
-DAMD_PROJECT_VERSION_PATCH=${AMD_PROJECT_VERSION_PATCH}")
if (TRANSFERBENCH_ENGINE_HEADER_ONLY)
add_compile_definitions(AMD_TRANSFERBENCH_ENGINE_HEADER_ONLY)
endif()
if (TRANSFERBENCH_ENGINE_STATIC)
add_compile_definitions(AMD_TRANSFERBENCH_ENGINE_STATIC)
endif()
if (TRANSFERBENCH_ENGINE_SHARED)
add_compile_definitions(AMD_TRANSFERBENCH_ENGINE_SHARED)
endif()
endmacro()
macro(setup_compiler_init_flags)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-ftrivial-auto-var-init=zero HAS_TRIVIAL_AUTO_VAR_INIT_COMPILER)
if(NOT COMPILER_INIT_FLAG)
if(HAS_TRIVIAL_AUTO_VAR_INIT_COMPILER)
message(STATUS ">> Compiler supports -ftrivial-auto-var-init")
set(COMPILER_INIT_FLAG "-ftrivial-auto-var-init=zero" CACHE STRING "Using cache trivially-copyable automatic variable initialization.")
else()
message(STATUS ">> Compiler does not support -ftrivial-auto-var-init")
set(COMPILER_INIT_FLAG " " CACHE STRING "Using cache trivially-copyable automatic variable initialization.")
endif()
endif()
## Initialize automatic variables with either a pattern or with zeroes to increase program security by preventing
## uninitialized memory disclosure and use. '-ftrivial-auto-var-init=[uninitialized|pattern|zero]' where
## 'uninitialized' is the default, 'pattern' initializes variables with a pattern, and 'zero' initializes variables
## with zeroes.
set(AMD_WORK_BENCH_COMMON_FLAGS "${AMD_WORK_BENCH_COMMON_FLAGS} ${COMPILER_INIT_FLAG}")
endmacro()
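# Example of the effect (illustrative): with '-ftrivial-auto-var-init=zero',
# a local 'int x;' that is read before assignment observes 0 instead of
# indeterminate memory.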
macro(setup_compression_flags)
include(CheckCXXCompilerFlag)
include(CheckLinkerFlag)
check_cxx_compiler_flag(-gz=zstd ZSTD_AVAILABLE_COMPILER)
check_linker_flag(CXX -gz=zstd ZSTD_AVAILABLE_LINKER)
check_cxx_compiler_flag(-gz COMPRESS_AVAILABLE_COMPILER)
check_linker_flag(CXX -gz COMPRESS_AVAILABLE_LINKER)
# From cache
if(NOT DEBUG_COMPRESSION_FLAG)
if(ZSTD_AVAILABLE_COMPILER AND ZSTD_AVAILABLE_LINKER)
message(STATUS ">> Compiler and Linker support ZSTD... using it.")
set(DEBUG_COMPRESSION_FLAG "-gz=zstd" CACHE STRING "Using cache for debug info compression.")
elseif(COMPRESS_AVAILABLE_COMPILER AND COMPRESS_AVAILABLE_LINKER)
message(STATUS ">> Compiler and Linker support default compression... using it.")
set(DEBUG_COMPRESSION_FLAG "-gz" CACHE STRING "Using cache for debug info compression.")
endif()
endif()
set(AMD_WORK_BENCH_COMMON_FLAGS "${AMD_WORK_BENCH_COMMON_FLAGS} ${DEBUG_COMPRESSION_FLAG}")
endmacro()
macro(setup_default_compiler_flags target_name)
# Compiler specific flags
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
add_common_flag("-Wall" ${target_name})
add_common_flag("-Wextra" ${target_name})
add_common_flag("-Wno-unused-function" ${target_name})
add_common_flag("-Wno-unused-variable" ${target_name})
add_common_flag("-Wpedantic" ${target_name})
if(TRANSFERBENCH_TREAT_WARNINGS_AS_ERRORS)
add_common_flag("-Werror" ${target_name})
endif()
if(CMAKE_SYSTEM_NAME MATCHES "Linux" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_common_flag("-rdynamic" ${target_name})
endif()
##
## -fno-omit-frame-pointer -fno-strict-aliasing -fvisibility=hidden -fvisibility-inlines-hidden
## -fno-exceptions -fno-rtti
add_cxx_flag("-fexceptions" ${target_name})
add_cxx_flag("-frtti" ${target_name})
add_cxx_flag("-fno-omit-frame-pointer" ${target_name})
add_c_cxx_flag("-Wno-array-bounds" ${target_name})
add_c_cxx_flag("-Wno-deprecated-declarations" ${target_name})
add_c_cxx_flag("-Wno-unknown-pragmas" ${target_name})
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_c_cxx_flag("-Wno-restrict" ${target_name})
add_c_cxx_flag("-Wno-stringop-overread" ${target_name})
add_c_cxx_flag("-Wno-stringop-overflow" ${target_name})
add_c_cxx_flag("-Wno-dangling-reference" ${target_name})
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_c_cxx_flag("-Wno-unknown-warning-option" ${target_name})
endif()
if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
add_common_flag("-O1" ${target_name})
if(TRANSFERBENCH_HARDENING_ENABLED)
## Building with _FORTIFY_SOURCE=3 may impact the size and performance of the code. Since _FORTIFY_SOURCE=2
## generated only constant sizes, its overhead was negligible. However, _FORTIFY_SOURCE=3 may generate
## additional code to compute object sizes. These additions may also cause secondary effects, such as register
## pressure during code generation. Code size tends to increase the size of resultant binaries for the same reason.
##
## _FORTIFY_SOURCE=3 has led to significant gains in security mitigation, but it may not be suitable for all
## applications. A proper study of performance and code size is needed to understand the magnitude of the impact
## of the additional runtime code generation, though the security benefits may well justify that cost.
## _FORTIFY_SOURCE requires compiling with optimization (-O).
##
add_common_flag("-U_FORTIFY_SOURCE" ${target_name})
add_common_flag("-D_FORTIFY_SOURCE=2" ${target_name})
## Stack canary check for buffer overflows on the stack.
## Emit extra code to check for buffer overflows, such as stack smashing attacks. This is done by adding a guard
## variable to functions with vulnerable objects. This includes functions that call alloca, and functions with
## buffers larger than or equal to 8 bytes.
## 'stack-protector-strong' is a stronger version of 'stack-protector' that protects additional functions: those
## that have local array definitions, or have references to local frame addresses. Only variables that are
## actually allocated on the stack are considered; optimized-away variables or variables allocated in registers
## don't count.
##
add_common_flag("-fstack-protector-strong" ${target_name})
endif()
endif()
if(TRANSFERBENCH_COMPRESS_DEBUG_INFO)
setup_compression_flags()
endif()
## Compiler initialization flags
setup_compiler_init_flags()
## Full debug info for all build types except RelWithDebInfo (which keeps the default debug level)
if (NOT CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
add_c_cxx_flag("-g3" ${target_name})
endif()
## Inline-function debug info (GCC only)
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_c_cxx_flag("-ginline-points" ${target_name})
add_c_cxx_flag("-gstatement-frontiers" ${target_name})
endif()
endif()
endif()
## TODO: Check if RPATH settings are needed
endmacro()
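# Usage sketch (hypothetical target name):
#   add_executable(TransferBenchClient src/Client.cpp)
#   setup_default_compiler_flags(TransferBenchClient)
#   add_cppcheck(TransferBenchClient)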
macro(developer_status_message message_mode message)
# Note: This macro is used to print developer messages.
has_build_debug_mode(HAS_DEBUG_MODE_ENABLED)
if(HAS_DEBUG_MODE_ENABLED)
# Check for a valid message mode
# Note: 'DEVEL' is emitted as a 'STATUS' message; unsupported modes trigger a warning instead.
if(NOT "${message_mode}" MATCHES "^(STATUS|WARNING|ERROR|DEBUG|FATAL_ERROR|DEVEL)$")
message(WARNING "[DEVELOPER]: The '${message_mode}' message mode is not supported for message: '${message}' .")
else()
# message(${message_mode} ...) cannot be used directly: CMake would treat the expanded mode as part of the message string, so map each supported mode explicitly.
if("${message_mode}" STREQUAL "STATUS")
message(STATUS "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "WARNING")
message(WARNING "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "ERROR")
message(SEND_ERROR "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "DEBUG")
message(DEBUG "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "FATAL_ERROR")
message(FATAL_ERROR "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "DEVEL")
message(STATUS "[DEVELOPER]: ${message}")
else()
message(WARNING "[DEVELOPER]: ${message}, with invalid message mode: '${message_mode}'")
endif()
endif()
endif()
endmacro()
# MIT License
#
# Copyright (c) 2023-25 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
# Test dependencies
#==================================================================================================
include(FetchContent)
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
# Find or download/install rocm-cmake project
#==================================================================================================
find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
if((NOT ROCmCMakeBuildTools_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCmCMakeBuildTools not found. Checking for ROCM (deprecated)")
find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}") # deprecated fallback
if((NOT ROCM_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCM (deprecated) not found. Downloading and building ROCmCMakeBuildTools")
set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")
FetchContent_Declare(
rocm-cmake
GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
GIT_TAG ${rocm_cmake_tag}
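# Deliberately invalid SOURCE_SUBDIR: keeps FetchContent_MakeAvailable() from
# calling add_subdirectory() on rocm-cmake (only the download step is wanted)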
SOURCE_SUBDIR "DISABLE ADDING TO BUILD"
)
FetchContent_MakeAvailable(rocm-cmake)
message(STATUS "rocm-cmake_SOURCE_DIR: ${rocm-cmake_SOURCE_DIR}")
find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
elseif(ROCmCMakeBuildTools_FOUND)
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#==================================================================================================
function(rocm_local_targets VARIABLE)
set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
execute_process(
COMMAND "${_rocm_agent_enumerator}"
RESULT_VARIABLE _found_agents
OUTPUT_VARIABLE _rocm_agents
ERROR_QUIET
)
if (_found_agents EQUAL 0)
string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
unset(result)
foreach (agent IN LISTS _rocm_agents)
if (NOT agent STREQUAL "gfx000")
list(APPEND result "${agent}")
endif()
endforeach()
if(result)
list(REMOVE_DUPLICATES result)
set(${VARIABLE} "${result}" PARENT_SCOPE)
endif()
endif()
endif()
endfunction()
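# Usage sketch (variable names are placeholders for whatever target-list
# variable the project consumes):
#   rocm_local_targets(DETECTED_GFX_TARGETS)
#   if(DETECTED_GFX_TARGETS)
#     set(GPU_TARGETS "${DETECTED_GFX_TARGETS}" CACHE STRING "GPU targets to compile for")
#   endif()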
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
find_path(NUMA_INCLUDE_DIR numa.h)
find_library(NUMA_LIBRARIES numa)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NUMA
DEFAULT_MSG
NUMA_LIBRARIES NUMA_INCLUDE_DIR)
mark_as_advanced(NUMA_LIBRARIES NUMA_INCLUDE_DIR)
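# Usage sketch for this find module (target name is a placeholder):
#   find_package(NUMA REQUIRED)
#   target_include_directories(TransferBenchClient PRIVATE ${NUMA_INCLUDE_DIR})
#   target_link_libraries(TransferBenchClient PRIVATE ${NUMA_LIBRARIES})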
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#[=======================================================================[.rst:
FindPackageHandleStandardArgs
-----------------------------
This module provides functions intended to be used in :ref:`Find Modules`
implementing :command:`find_package(<PackageName>)` calls.
.. command:: find_package_handle_standard_args
This command handles the ``REQUIRED``, ``QUIET`` and version-related
arguments of :command:`find_package`. It also sets the
``<PackageName>_FOUND`` variable. The package is considered found if all
variables listed contain valid results, e.g. valid filepaths.
There are two signatures:
.. code-block:: cmake
find_package_handle_standard_args(<PackageName>
(DEFAULT_MSG|<custom-failure-message>)
<required-var>...
)
find_package_handle_standard_args(<PackageName>
[FOUND_VAR <result-var>]
[REQUIRED_VARS <required-var>...]
[VERSION_VAR <version-var>]
[HANDLE_VERSION_RANGE]
[HANDLE_COMPONENTS]
[CONFIG_MODE]
[NAME_MISMATCHED]
[REASON_FAILURE_MESSAGE <reason-failure-message>]
[FAIL_MESSAGE <custom-failure-message>]
)
The ``<PackageName>_FOUND`` variable will be set to ``TRUE`` if all
the variables ``<required-var>...`` are valid and any optional
constraints are satisfied, and ``FALSE`` otherwise. A success or
failure message may be displayed based on the results and on
whether the ``REQUIRED`` and/or ``QUIET`` option was given to
the :command:`find_package` call.
The options are:
``(DEFAULT_MSG|<custom-failure-message>)``
In the simple signature this specifies the failure message.
Use ``DEFAULT_MSG`` to ask for a default message to be computed
(recommended). Not valid in the full signature.
``FOUND_VAR <result-var>``
.. deprecated:: 3.3
Specifies either ``<PackageName>_FOUND`` or
``<PACKAGENAME>_FOUND`` as the result variable. This exists only
for compatibility with older versions of CMake and is now ignored.
Result variables of both names are always set for compatibility.
``REQUIRED_VARS <required-var>...``
Specify the variables which are required for this package.
These may be named in the generated failure message asking the
user to set the missing variable values. Therefore these should
typically be cache entries such as ``FOO_LIBRARY`` and not output
variables like ``FOO_LIBRARIES``.
.. versionchanged:: 3.18
If ``HANDLE_COMPONENTS`` is specified, this option can be omitted.
``VERSION_VAR <version-var>``
Specify the name of a variable that holds the version of the package
that has been found. This version will be checked against the
(potentially) specified required version given to the
:command:`find_package` call, including its ``EXACT`` option.
The default messages include information about the required
version and the version which has been actually found, both
if the version is ok or not.
``HANDLE_VERSION_RANGE``
.. versionadded:: 3.19
Enable handling of a version range, if one is specified. Without this
option, a developer warning will be displayed if a version range is
specified.
``HANDLE_COMPONENTS``
Enable handling of package components. In this case, the command
will report which components have been found and which are missing,
and the ``<PackageName>_FOUND`` variable will be set to ``FALSE``
if any of the required components (i.e. not the ones listed after
the ``OPTIONAL_COMPONENTS`` option of :command:`find_package`) are
missing.
``CONFIG_MODE``
Specify that the calling find module is a wrapper around a
call to ``find_package(<PackageName> NO_MODULE)``. This implies
a ``VERSION_VAR`` value of ``<PackageName>_VERSION``. The command
will automatically check whether the package configuration file
was found.
``REASON_FAILURE_MESSAGE <reason-failure-message>``
.. versionadded:: 3.16
Specify a custom message of the reason for the failure which will be
appended to the default generated message.
``FAIL_MESSAGE <custom-failure-message>``
Specify a custom failure message instead of using the default
generated message. Not recommended.
``NAME_MISMATCHED``
.. versionadded:: 3.17
Indicate that the ``<PackageName>`` does not match
``${CMAKE_FIND_PACKAGE_NAME}``. This is usually a mistake and raises a
warning, but it may be intentional for usage of the command for components
of a larger package.
Example for the simple signature:
.. code-block:: cmake
find_package_handle_standard_args(LibXml2 DEFAULT_MSG
LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR)
The ``LibXml2`` package is considered to be found if both
``LIBXML2_LIBRARY`` and ``LIBXML2_INCLUDE_DIR`` are valid.
Then also ``LibXml2_FOUND`` is set to ``TRUE``. If it is not found
and ``REQUIRED`` was used, it fails with a
:command:`message(FATAL_ERROR)`, independent whether ``QUIET`` was
used or not. If it is found, success will be reported, including
the content of the first ``<required-var>``. On repeated CMake runs,
the same message will not be printed again.
.. note::
If ``<PackageName>`` does not match ``CMAKE_FIND_PACKAGE_NAME`` for the
calling module, a warning that there is a mismatch is given. The
``FPHSA_NAME_MISMATCHED`` variable may be set to bypass the warning if using
the old signature and the ``NAME_MISMATCHED`` argument using the new
signature. To avoid forcing the caller to require newer versions of CMake for
usage, the variable's value will be used if defined when the
``NAME_MISMATCHED`` argument is not passed for the new signature (but using
both is an error).
Example for the full signature:
.. code-block:: cmake
find_package_handle_standard_args(LibArchive
REQUIRED_VARS LibArchive_LIBRARY LibArchive_INCLUDE_DIR
VERSION_VAR LibArchive_VERSION)
In this case, the ``LibArchive`` package is considered to be found if
both ``LibArchive_LIBRARY`` and ``LibArchive_INCLUDE_DIR`` are valid.
Also the version of ``LibArchive`` will be checked by using the version
contained in ``LibArchive_VERSION``. Since no ``FAIL_MESSAGE`` is given,
the default messages will be printed.
Another example for the full signature:
.. code-block:: cmake
find_package(Automoc4 QUIET NO_MODULE HINTS /opt/automoc4)
find_package_handle_standard_args(Automoc4 CONFIG_MODE)
In this case, a ``FindAutomoc4.cmake`` module wraps a call to
``find_package(Automoc4 NO_MODULE)`` and adds an additional search
directory for ``automoc4``. Then the call to
``find_package_handle_standard_args`` produces a proper success/failure
message.
.. command:: find_package_check_version
.. versionadded:: 3.19
Helper function which can be used to check if a ``<version>`` is valid
against version-related arguments of :command:`find_package`.
.. code-block:: cmake
find_package_check_version(<version> <result-var>
[HANDLE_VERSION_RANGE]
[RESULT_MESSAGE_VARIABLE <message-var>]
)
The ``<result-var>`` will hold a boolean value giving the result of the check.
The options are:
``HANDLE_VERSION_RANGE``
Enable handling of a version range, if one is specified. Without this
option, a developer warning will be displayed if a version range is
specified.
``RESULT_MESSAGE_VARIABLE <message-var>``
Specify a variable to get back a message describing the result of the check.
Example for the usage:
.. code-block:: cmake
find_package_check_version(1.2.3 result HANDLE_VERSION_RANGE
RESULT_MESSAGE_VARIABLE reason)
if (result)
message (STATUS "${reason}")
else()
message (FATAL_ERROR "${reason}")
endif()
#]=======================================================================]
include(${CMAKE_CURRENT_LIST_DIR}/FindPackageMessage.cmake)
cmake_policy(PUSH)
# numbers and boolean constants
cmake_policy (SET CMP0012 NEW)
# IN_LIST operator
cmake_policy (SET CMP0057 NEW)
# internal helper macro
macro(_FPHSA_FAILURE_MESSAGE _msg)
set (__msg "${_msg}")
if (FPHSA_REASON_FAILURE_MESSAGE)
string(APPEND __msg "\n Reason given by package: ${FPHSA_REASON_FAILURE_MESSAGE}\n")
endif()
if (${_NAME}_FIND_REQUIRED)
message(FATAL_ERROR "${__msg}")
else ()
if (NOT ${_NAME}_FIND_QUIETLY)
message(STATUS "${__msg}")
endif ()
endif ()
endmacro()
# internal helper macro to generate the failure message when used in CONFIG_MODE:
macro(_FPHSA_HANDLE_FAILURE_CONFIG_MODE)
# <PackageName>_CONFIG is set, but FOUND is false, this means that some other of the REQUIRED_VARS was not found:
if(${_NAME}_CONFIG)
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: missing:${MISSING_VARS} (found ${${_NAME}_CONFIG} ${VERSION_MSG})")
else()
# If _CONSIDERED_CONFIGS is set, the config-file has been found, but no suitable version.
# List them all in the error message:
if(${_NAME}_CONSIDERED_CONFIGS)
set(configsText "")
list(LENGTH ${_NAME}_CONSIDERED_CONFIGS configsCount)
math(EXPR configsCount "${configsCount} - 1")
foreach(currentConfigIndex RANGE ${configsCount})
list(GET ${_NAME}_CONSIDERED_CONFIGS ${currentConfigIndex} filename)
list(GET ${_NAME}_CONSIDERED_VERSIONS ${currentConfigIndex} version)
string(APPEND configsText "\n ${filename} (version ${version})")
endforeach()
if (${_NAME}_NOT_FOUND_MESSAGE)
if (FPHSA_REASON_FAILURE_MESSAGE)
string(PREPEND FPHSA_REASON_FAILURE_MESSAGE "${${_NAME}_NOT_FOUND_MESSAGE}\n ")
else()
set(FPHSA_REASON_FAILURE_MESSAGE "${${_NAME}_NOT_FOUND_MESSAGE}")
endif()
else()
string(APPEND configsText "\n")
endif()
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} ${VERSION_MSG}, checked the following files:${configsText}")
else()
# Simple case: No Config-file was found at all:
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: found neither ${_NAME}Config.cmake nor ${_NAME_LOWER}-config.cmake ${VERSION_MSG}")
endif()
endif()
endmacro()
function(FIND_PACKAGE_CHECK_VERSION version result)
cmake_parse_arguments (PARSE_ARGV 2 FPCV "HANDLE_VERSION_RANGE;NO_AUTHOR_WARNING_VERSION_RANGE" "RESULT_MESSAGE_VARIABLE" "")
if (FPCV_UNPARSED_ARGUMENTS)
message (FATAL_ERROR "find_package_check_version(): ${FPCV_UNPARSED_ARGUMENTS}: unexpected arguments")
endif()
if ("RESULT_MESSAGE_VARIABLE" IN_LIST FPCV_KEYWORDS_MISSING_VALUES)
message (FATAL_ERROR "find_package_check_version(): RESULT_MESSAGE_VARIABLE expects an argument")
endif()
set (${result} FALSE PARENT_SCOPE)
if (FPCV_RESULT_MESSAGE_VARIABLE)
unset (${FPCV_RESULT_MESSAGE_VARIABLE} PARENT_SCOPE)
endif()
if (_CMAKE_FPHSA_PACKAGE_NAME)
set (package "${_CMAKE_FPHSA_PACKAGE_NAME}")
elseif (CMAKE_FIND_PACKAGE_NAME)
set (package "${CMAKE_FIND_PACKAGE_NAME}")
else()
message (FATAL_ERROR "find_package_check_version(): Cannot be used outside a 'Find Module'")
endif()
if (NOT FPCV_NO_AUTHOR_WARNING_VERSION_RANGE
AND ${package}_FIND_VERSION_RANGE AND NOT FPCV_HANDLE_VERSION_RANGE)
message(AUTHOR_WARNING
"`find_package()` specify a version range but the option "
"HANDLE_VERSION_RANGE` is not passed to `find_package_check_version()`. "
"Only the lower endpoint of the range will be used.")
endif()
set (version_ok FALSE)
unset (version_msg)
if (FPCV_HANDLE_VERSION_RANGE AND ${package}_FIND_VERSION_RANGE)
if ((${package}_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE"
AND version VERSION_GREATER_EQUAL ${package}_FIND_VERSION_MIN)
AND ((${package}_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE"
AND version VERSION_LESS_EQUAL ${package}_FIND_VERSION_MAX)
OR (${package}_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE"
AND version VERSION_LESS ${package}_FIND_VERSION_MAX)))
set (version_ok TRUE)
set(version_msg "(found suitable version \"${version}\", required range is \"${${package}_FIND_VERSION_RANGE}\")")
else()
set(version_msg "Found unsuitable version \"${version}\", required range is \"${${package}_FIND_VERSION_RANGE}\"")
endif()
elseif (DEFINED ${package}_FIND_VERSION)
if(${package}_FIND_VERSION_EXACT) # exact version required
# count the dots in the version string
string(REGEX REPLACE "[^.]" "" version_dots "${version}")
# append one dot so the dot count equals the number of components
string(LENGTH "${version_dots}." version_dots)
if (version_dots GREATER ${package}_FIND_VERSION_COUNT)
# Because of the C++ implementation of find_package() ${package}_FIND_VERSION_COUNT
# is at most 4 here. Therefore a simple lookup table is used.
if (${package}_FIND_VERSION_COUNT EQUAL 1)
set(version_regex "[^.]*")
elseif (${package}_FIND_VERSION_COUNT EQUAL 2)
set(version_regex "[^.]*\\.[^.]*")
elseif (${package}_FIND_VERSION_COUNT EQUAL 3)
set(version_regex "[^.]*\\.[^.]*\\.[^.]*")
else()
set(version_regex "[^.]*\\.[^.]*\\.[^.]*\\.[^.]*")
endif()
string(REGEX REPLACE "^(${version_regex})\\..*" "\\1" version_head "${version}")
if (NOT ${package}_FIND_VERSION VERSION_EQUAL version_head)
set(version_msg "Found unsuitable version \"${version}\", but required is exact version \"${${package}_FIND_VERSION}\"")
else ()
set(version_ok TRUE)
set(version_msg "(found suitable exact version \"${version}\")")
endif ()
else ()
if (NOT ${package}_FIND_VERSION VERSION_EQUAL version)
set(version_msg "Found unsuitable version \"${version}\", but required is exact version \"${${package}_FIND_VERSION}\"")
else ()
set(version_ok TRUE)
set(version_msg "(found suitable exact version \"${version}\")")
endif ()
endif ()
else() # minimum version
if (${package}_FIND_VERSION VERSION_GREATER version)
set(version_msg "Found unsuitable version \"${version}\", but required is at least \"${${package}_FIND_VERSION}\"")
else()
set(version_ok TRUE)
set(version_msg "(found suitable version \"${version}\", minimum required is \"${${package}_FIND_VERSION}\")")
endif()
endif()
else ()
set(version_ok TRUE)
set(version_msg "(found version \"${version}\")")
endif()
set (${result} ${version_ok} PARENT_SCOPE)
if (FPCV_RESULT_MESSAGE_VARIABLE)
set (${FPCV_RESULT_MESSAGE_VARIABLE} "${version_msg}" PARENT_SCOPE)
endif()
endfunction()
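# Example (illustrative sketch, not part of this module): a Find module that has
# stored a detected version in `Foo_VERSION` (`Foo` is a hypothetical package)
# could validate it against the version requested by `find_package()` like this:
#
#   find_package_check_version("${Foo_VERSION}" Foo_VERSION_OK
#     HANDLE_VERSION_RANGE
#     RESULT_MESSAGE_VARIABLE Foo_VERSION_MSG)
#   if(NOT Foo_VERSION_OK)
#     message(STATUS "${Foo_VERSION_MSG}")
#   endif()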
function(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FIRST_ARG)
# Set up the arguments for `cmake_parse_arguments`.
set(options CONFIG_MODE HANDLE_COMPONENTS NAME_MISMATCHED HANDLE_VERSION_RANGE)
set(oneValueArgs FAIL_MESSAGE REASON_FAILURE_MESSAGE VERSION_VAR FOUND_VAR)
set(multiValueArgs REQUIRED_VARS)
# Check whether we are in 'simple' or 'extended' mode:
set(_KEYWORDS_FOR_EXTENDED_MODE ${options} ${oneValueArgs} ${multiValueArgs} )
list(FIND _KEYWORDS_FOR_EXTENDED_MODE "${_FIRST_ARG}" INDEX)
unset(FPHSA_NAME_MISMATCHED_override)
if (DEFINED FPHSA_NAME_MISMATCHED)
# If the NAME_MISMATCHED variable is set, error if it is also passed as an
# argument. The former is for old signatures, the latter is for new
# signatures.
list(FIND ARGN "NAME_MISMATCHED" name_mismatched_idx)
if (NOT name_mismatched_idx EQUAL "-1")
message(FATAL_ERROR
"The `NAME_MISMATCHED` argument may only be specified by the argument or "
"the variable, not both.")
endif ()
# But use the variable if it is not an argument to avoid forcing minimum
# CMake version bumps for calling modules.
set(FPHSA_NAME_MISMATCHED_override "${FPHSA_NAME_MISMATCHED}")
endif ()
if(${INDEX} EQUAL -1)
set(FPHSA_FAIL_MESSAGE ${_FIRST_ARG})
set(FPHSA_REQUIRED_VARS ${ARGN})
set(FPHSA_VERSION_VAR)
else()
cmake_parse_arguments(FPHSA "${options}" "${oneValueArgs}" "${multiValueArgs}" ${_FIRST_ARG} ${ARGN})
if(FPHSA_UNPARSED_ARGUMENTS)
message(FATAL_ERROR "Unknown keywords given to FIND_PACKAGE_HANDLE_STANDARD_ARGS(): \"${FPHSA_UNPARSED_ARGUMENTS}\"")
endif()
if(NOT FPHSA_FAIL_MESSAGE)
set(FPHSA_FAIL_MESSAGE "DEFAULT_MSG")
endif()
# In config-mode, we rely on the variable <PackageName>_CONFIG, which is set by find_package()
# when it successfully found the config-file, including version checking:
if(FPHSA_CONFIG_MODE)
list(INSERT FPHSA_REQUIRED_VARS 0 ${_NAME}_CONFIG)
list(REMOVE_DUPLICATES FPHSA_REQUIRED_VARS)
set(FPHSA_VERSION_VAR ${_NAME}_VERSION)
endif()
if(NOT FPHSA_REQUIRED_VARS AND NOT FPHSA_HANDLE_COMPONENTS)
message(FATAL_ERROR "No REQUIRED_VARS specified for FIND_PACKAGE_HANDLE_STANDARD_ARGS()")
endif()
endif()
if (DEFINED FPHSA_NAME_MISMATCHED_override)
set(FPHSA_NAME_MISMATCHED "${FPHSA_NAME_MISMATCHED_override}")
endif ()
if (DEFINED CMAKE_FIND_PACKAGE_NAME
AND NOT FPHSA_NAME_MISMATCHED
AND NOT _NAME STREQUAL CMAKE_FIND_PACKAGE_NAME)
message(AUTHOR_WARNING
"The package name passed to `find_package_handle_standard_args` "
"(${_NAME}) does not match the name of the calling package "
"(${CMAKE_FIND_PACKAGE_NAME}). This can lead to problems in calling "
"code that expects `find_package` result variables (e.g., `_FOUND`) "
"to follow a certain pattern.")
endif ()
if (${_NAME}_FIND_VERSION_RANGE AND NOT FPHSA_HANDLE_VERSION_RANGE)
message(AUTHOR_WARNING
"`find_package()` specifies a version range but the module ${_NAME} does "
"not support this capability. Only the lower endpoint of the range "
"will be used.")
endif()
# to propagate package name to FIND_PACKAGE_CHECK_VERSION
set(_CMAKE_FPHSA_PACKAGE_NAME "${_NAME}")
# now that we collected all arguments, process them
if("x${FPHSA_FAIL_MESSAGE}" STREQUAL "xDEFAULT_MSG")
set(FPHSA_FAIL_MESSAGE "Could NOT find ${_NAME}")
endif()
if (FPHSA_REQUIRED_VARS)
list(GET FPHSA_REQUIRED_VARS 0 _FIRST_REQUIRED_VAR)
endif()
string(TOUPPER ${_NAME} _NAME_UPPER)
string(TOLOWER ${_NAME} _NAME_LOWER)
if(FPHSA_FOUND_VAR)
set(_FOUND_VAR_UPPER ${_NAME_UPPER}_FOUND)
set(_FOUND_VAR_MIXED ${_NAME}_FOUND)
if(FPHSA_FOUND_VAR STREQUAL _FOUND_VAR_MIXED OR FPHSA_FOUND_VAR STREQUAL _FOUND_VAR_UPPER)
set(_FOUND_VAR ${FPHSA_FOUND_VAR})
else()
message(FATAL_ERROR "The argument for FOUND_VAR is \"${FPHSA_FOUND_VAR}\", but only \"${_FOUND_VAR_MIXED}\" and \"${_FOUND_VAR_UPPER}\" are valid names.")
endif()
else()
set(_FOUND_VAR ${_NAME_UPPER}_FOUND)
endif()
# collect all variables which were not found, so they can be printed, so the
# user knows better what went wrong (#6375)
set(MISSING_VARS "")
set(DETAILS "")
# check if all passed variables are valid
set(FPHSA_FOUND_${_NAME} TRUE)
foreach(_CURRENT_VAR ${FPHSA_REQUIRED_VARS})
if(NOT ${_CURRENT_VAR})
set(FPHSA_FOUND_${_NAME} FALSE)
string(APPEND MISSING_VARS " ${_CURRENT_VAR}")
else()
string(APPEND DETAILS "[${${_CURRENT_VAR}}]")
endif()
endforeach()
if(FPHSA_FOUND_${_NAME})
set(${_NAME}_FOUND TRUE)
set(${_NAME_UPPER}_FOUND TRUE)
else()
set(${_NAME}_FOUND FALSE)
set(${_NAME_UPPER}_FOUND FALSE)
endif()
# component handling
unset(FOUND_COMPONENTS_MSG)
unset(MISSING_COMPONENTS_MSG)
if(FPHSA_HANDLE_COMPONENTS)
foreach(comp ${${_NAME}_FIND_COMPONENTS})
if(${_NAME}_${comp}_FOUND)
if(NOT DEFINED FOUND_COMPONENTS_MSG)
set(FOUND_COMPONENTS_MSG "found components:")
endif()
string(APPEND FOUND_COMPONENTS_MSG " ${comp}")
else()
if(NOT DEFINED MISSING_COMPONENTS_MSG)
set(MISSING_COMPONENTS_MSG "missing components:")
endif()
string(APPEND MISSING_COMPONENTS_MSG " ${comp}")
if(${_NAME}_FIND_REQUIRED_${comp})
set(${_NAME}_FOUND FALSE)
string(APPEND MISSING_VARS " ${comp}")
endif()
endif()
endforeach()
set(COMPONENT_MSG "${FOUND_COMPONENTS_MSG} ${MISSING_COMPONENTS_MSG}")
string(APPEND DETAILS "[c${COMPONENT_MSG}]")
endif()
# version handling:
set(VERSION_MSG "")
set(VERSION_OK TRUE)
# check that the version variable is not empty to avoid emitting a misleading
# message (i.e. `Found unsuitable version ""`)
if (DEFINED ${_NAME}_FIND_VERSION)
if(DEFINED ${FPHSA_VERSION_VAR})
if(NOT "${${FPHSA_VERSION_VAR}}" STREQUAL "")
set(_FOUND_VERSION ${${FPHSA_VERSION_VAR}})
if (FPHSA_HANDLE_VERSION_RANGE)
set (FPCV_HANDLE_VERSION_RANGE HANDLE_VERSION_RANGE)
else()
set(FPCV_HANDLE_VERSION_RANGE NO_AUTHOR_WARNING_VERSION_RANGE)
endif()
find_package_check_version ("${_FOUND_VERSION}" VERSION_OK RESULT_MESSAGE_VARIABLE VERSION_MSG
${FPCV_HANDLE_VERSION_RANGE})
else()
set(VERSION_OK FALSE)
endif()
endif()
if("${${FPHSA_VERSION_VAR}}" STREQUAL "")
# if the package was not found, but a version was given, add that to the output:
if(${_NAME}_FIND_VERSION_EXACT)
set(VERSION_MSG "(Required is exact version \"${${_NAME}_FIND_VERSION}\")")
elseif (FPHSA_HANDLE_VERSION_RANGE AND ${_NAME}_FIND_VERSION_RANGE)
set(VERSION_MSG "(Required is version range \"${${_NAME}_FIND_VERSION_RANGE}\")")
else()
set(VERSION_MSG "(Required is at least version \"${${_NAME}_FIND_VERSION}\")")
endif()
endif()
else ()
# Check with DEFINED as the found version may be 0.
if(DEFINED ${FPHSA_VERSION_VAR})
set(VERSION_MSG "(found version \"${${FPHSA_VERSION_VAR}}\")")
endif()
endif ()
if(VERSION_OK)
string(APPEND DETAILS "[v${${FPHSA_VERSION_VAR}}(${${_NAME}_FIND_VERSION})]")
else()
set(${_NAME}_FOUND FALSE)
endif()
# print the result:
if (${_NAME}_FOUND)
FIND_PACKAGE_MESSAGE(${_NAME} "Found ${_NAME}: ${${_FIRST_REQUIRED_VAR}} ${VERSION_MSG} ${COMPONENT_MSG}" "${DETAILS}")
else ()
if(FPHSA_CONFIG_MODE)
_FPHSA_HANDLE_FAILURE_CONFIG_MODE()
else()
if(NOT VERSION_OK)
set(RESULT_MSG)
if (_FIRST_REQUIRED_VAR)
string (APPEND RESULT_MSG "found ${${_FIRST_REQUIRED_VAR}}")
endif()
if (COMPONENT_MSG)
if (RESULT_MSG)
string (APPEND RESULT_MSG ", ")
endif()
string (APPEND RESULT_MSG "${FOUND_COMPONENTS_MSG}")
endif()
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: ${VERSION_MSG} (${RESULT_MSG})")
else()
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} (missing:${MISSING_VARS}) ${VERSION_MSG}")
endif()
endif()
endif ()
set(${_NAME}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
set(${_NAME_UPPER}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
endfunction()
cmake_policy(POP)
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#[=======================================================================[.rst:
FindPackageMessage
------------------
.. code-block:: cmake
find_package_message(<name> "message for user" "find result details")
This function is intended to be used in FindXXX.cmake modules files.
It will print a message once for each unique find result. This is
useful for telling the user where a package was found. The first
argument specifies the name (XXX) of the package. The second argument
specifies the message to display. The third argument lists details
about the find result so that if they change the message will be
displayed again. The macro also obeys the QUIET argument to the
find_package command.
Example:
.. code-block:: cmake
if(X11_FOUND)
find_package_message(X11 "Found X11: ${X11_X11_LIB}"
"[${X11_X11_LIB}][${X11_INCLUDE_DIR}]")
else()
...
endif()
#]=======================================================================]
function(find_package_message pkg msg details)
# Avoid printing a message repeatedly for the same find result.
if(NOT ${pkg}_FIND_QUIETLY)
string(REPLACE "\n" "" details "${details}")
set(DETAILS_VAR FIND_PACKAGE_MESSAGE_DETAILS_${pkg})
if(NOT "${details}" STREQUAL "${${DETAILS_VAR}}")
# The message has not yet been printed.
message(STATUS "${msg}")
# Save the find details in the cache to avoid printing the same
# message again.
set("${DETAILS_VAR}" "${details}"
CACHE INTERNAL "Details about finding ${pkg}")
endif()
endif()
endfunction()
# MIT License
#
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
# --- CMake Toolchain File for using Clang from ROCm build environment ---
#
# To use this, invoke CMake like this:
#   export ROCM_INSTALL_PATH=/opt/rocm[-<rocm_version>]  # or reuse an existing ROCM_PATH
#   examples:
#     export ROCM_INSTALL_PATH=/opt/rocm-6.5.0
#     export ROCM_INSTALL_PATH=/opt/rocm
#     export ROCM_INSTALL_PATH=$ROCM_PATH
#
# cmake -DCMAKE_TOOLCHAIN_FILE=./src/cmake/rocm-clang-toolchain.cmake ...
#
# This toolchain file assumes you are building for the host system (e.g., Linux x86_64)
# but specifically using the Clang toolchain provided with ROCm.
#
#
cmake_minimum_required(VERSION 3.25)
#
# --- CMake OS version checkpoint ---
#
# On some distros/versions, the default compiler and standard library do not meet
# the minimum C++20 (or newer) requirement. As those cannot be updated, and as our
# compiler 'Lightning/Clang++' is *built without* its 'libc++' component, some
# much-needed features used by the source code are unavailable.
# Here we check for those distros/versions so we can skip the build gracefully,
# with no build failures.
#
# For now, we are checking for:
# NAME="Red Hat Enterprise Linux"
# VERSION_ID="8.8"
# ||
# NAME="Debian GNU/Linux"
# VERSION_ID="10"
#
# Note: CMake regex does not support multiline mode by default, so '^' and '$' only
#       match the beginning and end of the entire string, not the start and end of
#       individual lines.
# string(REGEX MATCH "NAME=\"?([^\n\"]+)\"?" _ "${OS_RELEASE_FILE_INFO}") would match
#       the wrong line when the file contains:
#       PRETTY_NAME="Debian GNU/Linux 10 (buster)"
#       NAME="Debian GNU/Linux"
# We work around this by prepending a newline manually, simulating line-by-line matching:
# string(REGEX MATCH "\nNAME=\"([^\"]+)\"" _name_match "\n${OS_RELEASE_FILE_INFO}")
#
# --- ROCm default compiler/toolchain ---
# If already set, skip further processing.
if(IS_LIGHTNING_CLANG_DEFAULT_COMPILER AND ROCM_CLANG_TOOLCHAIN_USED)
message(STATUS ">> ROCm 'Lightning Clang++' toolchain is already set as default compiler.")
return()
endif()
set(SKIP_BUILD_PROCESS OFF)
set(OS_RELEASE_FILE "/etc/os-release")
if(EXISTS ${OS_RELEASE_FILE})
file(READ "${OS_RELEASE_FILE}" OS_RELEASE_FILE_INFO)
string(REGEX MATCH "\nNAME=\"([^\"]+)\"" _name_match "\n${OS_RELEASE_FILE_INFO}")
set(DISTRO_NAME "${CMAKE_MATCH_1}")
string(REGEX MATCH "\nVERSION_ID=\"([^\"]+)\"" _version_match "\n${OS_RELEASE_FILE_INFO}")
set(DISTRO_VERSION_ID "${CMAKE_MATCH_1}")
message(STATUS ">> ROCm Clang Toolchain Environment Detected: '${DISTRO_NAME}', v'${DISTRO_VERSION_ID}'")
## Check for unsupported distros/versions
## That is, distros/versions with compilers and std libraries not supporting C++20 fully.
if((DISTRO_NAME STREQUAL "Red Hat Enterprise Linux" AND DISTRO_VERSION_ID VERSION_EQUAL "8.8")
OR (DISTRO_NAME STREQUAL "Debian GNU/Linux" AND DISTRO_VERSION_ID VERSION_LESS_EQUAL "10"))
# CACHE INTERNAL makes sure the SKIP_BUILD_PROCESS variable survives into the main CMake context
set(SKIP_BUILD_PROCESS ON CACHE INTERNAL "Skip build process for this OS version")
file(WRITE "${CMAKE_BINARY_DIR}/rbt_skip_build_process.flag" "1")
message(WARNING ">> Build not supported: '${DISTRO_NAME}', v'${DISTRO_VERSION_ID}'")
endif()
else()
set(SKIP_BUILD_PROCESS ON)
message(WARNING ">> Unable to read OS release file: '${OS_RELEASE_FILE}'")
endif()
#
# --- ROCm Build Path Setup ---
if(DEFINED ENV{ROCM_INSTALL_PATH})
set(ROCM_BASE_PATH "$ENV{ROCM_INSTALL_PATH}")
elseif(DEFINED ENV{ROCM_PATH})
set(ROCM_BASE_PATH "$ENV{ROCM_PATH}")
else()
message(FATAL_ERROR ">> No ROCM_INSTALL_PATH or ROCM_PATH environment variable is set. "
" One of them is required to locate 'Lightning Clang++'")
endif()
#
# --- Path to Clang/LLVM root directory, (ie: /opt/rocm/lib/llvm/) ---
if(DEFINED ENV{ROCM_LLVM_PATH})
set(ROCM_LLVM_BIN_DIR "$ENV{ROCM_LLVM_PATH}/bin")
else()
set(ROCM_LLVM_BIN_DIR "${ROCM_BASE_PATH}/lib/llvm/bin")
endif()
set(ROCM_BIN_DIR "${ROCM_BASE_PATH}/bin")
message(STATUS ">> ROCM_INSTALL_PATH detected: '${ROCM_BASE_PATH}'")
message(STATUS ">> Expecting Clang/LLVM tools in: '${ROCM_LLVM_BIN_DIR}'")
if(NOT IS_DIRECTORY "${ROCM_LLVM_BIN_DIR}")
message(FATAL_ERROR ">> ROCM_LLVM_BIN_DIR is not a valid directory: '${ROCM_LLVM_BIN_DIR}'\n"
" Check ROCM_INSTALL_PATH and the LLVM binary path structure.")
endif()
#
# --- Compilers and Tools ---
# Find Clang C and C++ compilers within the ROCm LLVM binary directory
# NO_DEFAULT_PATH ensures CMake only looks in the HINTS path first for these specific finds.
# REQUIRED will cause CMake to stop with an error if the compiler is not found there.
find_program(CMAKE_C_COMPILER
NAMES clang
HINTS "${ROCM_LLVM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
find_program(CMAKE_CXX_COMPILER
NAMES clang++
HINTS "${ROCM_LLVM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
find_program(AMD_CLANG_CXX_COMPILER
NAMES amdclang++
HINTS "${ROCM_LLVM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
find_program(AMD_HIP_CXX_COMPILER
NAMES hipcc
HINTS "${ROCM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
# --- Verify hipcc/Clang compiler version ---
set(CMAKE_C_COMPILER ${AMD_HIP_CXX_COMPILER})
set(CMAKE_CXX_COMPILER ${AMD_HIP_CXX_COMPILER})
# Minimum required version of Clang
if(CMAKE_CXX_COMPILER)
set(CLANG_COMPILER_MAJOR_VERSION_REQUIRED "19")
set(CLANG_COMPILER_MINOR_VERSION_REQUIRED "0")
set(CLANG_COMPILER_REVISION_VERSION_REQUIRED "0")
set(CLANG_COMPILER_MINIMUM_VERSION_REQUIRED "${CLANG_COMPILER_MAJOR_VERSION_REQUIRED}.${CLANG_COMPILER_MINOR_VERSION_REQUIRED}.${CLANG_COMPILER_REVISION_VERSION_REQUIRED}")
execute_process(
COMMAND ${CMAKE_CXX_COMPILER} -dumpversion
OUTPUT_VARIABLE CLANG_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Check if the version is valid
string(REGEX MATCHALL "[0-9]+" CLANG_COMPILER_VERSION_COMPONENTS "${CLANG_COMPILER_VERSION}")
if(CLANG_COMPILER_VERSION_COMPONENTS)
list(GET CLANG_COMPILER_VERSION_COMPONENTS 0 CLANG_COMPILER_VERSION_MAJOR)
list(GET CLANG_COMPILER_VERSION_COMPONENTS 1 CLANG_COMPILER_VERSION_MINOR)
list(GET CLANG_COMPILER_VERSION_COMPONENTS 2 CLANG_COMPILER_VERSION_REVISION)
set(CLANG_COMPILER_FULL_VERSION "${CLANG_COMPILER_VERSION_MAJOR}.${CLANG_COMPILER_VERSION_MINOR}.${CLANG_COMPILER_VERSION_REVISION}")
##
if(CLANG_COMPILER_FULL_VERSION VERSION_GREATER_EQUAL CLANG_COMPILER_MINIMUM_VERSION_REQUIRED)
set(CLANG_COMPILER_VERSION_RESULT TRUE)
else()
set(CLANG_COMPILER_VERSION_RESULT FALSE)
endif()
if(NOT CLANG_COMPILER_VERSION_RESULT)
message(FATAL_ERROR ">> 'Clang++' compiler v'${CLANG_COMPILER_VERSION}' does not meet the minimum required version 'v${CLANG_COMPILER_MINIMUM_VERSION_REQUIRED}'")
endif()
endif()
else()
message(FATAL_ERROR ">> 'Clang++' compiler not found in ROCM_INSTALL_PATH: '${ROCM_BASE_PATH}'")
endif()
#
# --- Search Behavior ---
# For ROCm, the ROCM_PATH itself is a root for its specific components (headers, libs).
# We add it to CMAKE_FIND_ROOT_PATH so find_package, find_library etc., look there.
# We use list(PREPEND ...) to ensure ROCM_PATH is searched before system paths for relevant items.
list(PREPEND CMAKE_FIND_ROOT_PATH "${ROCM_BASE_PATH}")
list(REMOVE_DUPLICATES CMAKE_FIND_ROOT_PATH)
# Adjust find behavior.
# 'BOTH' allows searching in CMAKE_FIND_ROOT_PATH (ROCm paths) and then system paths.
# This is often suitable for ROCm which overlays on a standard system.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) # Don't look for host programs in ROCM_PATH
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH)
#
# --- Confirmation Message ---
# Note: CMAKE_C_COMPILER_VERSION and CMAKE_CXX_COMPILER_VERSION are populated, and
# the needed compiler flags are defined by 'build_utils.cmake', only *after* the
# 'project()' command and language enablement, so they won't be available here.
#
# Set a cached variable to indicate this toolchain is used
set(ROCM_CLANG_TOOLCHAIN_USED TRUE CACHE BOOL "Indicates that the ROCm 'Lightning Clang++' toolchain is in use")
set(IS_LIGHTNING_CLANG_DEFAULT_COMPILER TRUE CACHE BOOL "build_utils.cmake: Indicates that 'Lightning Clang++' is the default compiler")
set(CMAKE_C_COMPILER "${CMAKE_C_COMPILER}" CACHE PATH "C compiler")
set(CMAKE_CXX_COMPILER "${CMAKE_CXX_COMPILER}" CACHE PATH "C++ compiler")
message(STATUS ">> Using ROCm 'Lightning Clang++' Toolchain: ${CMAKE_CURRENT_LIST_FILE}")
message(STATUS " >> C Compiler: ${CMAKE_C_COMPILER}")
message(STATUS " >> C++ Compiler: ${CMAKE_CXX_COMPILER}")
/*
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
/*
CMake auto-generated file: Do not edit it.
*/
#define TRANSFERBENCH_CLIENT_VERSION "@TRANSFERBENCH_CLIENT_TARGET_VERSION@"
/*
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
/*
CMake auto-generated file: Do not edit it.
*/
#define TRANSFERBENCH_GIT_BRANCH "@GIT_BRANCH@"
#define TRANSFERBENCH_GIT_COMMIT "@GIT_COMMIT_HASH_LONG@"
#define TRANSFERBENCH_HEADER_VERSION "@TRANSFERBENCH_HEADER_VERSION@"
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
set(rocm_bin "$ENV{ROCM_PATH}/bin")
else()
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
set(rocm_bin "/opt/rocm/bin")
endif()
if (NOT DEFINED ENV{CXX})
if(EXISTS "${rocm_bin}/amdclang++")
set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
else()
if(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
set(rocm_bin "${ROCM_PATH}/llvm/bin")
set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
set(rocm_bin "${ROCM_PATH}/llvm/bin")
set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE PATH "Path to the C++ compiler")
endif()
endif()
else()
set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
endif()
if (NOT DEFINED ENV{CXXFLAGS})
set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
endif()
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Setting build type to 'Release' as none was specified.")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
endif()
# MIT License
#
# Copyright (c) 2023-25 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
# --- Header-Only vs Regular Compiled Library ---
#
## Header-Only Library:
# A header-only library is a library that consists entirely of header files (.h or .hpp)
# - It does not require separate compilation into binary files (.lib, .a, .so, etc)
#
# - Simplicity and Ease of Distribution:
# - No need for separate compilation or linking steps
# - Just include the headers
# - Great for header-only utilities or template-heavy libraries
#
# - Heavy Use of Templates or Inline Functions:
#    - Templates must be defined in headers, so template libraries are often header-only (e.g., Eigen, Catch2)
# - Inline functions benefit from this as well for potential compiler optimizations
#
# - Small to Medium Size Libraries:
# - Ideal when the codebase is not too large, avoiding long compile times
#
# - Performance-Critical Components:
# - Enables the compiler to inline aggressively across translation units
#
# - Cross-Platform or Header-Only Dependencies:
# - Avoids needing to build for multiple platforms or compilers
#
## Regular Compiled Library:
# A regular compiled library is a library that is compiled into binary files (.lib, .a, .so, etc)
# - It requires separate compilation and linking steps
#
# - Large Codebase / Long Compile Times
# - Avoid recompiling all code that includes the library headers
#
# - Improved Encapsulation
# - Hides implementation details, reduces header bloat, and maintains a clean API
# - Binary distribution keeps proprietary code hidden
#
# - ABI Stability & Compatibility
# - Enables decoupling user code from library internals
# - Users don’t need to recompile their code when internals of the library change (if ABI remains stable)
#
# - Reduced Binary Size:
# - Prevents code bloat due to duplication in each translation unit
#
# - Dynamic Loading / Plugin Systems
#    - Necessary if you want runtime dynamic linking (e.g., via dlopen)
#
# - Separate Build and Test Pipelines
# - Easier to build and test the library independently from the application
#
## Hybrid Approach:
# - Public API in headers, and compiled internals:
# - Templates or inline functions stay in headers
#   - Logic-heavy or stable parts go into .so or .a files (see the commented sketch below)
#
#
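# A minimal sketch of the hybrid approach (illustrative only; the target and
# file names below are hypothetical, not the ones used by this project):
#
#   add_library(mylib_impl STATIC src/mylib_impl.cpp)   # compiled internals
#   target_include_directories(mylib_impl PUBLIC include)
#
#   add_library(mylib INTERFACE)                        # header-only public API
#   target_include_directories(mylib INTERFACE include)
#   target_link_libraries(mylib INTERFACE mylib_impl)   # consumers get both
#
#   # A consumer then links only the umbrella target:
#   # target_link_libraries(app PRIVATE mylib)
#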
cmake_minimum_required(VERSION 3.25 FATAL_ERROR)
project(${AMD_PROJECT_LIBRARY_NAME}
VERSION ${PROJECT_TARGET_VERSION_TEXT}
DESCRIPTION "TransferBench Engine Library"
LANGUAGES CXX HIP
)
# Load CMake modules
#==================================================================================================
set(AMD_PROJECT_CMAKE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake)
set(AMD_PROJECT_CMAKE_MODULES_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake/modules)
list(APPEND CMAKE_MODULE_PATH "${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}")
# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${AMD_PROJECT_CMAKE_DIRECTORY}/rocm_clang_toolchain.cmake")
message(STATUS ">> CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
#
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(${AMD_PROJECT_CMAKE_DIRECTORY}/build_utils.cmake) # setup_default_compiler_flags
include(${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}/Dependencies.cmake) # rocm-cmake, rocm_local_targets
#
set (TRANSFERBENCH_CLIENT_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/client)
set (TRANSFERBENCH_TBENGINE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
set (TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY ${TRANSFERBENCH_TBENGINE_DIRECTORY}/include)
set (TRANSFERBENCH_TBENGINE_SRC_DIRECTORY ${TRANSFERBENCH_TBENGINE_DIRECTORY}/src)
#
# Default GPU architectures to build
set(BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST
gfx906
gfx908
gfx90a
gfx942
gfx950
gfx1030
gfx1100
gfx1101
gfx1102
gfx1150
gfx1151
gfx1200
gfx1201
)
#
# Build only for local GPU architecture
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF)
if (TRANSFERBENCH_LOCAL_GPU_TARGET_ONLY)
message(STATUS " >> Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST)
else()
message(WARNING " >> Unable to determine local GPU targets. Falling back to default GPUs.")
endif()
endif()
#
# Determine which GPU architectures to build for
set(TRANSFERBENCH_GPU_TARGETS "${BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST}" CACHE STRING "GPU targets to build for (defaults to the built-in GPU list when not specified).")
#
# Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids)
message(STATUS ">> Checking for ROCm support for GPU targets: " "${TRANSFERBENCH_GPU_TARGETS}")
rocm_check_target_ids(TRANSFERBENCH_SUPPORTED_GPUS TARGETS ${TRANSFERBENCH_GPU_TARGETS})
else()
message(WARNING ">> Unable to check for supported GPU targets. Falling back to default GPUs.")
set(TRANSFERBENCH_SUPPORTED_GPUS ${BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST})
endif()
set(TRANSFERBENCH_COMPILING_TARGETS "${TRANSFERBENCH_SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.")
message(STATUS ">> Building for: ${TRANSFERBENCH_COMPILING_TARGETS}")
foreach(target ${TRANSFERBENCH_COMPILING_TARGETS})
list(APPEND STATIC_LINK_FLAGS --offload-arch=${target})
endforeach()
list(JOIN STATIC_LINK_FLAGS " " FLAGS_STR)
#
# NOTE: Reload rocm-cmake so it picks up the desired GPU_TARGETS instead of the defaults
include(${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}/Dependencies.cmake)
#
get_rocm_install_path(ROCM_PATH)
#
# Set CMAKE flags
if (NOT DEFINED CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 20)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# --- HIP Package ---
# Check for HIP
#
# Add ROCM_BASE_PATH to CMake search paths for finding HIP / HSA
list(APPEND CMAKE_PREFIX_PATH
${ROCM_PATH}
${ROCM_PATH}/llvm
${ROCM_PATH}/hip
${ROCM_PATH}/hsa
/opt/rocm
/opt/rocm/llvm
/opt/rocm/hip
/opt/rocm/hsa
)
#
# Check for HIP
find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH})
developer_status_message("DEVEL" " >> HIP Include Dirs: ${hip_INCLUDE_DIRS} ...")
developer_status_message("DEVEL" " >> HIP Libraries: ${hip_LIBRARIES} ...")
#
# Ensuring that CXX compiler meets expectations
if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")))
message(FATAL_ERROR ">> On ROCm platform CMAKE_CXX_COMPILER must be 'hipcc' or 'HIP-aware Clang'.")
endif()
#
# Check for Threads
find_package(Threads REQUIRED)
set(THREADS_PREFER_PTHREAD_FLAG ON)
#
# Check for numa support
set(WAS_NUMA_FOUND OFF)
set(NUMA_LIBRARY_NAME "numa")
find_library(NUMA_LIBRARY ${NUMA_LIBRARY_NAME})
find_path(NUMA_INCLUDE_DIR numa.h)
if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR)
set(WAS_NUMA_FOUND ON)
add_library(${NUMA_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${NUMA_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}"
IMPORTED_LOCATION "${NUMA_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}"
)
endif()
developer_status_message("DEVEL" "NUMA_INCLUDE_DIR: ${NUMA_INCLUDE_DIR} ...")
developer_status_message("DEVEL" "NUMA_LIBRARY_NAME: ${NUMA_LIBRARY_NAME} ...")
developer_status_message("DEVEL" "NUMA_LIBRARY: ${NUMA_LIBRARY} ...")
#
# Check for hsa support: 'libhsa-runtime64.so' (libhsa-runtime-dev package)
# ${ROCM_PATH}/include/hsa/hsa.h
find_path(HIP_ROOT_DIR
NAMES
"include/hip/hip_runtime.h"
HINTS
${ROCM_PATH}
/opt/rocm/
)
if(NOT HIP_ROOT_DIR)
message(FATAL_ERROR ">> HIP_ROOT_DIR 'hip_runtime.h' not found. Ensure ROCm is properly set up ...")
endif()
set(HIP_INCLUDE_ROOT_DIR "${HIP_ROOT_DIR}/include")
set(HIP_LIBRARY_ROOT_DIR "${HIP_ROOT_DIR}/lib")
developer_status_message("DEVEL" "HIP_ROOT_DIR: ${HIP_ROOT_DIR} ...")
developer_status_message("DEVEL" "HIP_INCLUDE_ROOT_DIR: ${HIP_INCLUDE_ROOT_DIR} ...")
developer_status_message("DEVEL" "HIP_LIBRARY_ROOT_DIR: ${HIP_LIBRARY_ROOT_DIR} ...")
set(WAS_HSA_FOUND OFF)
set(HSA_LIBRARY_NAME "hsa-runtime64")
find_library(HSA_LIBRARY ${HSA_LIBRARY_NAME} PATHS ${HIP_LIBRARY_ROOT_DIR} ${ROCM_PATH})
find_path(HSA_INCLUDE_DIR "hsa/hsa.h" PATHS ${HIP_INCLUDE_ROOT_DIR} NO_DEFAULT_PATH)
if(HSA_LIBRARY AND HSA_INCLUDE_DIR)
set(WAS_HSA_FOUND ON)
add_library(${HSA_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${HSA_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}"
IMPORTED_LOCATION "${HSA_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}"
)
endif()
developer_status_message("DEVEL" "HSA_INCLUDE_DIR: ${HSA_INCLUDE_DIR} ...")
developer_status_message("DEVEL" "HSA_LIBRARY_NAME: ${HSA_LIBRARY_NAME} ...")
developer_status_message("DEVEL" "HSA_LIBRARY: ${HSA_LIBRARY} ...")
#
# Check for hip support: 'libamdhip64.so' (libamdhip64-dev package)
# HIP_LIBRARY will be set by find_library(); the "hip::host;hip::device" targets come from find_package(hip)
# ${ROCM_PATH}/include/hip/hip_ext.h
set(WAS_HIP_FOUND OFF)
set(HIP_LIBRARY_NAME "amdhip64")
find_library(HIP_LIBRARY ${HIP_LIBRARY_NAME} PATHS ${HIP_LIBRARY_ROOT_DIR} ${ROCM_PATH})
find_path(HIP_INCLUDE_DIR "hip/hip_ext.h" PATHS ${HIP_INCLUDE_ROOT_DIR} NO_DEFAULT_PATH)
if(NOT HIP_INCLUDE_DIR)
message(FATAL_ERROR ">> HIP_INCLUDE_DIR 'hip_ext.h' not found. Ensure ROCm is properly set up ...")
endif()
if(HIP_LIBRARY AND HIP_INCLUDE_DIR)
set(WAS_HIP_FOUND ON)
add_library(${HIP_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${HIP_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${HIP_INCLUDE_DIR}"
IMPORTED_LOCATION "${HIP_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_INCLUDE_DIR}"
)
endif()
developer_status_message("DEVEL" "HIP_INCLUDE_DIR: ${HIP_INCLUDE_DIR} ...")
developer_status_message("DEVEL" "HIP_LIBRARY_NAME: ${HIP_LIBRARY_NAME} ...")
developer_status_message("DEVEL" "HIP_LIBRARY: ${HIP_LIBRARY} ...")
#
# Library/interface names
set(AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_header")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME "${AMD_PROJECT_PACKAGE_NAME}_engine")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS "${AMD_PROJECT_PACKAGE_NAME}::engine")
set(AMD_PROJECT_STATIC_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_static")
set(AMD_PROJECT_SHARED_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_shared")
set(AMD_PROJECT_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_object_library")
set(AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_hip_object_library")
set(AMD_PROJECT_CLIENT_NAME "${AMD_PROJECT_NAME}")
#
# Check for infiniband verbs support
set(WAS_IBVERBS_FOUND OFF)
if(DEFINED ENV{DISABLE_NIC_EXEC} AND "$ENV{DISABLE_NIC_EXEC}" STREQUAL "1")
message(STATUS ">> Disabling 'NIC Executor' support. 'DISABLE_NIC_EXEC' was enabled ...")
elseif(NOT TRANSFERBENCH_ENABLE_NIC_EXEC)
message(STATUS ">> For CMake builds, the NIC executor requires explicit opt-in via the CMake flag '-DTRANSFERBENCH_ENABLE_NIC_EXEC=1|ON' ...")
message(STATUS ">> Disabling 'NIC Executor' support ...")
else()
set(IBVERBS_LIBRARY_NAME "ibverbs")
find_library(IBVERBS_LIBRARY ${IBVERBS_LIBRARY_NAME})
find_path(IBVERBS_INCLUDE_DIR "infiniband/verbs.h")
if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
set(WAS_IBVERBS_FOUND ON)
add_library(${IBVERBS_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${IBVERBS_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}"
IMPORTED_LOCATION "${IBVERBS_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}"
)
message(STATUS ">> Building with 'NIC executor' support. Set 'DISABLE_NIC_EXEC=1' to disable")
else()
if (NOT IBVERBS_LIBRARY)
message(WARNING ">> 'IBVerbs' library not found ...")
endif()
if (NOT IBVERBS_INCLUDE_DIR)
message(WARNING ">> 'infiniband/verbs.h' not found ...")
endif()
message(WARNING "Building without 'NIC executor' support. To use the TransferBench RDMA executor, \n"
" verify that your system has NICs, that the NIC drivers are installed, and that 'libibverbs-dev' is installed")
endif()
endif()
# --- Get TB commit and branch ---
# That's useful for tracking which version of the code was used to build the library
if(DEFINED TRANSFERBENCH_COMMIT_HASH_LONG AND DEFINED TRANSFERBENCH_COMMIT_BRANCH)
set(GIT_COMMIT_HASH_LONG "${TRANSFERBENCH_COMMIT_HASH_LONG}")
set(GIT_BRANCH "${TRANSFERBENCH_COMMIT_BRANCH}")
else()
# Get info about the current branch
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_BRANCH
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT_BRANCH
ERROR_QUIET
)
# Get hash log info for the current branch
execute_process(
COMMAND git log -1 --format=%H
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_COMMIT_HASH_LONG
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT_HASH_LONG
ERROR_QUIET
)
endif()
if(GIT_COMMIT_HASH_LONG STREQUAL "" OR GIT_BRANCH STREQUAL "")
message(WARNING "[[ No commit hash/branch were found. ]]")
else()
set(TRANSFERBENCH_HEADER_VERSION ${PROJECT_TARGET_VERSION_TEXT})
developer_status_message("DEVEL" ">> Setting TransferBench commit/branch info in '${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}/TransferBench.hpp' ...")
developer_status_message("DEVEL" " >> GIT_BRANCH=\"${GIT_BRANCH}\"")
developer_status_message("DEVEL" " >> GIT_COMMIT_HASH_LONG=\"${GIT_COMMIT_HASH_LONG}\"")
developer_status_message("DEVEL" " >> TRANSFERBENCH_HEADER_VERSION=\"${TRANSFERBENCH_HEADER_VERSION}\"")
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/)
configure_file(
${AMD_PROJECT_CMAKE_DIRECTORY}/tbengine_version.hpp.in
${CMAKE_BINARY_DIR}/include/tbengine_version.hpp
@ONLY
)
endif()
# --- End of Get TB commit and branch ---
#
# Header/Source files
set(TRANSFERBENCH_ENGINE_HEADER_SOURCES
${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}/TransferBench.hpp
)
set(TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES
${TRANSFERBENCH_TBENGINE_SRC_DIRECTORY}/TransferBench.cpp
)
set(TRANSFERBENCH_ENGINE_ALL_SOURCES
${TRANSFERBENCH_ENGINE_HEADER_SOURCES}
${TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES}
)
#
# --- Object libraries (shared across targets to avoid recompilation) ---
# Common public interface target
add_library(${TRANSFERBENCH_INTERFACE_TARGET_NAME} INTERFACE)
add_library(${TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS} ALIAS ${TRANSFERBENCH_INTERFACE_TARGET_NAME})
target_include_directories(${TRANSFERBENCH_INTERFACE_TARGET_NAME}
INTERFACE
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
#
# For dynamic linking: HIP object library (for use in other targets)
add_library(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
OBJECT
${TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES}
)
set_target_properties(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
PROPERTIES
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include/>
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
setup_default_compiler_flags(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME})
#
# For static linking: Standard object library (for use in other targets)
add_library(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
OBJECT
${TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES}
)
set_target_properties(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PROPERTIES
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${NUMA_INCLUDE_DIR}>
$<BUILD_INTERFACE:${HIP_INCLUDE_ROOT_DIR}>
$<BUILD_INTERFACE:${HIP_INCLUDE_DIR}>
$<BUILD_INTERFACE:${HSA_INCLUDE_DIR}>
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include/>
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
setup_default_compiler_flags(${AMD_PROJECT_OBJECT_LIBRARY_NAME})
target_link_libraries(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PUBLIC
${NUMA_LIBRARY}
${HIP_LIBRARY}
${HSA_LIBRARY}
hip::host
Threads::Threads
dl
)
if(WAS_IBVERBS_FOUND)
target_include_directories(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PRIVATE
$<BUILD_INTERFACE:${IBVERBS_INCLUDE_DIR}>
)
target_link_libraries(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PRIVATE
${IBVERBS_LIBRARY}
)
target_compile_definitions(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PRIVATE
NIC_EXEC_ENABLED
)
endif()
set_target_properties(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PROPERTIES
POSITION_INDEPENDENT_CODE ON
EXCLUDE_FROM_ALL ON
EXCLUDE_FROM_DEFAULT_BUILD ON
)
# ---
#
# --- Different build types ---
if(TRANSFERBENCH_ENGINE_SHARED)
message(STATUS ">> Building TransferBench 'shared' library ...")
add_library(${AMD_PROJECT_SHARED_LIBRARY_NAME} SHARED
$<TARGET_OBJECTS:${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}>
)
developer_status_message("DEVEL" " >> PROJECT_TARGET_BINARY_VERSION: '${PROJECT_TARGET_BINARY_VERSION}' ")
developer_status_message("DEVEL" " >> PROJECT_TARGET_VERSION_TEXT: '${PROJECT_TARGET_VERSION_TEXT}' ")
developer_status_message("DEVEL" " >> PROJECT_TARGET_VERSION: '${PROJECT_TARGET_VERSION}' ")
developer_status_message("DEVEL" " >> PROJECT_MAJOR.MINOR_VERSION: '${AMD_PROJECT_VERSION_MAJOR}'.'${AMD_PROJECT_VERSION_MINOR}' ")
set_target_properties(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PROPERTIES
OUTPUT_NAME ${AMD_PROJECT_LIBRARY_NAME}
VERSION ${PROJECT_TARGET_VERSION}
SOVERSION ${AMD_PROJECT_VERSION_MAJOR}
LINKER_LANGUAGE CXX
CUDA_RESOLVE_DEVICE_SYMBOLS ON
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
target_link_libraries(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PUBLIC
${NUMA_LIBRARY}
${HSA_LIBRARY}
Threads::Threads
dl
PRIVATE
hip::device
)
target_compile_definitions(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PUBLIC
TRANSFERBENCH_SHARED
)
# Shared library specific compile options
setup_default_compiler_flags(${AMD_PROJECT_SHARED_LIBRARY_NAME})
add_common_flag("-fgpu-rdc" ${AMD_PROJECT_SHARED_LIBRARY_NAME})
# Install shared library
install(TARGETS ${AMD_PROJECT_SHARED_LIBRARY_NAME}
EXPORT ${AMD_PROJECT_NAME}Targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)
endif()
if(TRANSFERBENCH_ENGINE_STATIC)
message(STATUS ">> Building TransferBench 'static' library ...")
add_library(${AMD_PROJECT_STATIC_LIBRARY_NAME} STATIC
$<TARGET_OBJECTS:${AMD_PROJECT_OBJECT_LIBRARY_NAME}>
)
set_target_properties(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PROPERTIES
OUTPUT_NAME ${AMD_PROJECT_LIBRARY_NAME}
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
target_link_libraries(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PUBLIC
${NUMA_LIBRARY}
${HSA_LIBRARY}
Threads::Threads
hip::host
dl
)
target_compile_definitions(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PUBLIC
TRANSFERBENCH_STATIC
)
# Static library specific compile options
setup_default_compiler_flags(${AMD_PROJECT_STATIC_LIBRARY_NAME})
add_common_flag("-fgpu-rdc" ${AMD_PROJECT_STATIC_LIBRARY_NAME})
# Install static library
install(TARGETS ${AMD_PROJECT_STATIC_LIBRARY_NAME}
EXPORT ${AMD_PROJECT_NAME}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(TRANSFERBENCH_ENGINE_HEADER_ONLY)
message(STATUS ">> Building TransferBench 'header-only' library ...")
add_library(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME} INTERFACE)
set_target_properties(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
PROPERTIES
INTERFACE_CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
target_include_directories(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
INTERFACE
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
target_link_libraries(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
INTERFACE
${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
${NUMA_LIBRARY}
${HIP_LIBRARY}
${HSA_LIBRARY}
hip::device
Threads::Threads
dl
)
# Install header-only library
install(
DIRECTORY ${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}
DESTINATION include
FILES_MATCHING
PATTERN "*.hpp"
)
endif()
#
# Common install
install(
DIRECTORY ${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
if((NOT TRANSFERBENCH_ENGINE_HEADER_ONLY) AND (TRANSFERBENCH_ENGINE_STATIC OR TRANSFERBENCH_ENGINE_SHARED))
install(EXPORT ${AMD_PROJECT_NAME}Targets
FILE ${AMD_PROJECT_NAME}Targets.cmake
NAMESPACE "${AMD_PROJECT_PACKAGE_NAME}::"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${AMD_PROJECT_PACKAGE_NAME}
)
endif()
## End of CMakeLists.txt
/*
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/// @cond
#pragma once
#include <tbengine_version.hpp>
#include <numa.h> // If not found, try installing libnuma-dev (e.g., apt-get install libnuma-dev)
#include <numaif.h>
#include <stdarg.h>
#include <unistd.h>
#include <algorithm>
#include <cstring>
#include <future>
#include <map>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <thread>
#include <vector>
#ifdef NIC_EXEC_ENABLED
#include <arpa/inet.h>
#include <fcntl.h>
#include <infiniband/verbs.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <filesystem>
#include <fstream>
#endif
#if defined(__NVCC__)
#include <cuda_runtime.h>
#else
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#endif
/// @endcond
/*
* Note: If for any reason we have something that needs a header-only implementation, it can
* be added here.
*/
#if defined(TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS)
#endif //-- TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS
namespace TransferBench
{
using std::map;
using std::pair;
using std::set;
using std::vector;
// constexpr char VERSION[] = "1.64";
static const auto TB_GIT_BRANCH = std::string_view(TRANSFERBENCH_GIT_BRANCH);
static const auto TB_GIT_COMMIT = std::string_view(TRANSFERBENCH_GIT_COMMIT);
static constexpr auto TB_HEADER_VERSION = std::string_view(TRANSFERBENCH_HEADER_VERSION);
static constexpr auto TB_UNKNOWN_VERSION = std::string_view("Unknown");
/**
* Enumeration of supported Executor types
*
* @note The Executor is the device used to perform a Transfer
*/
enum ExeType
{
EXE_CPU = 0, ///< CPU executor (subExecutor = CPU thread)
EXE_GPU_GFX = 1, ///< GPU kernel-based executor (subExecutor = threadblock/CU)
EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported)
EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair)
EXE_NIC_NEAREST = 4 ///< NIC RDMA nearest executor (subExecutor = queue pair)
};
char const ExeTypeStr[6] = "CGDIN";
inline bool IsCpuExeType(ExeType e) { return e == EXE_CPU; }
inline bool IsGpuExeType(ExeType e) { return e == EXE_GPU_GFX || e == EXE_GPU_DMA; }
inline bool IsNicExeType(ExeType e) { return e == EXE_NIC || e == EXE_NIC_NEAREST; }
/**
* A ExeDevice defines a specific Executor
*/
struct ExeDevice
{
ExeType exeType; ///< Executor type
int32_t exeIndex; ///< Executor index
bool operator<(ExeDevice const& other) const
{
return (exeType < other.exeType) ||
(exeType == other.exeType && exeIndex < other.exeIndex);
}
};
/**
* Enumeration of supported memory types
*
* @note These are possible types of memory to be used as sources/destinations
*/
enum MemType
{
MEM_CPU = 0, ///< Coarse-grained pinned CPU memory
MEM_GPU = 1, ///< Coarse-grained global GPU memory
MEM_CPU_FINE = 2, ///< Fine-grained pinned CPU memory
MEM_GPU_FINE = 3, ///< Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, ///< Unpinned CPU memory
MEM_NULL = 5, ///< NULL memory - used for empty
MEM_MANAGED = 6, ///< Managed memory
MEM_CPU_CLOSEST = 7, ///< Coarse-grained pinned CPU memory indexed by closest GPU
};
char const MemTypeStr[9] = "CGBFUNMP";
inline bool IsCpuMemType(MemType m)
{
return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED || m == MEM_CPU_CLOSEST);
}
inline bool IsGpuMemType(MemType m)
{
return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED);
}
/**
* A MemDevice indicates a memory type on a specific device
*/
struct MemDevice
{
MemType memType; ///< Memory type
int32_t memIndex; ///< Device index
bool operator<(MemDevice const& other) const
{
return (memType < other.memType) ||
(memType == other.memType && memIndex < other.memIndex);
}
};
/**
* A Transfer adds together data from zero or more sources then writes the sum to zero or more
* destinations
*/
struct Transfer
{
size_t numBytes = 0; ///< Number of bytes to Transfer
vector<MemDevice> srcs = {}; ///< List of source memory devices
vector<MemDevice> dsts = {}; ///< List of destination memory devices
ExeDevice exeDevice = {}; ///< Executor to use
int32_t exeSubIndex = -1; ///< Executor subindex
int numSubExecs = 0; ///< Number of subExecutors to use for this Transfer
};
/**
* General options
*/
struct GeneralOptions
{
int numIterations = 10; ///< Number of timed iterations to perform. If negative, run for
///< -numIterations seconds instead
int numSubIterations = 1; ///< Number of sub-iterations per iteration
int numWarmups = 3; ///< Number of un-timed warmup iterations to perform
int recordPerIteration = 0; ///< Record per-iteration timing information
int useInteractive = 0; ///< Pause for user-input before starting transfer loop
};
/**
* Data options
*/
struct DataOptions
{
int alwaysValidate = 0; ///< Validate after each iteration instead of once at end
int blockBytes = 256; ///< Each subexecutor works on a multiple of this many bytes
int byteOffset = 0; ///< Byte-offset for memory allocations
vector<float> fillPattern = {}; ///< Pattern of floats used to fill source data
///< Customized data patterns (overrides fillPattern if non-empty)
vector<int> fillCompress = {};
int validateDirect = 0; ///< Validate GPU results directly instead of copying to host
int validateSource = 0; ///< Validate src GPU memory immediately after preparation
};
/**
* GFX Executor options
*/
struct GfxOptions
{
int blockOrder = 0; ///< Determines how threadblocks are ordered (0=sequential,
///< 1=interleaved, 2=random)
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
vector<vector<int>> prefXccTable = {};
int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
int useMultiStream = 0; ///< Use multiple streams for GFX
int useSingleTeam = 0; ///< Team all subExecutors across the data array
int waveOrder = 0; ///< GFX-kernel wavefront ordering
int wordSize = 4; ///< GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1)
};
/**
* DMA Executor options
*/
struct DmaOptions
{
int useHipEvents = 1; ///< Use HIP events for timing DMA Executor
int useHsaCopy = 0; ///< Use HSA copy instead of HIP copy to perform DMA
};
/**
* NIC Executor options
*/
struct NicOptions
{
vector<int> closestNics = {}; ///< Overrides the auto-detected closest NIC per GPU
int ibGidIndex = -1; ///< GID Index for RoCE NICs (-1 is auto)
uint8_t ibPort = 1; ///< NIC port number to be used
int ipAddressFamily = 4; ///< 4=IPv4, 6=IPv6 (used for auto GID detection)
int maxRecvWorkReq = 16; ///< Maximum number of recv work requests per queue pair
int maxSendWorkReq = 16; ///< Maximum number of send work requests per queue pair
int queueSize = 100; ///< Completion queue size
int roceVersion = 2; ///< RoCE version (used for auto GID detection)
int useRelaxedOrder = 1; ///< Use relaxed ordering
int useNuma = 0; ///< Switch to closest numa thread for execution
};
/**
* Configuration options for performing Transfers
*/
struct ConfigOptions
{
GeneralOptions general; ///< General options
DataOptions data; ///< Data options
GfxOptions gfx; ///< GFX executor options
DmaOptions dma; ///< DMA executor options
NicOptions nic; ///< NIC executor options
};
/**
* Enumeration of possible error types
*/
enum ErrType
{
ERR_NONE = 0, ///< No errors
ERR_WARN = 1, ///< Warning - results may not be accurate
ERR_FATAL = 2, ///< Fatal error - results are invalid
};
/**
* Enumeration of GID priority
*
* @note These are the GID types ordered in priority from lowest (0) to highest
*/
enum GidPriority
{
UNKNOWN = -1, ///< Default
ROCEV1_LINK_LOCAL = 0, ///< RoCEv1 Link-local
ROCEV2_LINK_LOCAL = 1, ///< RoCEv2 Link-local fe80::/10
ROCEV1_IPV6 = 2, ///< RoCEv1 IPv6
ROCEV2_IPV6 = 3, ///< RoCEv2 IPv6
ROCEV1_IPV4 = 4, ///< RoCEv1 IPv4-mapped IPv6
ROCEV2_IPV4 = 5, ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x
};
const char* GidPriorityStr[] = {"RoCEv1 Link-local",
"RoCEv2 Link-local",
"RoCEv1 IPv6",
"RoCEv2 IPv6",
"RoCEv1 IPv4-mapped IPv6",
"RoCEv2 IPv4-mapped IPv6"};
/**
* ErrResult consists of error type and error message
*/
struct ErrResult
{
ErrType errType; ///< Error type
std::string errMsg; ///< Error details
ErrResult() = default;
// clang-format off
#if defined(__NVCC__)
ErrResult(cudaError_t err);
#else
ErrResult(hipError_t err);
ErrResult(hsa_status_t err);
#endif
ErrResult(ErrType err);
ErrResult(ErrType errType, const char* format, ...);
// clang-format on
};
/**
* Results for a single Executor
*/
struct ExeResult
{
size_t numBytes; ///< Total bytes transferred by this Executor
double avgDurationMsec; ///< Averaged duration for all the Transfers for this Executor
double avgBandwidthGbPerSec; ///< Average bandwidth for this Executor
double sumBandwidthGbPerSec; ///< Naive sum of individual Transfer average bandwidths
vector<int> transferIdx; ///< Indices of Transfers this Executor executed
};
/**
* Results for a single Transfer
*/
struct TransferResult
{
size_t numBytes; ///< Number of bytes transferred by this Transfer
///< Duration for this Transfer, averaged over all timed iterations
double avgDurationMsec;
double avgBandwidthGbPerSec; ///< Bandwidth for this Transfer based on averaged duration
// Only filled in if recordPerIteration = 1
vector<double> perIterMsec; ///< Duration for each individual iteration
vector<set<pair<int, int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
ExeDevice exeDevice; ///< Tracks which executor performed this Transfer (e.g. for
///< EXE_NIC_NEAREST)
ExeDevice exeDstDevice; ///< Tracks actual destination executor (only valid for
///< EXE_NIC/EXE_NIC_NEAREST)
};
/**
* TestResults contain timing results for a set of Transfers as a group as well as per Executor and
* per Transfer timing information
*/
struct TestResults
{
int numTimedIterations; ///< Number of iterations executed
size_t totalBytesTransferred; ///< Total bytes transferred per iteration
double avgTotalDurationMsec; ///< Wall-time (msec) to finish all Transfers (averaged
///< across all timed iterations)
double avgTotalBandwidthGbPerSec; ///< Bandwidth based on all Transfers and average wall
///< time
double overheadMsec; ///< Difference between total wall time and slowest executor
map<ExeDevice, ExeResult> exeResults; ///< Per Executor results
vector<TransferResult> tfrResults; ///< Per Transfer results
vector<ErrResult> errResults; ///< List of any errors/warnings that occurred
};
/**
* Run a set of Transfers
*
* @param[in] config Configuration options
* @param[in] transfers Set of Transfers to execute
* @param[out] results Timing results
* @returns true if and only if Transfers were run successfully without any fatal errors
*/
bool RunTransfers(ConfigOptions const& config,
vector<Transfer> const& transfers,
TestResults& results);
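/*
Minimal usage sketch (illustrative only; the 64 MiB size, device indices, and
CU count below are assumptions chosen for the example, not library defaults):

ConfigOptions cfg; // default configuration

Transfer t;
t.numBytes = 64 * 1024 * 1024; // 64 MiB per iteration
t.srcs = {{MEM_GPU, 0}}; // read from GPU 0 device memory
t.dsts = {{MEM_GPU, 1}}; // write to GPU 1 device memory
t.exeDevice = {EXE_GPU_GFX, 0}; // GFX kernel launched on GPU 0
t.exeSubIndex = -1; // no specific XCC requested
t.numSubExecs = 8; // use 8 CUs

TestResults results;
if (TransferBench::RunTransfers(cfg, {t}, results)) {
printf("%.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
}
*/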
/**
* Enumeration of implementation attributes
*/
enum IntAttribute
{
ATR_GFX_MAX_BLOCKSIZE, ///< Maximum blocksize for GFX executor
ATR_GFX_MAX_UNROLL, ///< Maximum unroll factor for GFX executor
};
enum StrAttribute
{
ATR_SRC_PREP_DESCRIPTION ///< Description of how source memory is prepared
};
/**
* Query attributes (integer)
*
* @note This allows querying of implementation information such as limits
*
* @param[in] attribute Attribute to query
* @returns Value of the attribute
*/
int GetIntAttribute(IntAttribute attribute);
/**
* Query attributes (string)
*
* @note This allows querying of implementation details such as limits
*
* @param[in] attribute Attribute to query
* @returns Value of the attribute
*/
std::string GetStrAttribute(StrAttribute attribute);
/**
* Returns the number of available Executors
*
* @param[in] exeType Executor type to query
* @returns Number of detected Executors of exeType
*/
int GetNumExecutors(ExeType exeType);
/**
* Returns the number of possible Executor subindices
*
* @note For CPU, this is 0
* @note For GFX, this refers to the number of XCDs
* @note For DMA, this refers to the number of DMA engines
*
* @param[in] exeDevice The specific Executor to query
* @returns Number of detected executor subindices
*/
int GetNumExecutorSubIndices(ExeDevice exeDevice);
/**
* Returns number of subExecutors for a given ExeDevice
*
* @param[in] exeDevice The specific Executor to query
* @returns Number of detected subExecutors for the given ExeDevice
*/
int GetNumSubExecutors(ExeDevice exeDevice);
/**
* Returns the index of the NUMA node closest to the given GPU
*
* @param[in] gpuIndex Index of the GPU to query
* @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToGpu(int gpuIndex);
/**
* Returns the index of the NUMA node closest to the given NIC
*
* @param[in] nicIndex Index of the NIC to query
* @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToNic(int nicIndex);
/**
* Returns the index of the NIC closest to the given GPU
*
* @param[in] gpuIndex Index of the GPU to query
* @note This function is applicable when the IBV/RDMA executor is available
* @returns IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect
*/
int GetClosestNicToGpu(int gpuIndex);
/**
* Helper function to parse a line containing Transfers into a vector of Transfers
*
* @param[in] str String containing description of Transfers
* @param[out] transfers List of Transfers described by 'str'
* @returns Information about any error that may have occurred
*/
ErrResult ParseTransfers(std::string str, std::vector<Transfer>& transfers);
/**
* Helper function to get the built-in version
*
* @returns The built-in TransferBench version as a string
*/
auto GetTransferBenchVersion() -> const std::string;
/**
* Helper function to get branch information
*
* @returns The TransferBench branch information as a string
*/
auto GetTransferBenchBranch() -> const std::string;
/**
* Helper function to get the built-in git commit
*
* @returns The built-in TransferBench git commit hash as a string
*/
auto GetTransferBenchCommitHash([[maybe_unused]] bool is_long_commit = true) -> const std::string;
}; // namespace TransferBench
//==========================================================================================
// End of TransferBench API
//==========================================================================================
// Redefinitions for CUDA compatibility
//==========================================================================================
// clang-format off
#if defined(__NVCC__)
// ROCm specific
#define wall_clock64 clock64
#define gcnArchName name
// Datatypes
#define hipDeviceProp_t cudaDeviceProp
#define hipError_t cudaError_t
#define hipEvent_t cudaEvent_t
#define hipStream_t cudaStream_t
// Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault
#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
#define hipSuccess cudaSuccess
// Functions
#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer
#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess
#define hipDeviceGetAttribute cudaDeviceGetAttribute
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig
#define hipDeviceSynchronize cudaDeviceSynchronize
#define hipEventCreate cudaEventCreate
#define hipEventDestroy cudaEventDestroy
#define hipEventElapsedTime cudaEventElapsedTime
#define hipEventRecord cudaEventRecord
#define hipFree cudaFree
#define hipGetDeviceCount cudaGetDeviceCount
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipGetErrorString cudaGetErrorString
#define hipHostFree cudaFreeHost
#define hipHostMalloc cudaMallocHost
#define hipMalloc cudaMalloc
#define hipMallocManaged cudaMallocManaged
#define hipMemcpy cudaMemcpy
#define hipMemcpyAsync cudaMemcpyAsync
#define hipMemset cudaMemset
#define hipMemsetAsync cudaMemsetAsync
#define hipSetDevice cudaSetDevice
#define hipStreamCreate cudaStreamCreate
#define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize
// clang-format on
// Define float2 addition operator for NVIDIA platform
__device__ inline float2& operator+=(float2& a, const float2& b)
{
a.x += b.x;
a.y += b.y;
return a;
}
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator+=(float4& a, const float4& b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
a.w += b.w;
return a;
}
#endif
// Helper macro functions
//==========================================================================================
// Macro for collecting the CU/SM that a GFX kernel is running on
// clang-format off
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || \
defined(__gfx1200__) || defined(__gfx1201__)
#define GetHwId(hwId) hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
#else
#define GetHwId(hwId) asm volatile("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s"(hwId));
#endif
// Macro for collecting the XCC that a GFX kernel is running on
#if defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) asm volatile("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s"(val));
#else
#define GetXccId(val) val = 0
#endif
// Error check macro (NOTE: This will return even for ERR_WARN)
#define ERR_CHECK(cmd) \
do { \
ErrResult err = (cmd); \
if (err.errType != ERR_NONE) { \
return err; \
} \
} while (0)
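// Usage sketch (illustrative): within any function returning ErrResult,
// ERR_CHECK propagates the first non-ERR_NONE result (including warnings);
// hipError_t converts implicitly through the ErrResult constructor:
//   ErrResult SelectDevice(int id)
//   {
//     ERR_CHECK(hipSetDevice(id)); // returns early on failure
//     return ERR_NONE;
//   }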
// Appends warn/fatal errors to a list, returns false if fatal
#define ERR_APPEND(cmd, list) \
do { \
ErrResult err = (cmd); \
if (err.errType != ERR_NONE) { \
list.push_back(err); \
} \
if (err.errType == ERR_FATAL) { \
return false; \
} \
} while (0)
// Helper macros for calling RDMA functions and reporting errors
#ifdef VERBS_DEBUG
#define IBV_CALL(func_, ...) \
do { \
int error = func_(__VA_ARGS__); \
if (error != 0) { \
return {ERR_FATAL, \
"Encountered IbVerbs error (%d) at line (%d) " \
"and function (%s)", \
(error), \
__LINE__, \
#func_}; \
} \
} while (0)
#define IBV_PTR_CALL(ptr_, func_, ...) \
do { \
ptr_ = func_(__VA_ARGS__); \
if (ptr_ == nullptr) { \
return {ERR_FATAL, \
"Encountered IbVerbs nullptr error at line (%d) " \
"and function (%s)", \
__LINE__, \
#func_}; \
} \
} while (0)
#else
#define IBV_CALL(func_, ...) \
do { \
int error = func_(__VA_ARGS__); \
if (error != 0) { \
return {ERR_FATAL, "Encountered IbVerbs error (%d) in func (%s) ", error, #func_}; \
} \
} while (0)
#define IBV_PTR_CALL(ptr_, func_, ...) \
do { \
ptr_ = func_(__VA_ARGS__); \
if (ptr_ == nullptr) { \
return {ERR_FATAL, "Encountered IbVerbs nullptr error in func (%s) ", #func_}; \
} \
} while (0)
#endif
// clang-format on
namespace TransferBench
{
/// @cond
// Helper functions ('hidden' in anonymous namespace)
//========================================================================================
namespace
{
// Constants
//========================================================================================
int constexpr MAX_BLOCKSIZE = 1024; // Max threadblock size
int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
int constexpr MAX_UNROLL = 8; // Max unroll factor
int constexpr MAX_SRCS = 8; // Max srcs per Transfer
int constexpr MAX_DSTS = 8; // Max dsts per Transfer
int constexpr MEMSET_CHAR = 75; // Value to memset (char)
float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (float)
// Parsing-related functions
//========================================================================================
static ErrResult CharToMemType(char const c, MemType& memType)
{
char const* val = strchr(MemTypeStr, toupper(c));
if (val) {
memType = (MemType)(val - MemTypeStr);
return ERR_NONE;
}
return {ERR_FATAL, "Unexpected memory type (%c)", c};
}
static ErrResult CharToExeType(char const c, ExeType& exeType)
{
char const* val = strchr(ExeTypeStr, toupper(c));
if (val) {
exeType = (ExeType)(val - ExeTypeStr);
return ERR_NONE;
}
return {ERR_FATAL, "Unexpected executor type (%c)", c};
}
static ErrResult ParseMemType(std::string const& token, std::vector<MemDevice>& memDevices)
{
char memTypeChar;
int offset = 0, memIndex, inc;
MemType memType;
bool found = false;
memDevices.clear();
while (sscanf(token.c_str() + offset, " %c %d%n", &memTypeChar, &memIndex, &inc) == 2) {
offset += inc;
ErrResult err = CharToMemType(memTypeChar, memType);
if (err.errType != ERR_NONE) { return err; }
if (memType != MEM_NULL) { memDevices.push_back({memType, memIndex}); }
found = true;
}
if (found) { return ERR_NONE; }
return {ERR_FATAL,
"Unable to parse memory type token %s. Expected one of %s followed by an index",
token.c_str(),
MemTypeStr};
}
static ErrResult ParseExeType(std::string const& token, ExeDevice& exeDevice, int& exeSubIndex)
{
char exeTypeChar;
exeSubIndex = -1;
int numTokensParsed = sscanf(
token.c_str(), " %c%d.%d", &exeTypeChar, &exeDevice.exeIndex, &exeSubIndex);
if (numTokensParsed < 2) {
return {ERR_FATAL,
"Unable to parse valid executor token (%s)."
"Expected one of %s followed by an index",
token.c_str(),
ExeTypeStr};
}
return CharToExeType(exeTypeChar, exeDevice.exeType);
}
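// Token examples (illustrative; assumes the conventional ExeTypeStr mapping
// where 'C' denotes CPU, 'G' denotes GFX, and 'D' denotes DMA):
//   "G2"   -> exeDevice = {EXE_GPU_GFX, 2}, exeSubIndex = -1
//   "D0.1" -> exeDevice = {EXE_GPU_DMA, 0}, exeSubIndex = 1 (DMA engine 1)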
// Memory-related functions
//========================================================================================
// Enable peer access between two GPUs
static ErrResult EnablePeerAccess(int const deviceId, int const peerDeviceId)
{
int canAccess;
ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess) {
return {ERR_FATAL,
"Peer access is unavailable between GPU devices %d to %d."
"For AMD hardware, check IOMMU configuration",
peerDeviceId,
deviceId};
}
ERR_CHECK(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) {
return {ERR_FATAL,
"Unable to enable peer to peer access from %d to %d (%s)",
deviceId,
peerDeviceId,
hipGetErrorString(error)};
}
return ERR_NONE;
}
// Check that CPU memory array of numBytes has been allocated on targetId NUMA node
static ErrResult CheckPages(char* array, size_t numBytes, int targetId)
{
size_t const pageSize = getpagesize();
size_t const numPages = (numBytes + pageSize - 1) / pageSize;
std::vector<void*> pages(numPages);
std::vector<int> status(numPages);
pages[0] = array;
for (auto i = std::size_t(1); i < numPages; i++) { pages[i] = (char*)pages[i - 1] + pageSize; }
long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
if (retCode) {
return {ERR_FATAL,
"Unable to collect page table information for allocated memory. "
"Ensure NUMA library is installed properly"};
}
size_t mistakeCount = 0;
for (size_t i = 0; i < numPages; i++) {
if (status[i] < 0) {
return {ERR_FATAL, "Unexpected page status (%d) for page %llu", status[i], i};
}
if (status[i] != targetId) { mistakeCount++; }
}
if (mistakeCount > 0) {
return {ERR_FATAL,
"%lu out of %lu pages for memory allocation were not on NUMA node %d."
" This could be due to hardware memory issues, or the use of numa-rebalancing "
"daemons such as numad",
mistakeCount,
numPages,
targetId};
}
return ERR_NONE;
}
// Allocate memory
static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
{
if (numBytes == 0) { return {ERR_FATAL, "Unable to allocate 0 bytes"}; }
*memPtr = nullptr;
MemType const& memType = memDevice.memType;
if (IsCpuMemType(memType)) {
// Determine which NUMA device to use
int numaIdx = memDevice.memIndex;
if (memType == MEM_CPU_CLOSEST) { numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex); }
// Set NUMA policy prior to call to hipHostMalloc
numa_set_preferred(numaIdx);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE) {
// clang-format off
#if defined(__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
#else
ERR_CHECK(hipHostMalloc((void**)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocCoherent));
#endif
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined(__NVCC__)
ERR_CHECK(hipHostMalloc((void**)memPtr, numBytes, 0));
#else
ERR_CHECK(hipHostMalloc((void**)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
#endif
// clang-format on
} else if (memType == MEM_CPU_UNPINNED) {
*memPtr = numa_alloc_onnode(numBytes, numaIdx);
if (*memPtr == nullptr) {
return {ERR_FATAL, "Unable to allocate %lu bytes on NUMA node %d", numBytes, numaIdx};
}
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx));
// Reset to default numa mem policy
numa_set_preferred(-1);
} else if (IsGpuMemType(memType)) {
// Switch to the appropriate GPU
ERR_CHECK(hipSetDevice(memDevice.memIndex));
if (memType == MEM_GPU) {
// Allocate GPU memory on appropriate device
ERR_CHECK(hipMalloc((void**)memPtr, numBytes));
} else if (memType == MEM_GPU_FINE) {
// clang-format off
#if defined(__NVCC__)
return {ERR_FATAL, "Fine-grained GPU memory not supported on NVIDIA platform"};
#else
int flag = hipDeviceMallocUncached;
ERR_CHECK(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
// clang-format on
} else if (memType == MEM_MANAGED) {
ERR_CHECK(hipMallocManaged((void**)memPtr, numBytes));
}
// Clear the memory
ERR_CHECK(hipMemset(*memPtr, 0, numBytes));
ERR_CHECK(hipDeviceSynchronize());
} else {
return {ERR_FATAL, "Unsupported memory type (%d)", memType};
}
return ERR_NONE;
}
// Deallocate memory
static ErrResult DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
{
// Avoid deallocating nullptr
if (memPtr == nullptr) {
return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
}
switch (memType) {
case MEM_CPU:
case MEM_CPU_FINE:
case MEM_CPU_CLOSEST: {
ERR_CHECK(hipHostFree(memPtr));
break;
}
case MEM_CPU_UNPINNED: {
numa_free(memPtr, bytes);
break;
}
case MEM_GPU:
case MEM_GPU_FINE:
case MEM_MANAGED: {
ERR_CHECK(hipFree(memPtr));
break;
}
default:
return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType};
}
return ERR_NONE;
}
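// Usage sketch (illustrative): allocate 1 MiB of pinned host memory on NUMA
// node 0 and release it again; GetHsaAgent() below uses the same pattern with
// temporary allocations to probe agent ownership:
//   float* buf = nullptr;
//   ERR_CHECK(AllocateMemory({MEM_CPU, 0}, 1 << 20, (void**)&buf));
//   // ... use buf ...
//   ERR_CHECK(DeallocateMemory(MEM_CPU, buf, 1 << 20));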
// HSA-related functions
//========================================================================================
#if !defined(__NVCC__)
// Get the hsa_agent_t associated with a ExeDevice
static ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent)
{
static bool isInitialized = false;
static std::vector<hsa_agent_t> cpuAgents;
static std::vector<hsa_agent_t> gpuAgents;
int const& exeIndex = exeDevice.exeIndex;
int const numCpus = GetNumExecutors(EXE_CPU);
int const numGpus = GetNumExecutors(EXE_GPU_GFX);
// Initialize results on first use
if (!isInitialized) {
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ErrResult err;
int32_t* tempBuffer;
// Index CPU agents
cpuAgents.clear();
for (int i = 0; i < numCpus; i++) {
ERR_CHECK(AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer));
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
cpuAgents.push_back(info.agentOwner);
ERR_CHECK(DeallocateMemory(MEM_CPU, tempBuffer, 1024));
}
// Index GPU agents
gpuAgents.clear();
for (int i = 0; i < numGpus; i++) {
ERR_CHECK(AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer));
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
gpuAgents.push_back(info.agentOwner);
ERR_CHECK(DeallocateMemory(MEM_GPU, tempBuffer, 1024));
}
isInitialized = true;
}
switch (exeDevice.exeType) {
case EXE_CPU:
if (exeIndex < 0 || exeIndex >= numCpus) {
return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1};
}
agent = cpuAgents[exeDevice.exeIndex];
break;
case EXE_GPU_GFX:
case EXE_GPU_DMA:
if (exeIndex < 0 || exeIndex >= numGpus) {
return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
}
agent = gpuAgents[exeIndex];
break;
default:
return {ERR_FATAL,
"Attempting to get HSA agent of unknown or unsupported executor type (%d)",
exeDevice.exeType};
}
return ERR_NONE;
}
// Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{
if (memDevice.memType == MEM_CPU_CLOSEST) {
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
}
if (IsCpuMemType(memDevice.memType)) {
return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
}
if (IsGpuMemType(memDevice.memType)) {
return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
}
return {ERR_FATAL,
"Unable to get HSA agent for memDevice (%d,%d)",
memDevice.memType,
memDevice.memIndex};
}
#endif
// Setup validation-related functions
//========================================================================================
static ErrResult GetActualExecutor(ConfigOptions const& cfg,
ExeDevice const& origExeDevice,
ExeDevice& actualExeDevice)
{
// By default, nothing needs to change
actualExeDevice = origExeDevice;
// When using NIC_NEAREST, remap to the closest NIC to the GPU
if (origExeDevice.exeType == EXE_NIC_NEAREST) {
actualExeDevice.exeType = EXE_NIC;
if (cfg.nic.closestNics.size() > 0) {
if (origExeDevice.exeIndex < 0 ||
static_cast<std::size_t>(origExeDevice.exeIndex) >= cfg.nic.closestNics.size()) {
return {ERR_FATAL, "NIC index is out of range (%d)", origExeDevice.exeIndex};
}
actualExeDevice.exeIndex = cfg.nic.closestNics[origExeDevice.exeIndex];
} else {
actualExeDevice.exeIndex = GetClosestNicToGpu(origExeDevice.exeIndex);
}
}
return ERR_NONE;
}
// Validate that MemDevice exists
static ErrResult CheckMemDevice(MemDevice const& memDevice)
{
if (memDevice.memType == MEM_NULL) { return ERR_NONE; }
if (IsCpuMemType(memDevice.memType) && memDevice.memType != MEM_CPU_CLOSEST) {
int numCpus = GetNumExecutors(EXE_CPU);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus) {
return {ERR_FATAL,
"CPU index must be between 0 and %d (instead of %d)",
numCpus - 1,
memDevice.memIndex};
}
return ERR_NONE;
}
if (IsGpuMemType(memDevice.memType) || memDevice.memType == MEM_CPU_CLOSEST) {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus) {
return {ERR_FATAL,
"GPU index must be between 0 and %d (instead of %d)",
numGpus - 1,
memDevice.memIndex};
}
if (memDevice.memType == MEM_CPU_CLOSEST) {
if (GetClosestCpuNumaToGpu(memDevice.memIndex) == -1) {
return {ERR_FATAL,
"Unable to determine closest NUMA node for GPU %d",
memDevice.memIndex};
}
}
return ERR_NONE;
}
return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
}
// Validate configuration options - returns true if and only if a fatal error is detected
static bool ConfigOptionsHaveErrors(ConfigOptions const& cfg, std::vector<ErrResult>& errors)
{
// Check general options
if (cfg.general.numWarmups < 0) {
errors.push_back({ERR_FATAL, "[general.numWarmups] must be a non-negative number"});
}
// Check data options
if (cfg.data.blockBytes == 0 || cfg.data.blockBytes % 4) {
errors.push_back(
{ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
}
if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float)) {
errors.push_back(
{ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
}
if (cfg.data.fillCompress.size() > 0 && cfg.data.fillPattern.size() > 0) {
errors.push_back(
{ERR_WARN,
"[data.fillCompress] will override [data.fillPattern] when both are specified"});
}
if (cfg.data.fillCompress.size() > 0) {
int sum = 0;
for (int bin : cfg.data.fillCompress) { sum += bin; }
if (sum != 100) {
errors.push_back({ERR_FATAL, "[data.fillCompress] values must add up to 100"});
}
}
if (cfg.data.fillCompress.size() > 5) {
errors.push_back({ERR_FATAL, "[data.fillCompress] may only have up to 5 values"});
}
// Check GFX options
if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2) {
errors.push_back(
{ERR_FATAL,
"[gfx.blockOrder] must be 0 for sequential, 1 for interleaved, or 2 for random"});
}
if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0) {
errors.push_back(
{ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"});
}
int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize) {
errors.push_back(
{ERR_FATAL,
"[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
gfxMaxBlockSize});
}
if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3) {
errors.push_back(
{ERR_FATAL, "[gfx.temporalMode] must be non-negative and less than or equal to 3"});
}
// clang-format off
#if defined(__NVCC__)
if (cfg.gfx.temporalMode > 0) {
errors.push_back({ERR_FATAL, "[gfx.temporalMode] is not supported on NVIDIA hardware"});
}
#endif
// clang-format on
int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll) {
errors.push_back({ERR_FATAL,
"[gfx.unrollFactor] must be non-negative and less than or equal to %d",
gfxMaxUnroll});
}
if (cfg.gfx.waveOrder < 0 || cfg.gfx.waveOrder >= 6) {
errors.push_back({ERR_FATAL, "[gfx.waveOrder] must be non-negative and less than 6"});
}
if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4)) {
errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});
}
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
if (!table.empty()) {
if (table.size() != static_cast<std::size_t>(numGpus)) {
errors.push_back(
{ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
} else {
for (auto i = std::size_t(0); i < table.size(); i++) {
if (table[i].size() != static_cast<std::size_t>(numGpus)) {
errors.push_back({ERR_FATAL,
"[gfx.prefXccTable] must be have size %dx%d",
numGpus,
numGpus});
break;
} else {
for (auto x : table[i]) {
if (x < 0 || x >= numXccs) {
errors.push_back(
{ERR_FATAL,
"[gfx.prefXccTable] must contain values between 0 and %d",
numXccs - 1});
break;
}
}
}
}
}
}
// clang-format off
// Check NIC options
#ifdef NIC_EXEC_ENABLED
int numNics = GetNumExecutors(EXE_NIC);
for (auto const& nic : cfg.nic.closestNics) {
if (nic < 0 || nic >= numNics) {
errors.push_back(
{ERR_FATAL, "NIC index (%d) in user-specified closest NIC list must be between 0 and %d", nic, numNics - 1});
}
}
size_t closestNicsSize = cfg.nic.closestNics.size();
if (closestNicsSize > 0 && closestNicsSize < static_cast<std::size_t>(numGpus)) {
errors.push_back({ERR_FATAL, "User-specified closest NIC list must match GPU count of %d", numGpus});
}
#endif
// clang-format on
// NVIDIA specific
#if defined(__NVCC__)
if (cfg.data.validateDirect) {
errors.push_back({ERR_FATAL, "[data.validateDirect] is not supported on NVIDIA hardware"});
}
#else
// AMD specific
// Check for largeBar enablement on GPUs
for (int i = 0; i < numGpus; i++) {
int isLargeBar = 0;
hipError_t err = hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, i);
if (err != hipSuccess) {
errors.push_back({ERR_FATAL, "Unable to query if GPU %d has largeBAR enabled", i});
} else if (!isLargeBar) {
errors.push_back({ERR_WARN,
"Large BAR is not enabled for GPU %d in BIOS. "
"Large BAR is required to enable multi-gpu data access",
i});
}
}
#endif
// Check for fatal errors
for (auto const& err : errors) {
if (err.errType == ERR_FATAL) {
return true;
}
}
return false;
}
// Validate Transfers to execute - returns true if and only if fatal error detected
static bool TransfersHaveErrors(ConfigOptions const& cfg, std::vector<Transfer> const& transfers, std::vector<ErrResult>& errors)
{
int numCpus = GetNumExecutors(EXE_CPU);
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numNics = GetNumExecutors(EXE_NIC);
std::set<ExeDevice> executors;
std::map<ExeDevice, int> transferCount;
std::map<ExeDevice, int> useSubIndexCount;
std::map<ExeDevice, int> totalSubExecs;
// Per-Transfer checks
for (int i = 0; i < static_cast<int>(transfers.size()); i++) {
Transfer const& t = transfers[i];
if (t.numBytes == 0) {
errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
}
if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) {
size_t const N = t.numBytes / sizeof(float);
int const targetMultiple = cfg.data.blockBytes / sizeof(float);
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)t.numSubExecs);
if (maxSubExecToUse < t.numSubExecs) {
errors.push_back({ERR_WARN,
"Transfer %d data size is too small - will only use %d of %d subexecutors",
i,
maxSubExecToUse,
t.numSubExecs});
}
}
// Check sources and destinations
if (t.srcs.empty() && t.dsts.empty()) {
errors.push_back({ERR_FATAL, "Transfer %d: Must have at least one source or destination", i});
}
for (int j = 0; j < static_cast<int>(t.srcs.size()); j++) {
ErrResult err = CheckMemDevice(t.srcs[j]);
if (err.errType != ERR_NONE) {
errors.push_back({ERR_FATAL, "Transfer %d: SRC %d: %s", i, j, err.errMsg.c_str()});
}
}
for (int j = 0; j < static_cast<int>(t.dsts.size()); j++) {
ErrResult err = CheckMemDevice(t.dsts[j]);
if (err.errType != ERR_NONE) {
errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()});
}
}
// Check executor
executors.insert(t.exeDevice);
transferCount[t.exeDevice]++;
switch (t.exeDevice.exeType) {
case EXE_CPU:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numCpus) {
errors.push_back({ERR_FATAL,
"Transfer %d: CPU index must be between 0 and %d (instead of %d)",
i,
numCpus - 1,
t.exeDevice.exeIndex});
}
break;
case EXE_GPU_GFX:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
errors.push_back({ERR_FATAL,
"Transfer %d: GFX index must be between 0 and %d (instead of %d)",
i,
numGpus - 1,
t.exeDevice.exeIndex});
} else {
if (t.exeSubIndex != -1) {
// clang-format off
#if defined(__NVCC__)
errors.push_back({ERR_FATAL, "Transfer %d: GFX executor subindex not supported on NVIDIA hardware", i});
#else
useSubIndexCount[t.exeDevice]++;
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices) {
errors.push_back(
{ERR_FATAL, "Transfer %d: GFX subIndex (XCC) must be between 0 and %d", i, numSubIndices - 1});
}
#endif
// clang-format on
}
}
break;
case EXE_GPU_DMA:
if (t.srcs.size() != 1 || t.dsts.size() != 1) {
errors.push_back(
{ERR_FATAL,
"Transfer %d: DMA executor must have exactly 1 source and 1 destination",
i});
// Cannot safely perform the remaining DMA checks without exactly 1 src/dst
continue;
}
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
errors.push_back(
{ERR_FATAL,
"Transfer %d: DMA index must be between 0 and %d (instead of %d)",
i,
numGpus - 1,
t.exeDevice.exeIndex});
// Cannot proceed with any further checks
continue;
}
if (t.exeSubIndex != -1) {
// clang-format off
#if defined(__NVCC__)
errors.push_back({ERR_FATAL, "Transfer %d: DMA executor subindex not supported on NVIDIA hardware", i});
#else
useSubIndexCount[t.exeDevice]++;
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices) {
errors.push_back(
{ERR_FATAL, "Transfer %d: DMA subIndex (engine) must be between 0 and %d", i, numSubIndices - 1});
}
// Check that engine Id exists between agents
hsa_agent_t srcAgent, dstAgent;
ErrResult err;
err = GetHsaAgent(t.srcs[0], srcAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) {
break;
}
}
err = GetHsaAgent(t.dsts[0], dstAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) {
break;
}
}
// Skip check of engine Id mask for self copies
if (srcAgent.handle != dstAgent.handle) {
uint32_t engineIdMask = 0;
err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) {
break;
}
}
hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
if (!(sdmaEngineId & engineIdMask)) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
i,
t.exeDevice.exeIndex,
t.exeSubIndex});
}
}
#endif
// clang-format on
}
if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
errors.push_back({ERR_WARN,
"Transfer %d: No GPU memory for source or destination. Copy "
"might not execute on DMA %d",
i,
t.exeDevice.exeIndex});
} else {
// Currently HIP will use src agent if source memory is GPU, otherwise dst agent
if (IsGpuMemType(t.srcs[0].memType)) {
if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor will automatically switch "
"to using the source "
"memory device (%d) not (%d)",
i,
t.srcs[0].memIndex,
t.exeDevice.exeIndex});
}
} else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back(
{ERR_WARN,
"Transfer %d: DMA executor will automatically switch to using the "
"destination memory device (%d) not (%d)",
i,
t.dsts[0].memIndex,
t.exeDevice.exeIndex});
}
}
break;
case EXE_NIC:
// clang-format off
#ifdef NIC_EXEC_ENABLED
{
int srcIndex = t.exeDevice.exeIndex;
int dstIndex = t.exeSubIndex;
if (srcIndex < 0 || srcIndex >= numNics) {
errors.push_back(
{ERR_FATAL, "Transfer %d: src NIC executor indexes an out-of-range NIC (%d)", i, srcIndex});
}
if (dstIndex < 0 || dstIndex >= numNics) {
errors.push_back(
{ERR_FATAL, "Transfer %d: dst NIC executor indexes an out-of-range NIC (%d)", i, dstIndex});
}
}
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
#endif
break;
case EXE_NIC_NEAREST:
#ifdef NIC_EXEC_ENABLED
{
ExeDevice srcExeDevice;
ErrResult errSrc = GetActualExecutor(cfg, t.exeDevice, srcExeDevice);
if (errSrc.errType != ERR_NONE) {
errors.push_back(errSrc);
}
ExeDevice dstExeDevice;
ErrResult errDst = GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice);
if (errDst.errType != ERR_NONE) {
errors.push_back(errDst);
}
}
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
#endif
// clang-format on
break;
}
// Check subexecutors
if (t.numSubExecs <= 0) {
errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i});
} else {
totalSubExecs[t.exeDevice] += t.numSubExecs;
}
}
int gpuMaxHwQueues = 4;
if (getenv("GPU_MAX_HW_QUEUES")) { gpuMaxHwQueues = atoi(getenv("GPU_MAX_HW_QUEUES")); }
// Aggregate checks
for (auto const& exeDevice : executors) {
switch (exeDevice.exeType) {
case EXE_CPU: {
// Check total number of subexecutors requested
int numCpuSubExec = GetNumSubExecutors(exeDevice);
if (totalSubExecs[exeDevice] > numCpuSubExec) {
errors.push_back({ERR_WARN,
"CPU %d requests %d total cores however only %d available. "
"Serialization will occur",
exeDevice.exeIndex,
totalSubExecs[exeDevice],
numCpuSubExec});
}
break;
}
case EXE_GPU_GFX: {
// Check total number of subexecutors requested
int numGpuSubExec = GetNumSubExecutors(exeDevice);
if (totalSubExecs[exeDevice] > numGpuSubExec) {
errors.push_back({ERR_WARN,
"GPU %d requests %d total CUs however only %d available. "
"Serialization will occur",
exeDevice.exeIndex,
totalSubExecs[exeDevice],
numGpuSubExec});
}
// Check that if executor subindices are used, all Transfers specify executor
// subindices
if (useSubIndexCount[exeDevice] > 0 &&
useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL,
"GPU %d specifies XCC on only %d of %d Transfers. "
"Must either specific none or all",
exeDevice.exeIndex,
useSubIndexCount[exeDevice],
transferCount[exeDevice]});
}
if (cfg.gfx.useMultiStream && transferCount[exeDevice] > gpuMaxHwQueues) {
errors.push_back({ERR_WARN,
"GPU %d attempting %d parallel transfers, however "
"GPU_MAX_HW_QUEUES only set to %d",
exeDevice.exeIndex,
transferCount[exeDevice],
gpuMaxHwQueues});
}
break;
}
case EXE_GPU_DMA: {
// Check that if executor subindices are used, all Transfers specify executor
// subindices
if (useSubIndexCount[exeDevice] > 0 &&
useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL,
"DMA %d specifies engine on only %d of %d Transfers. "
"Must either specific none or all",
exeDevice.exeIndex,
useSubIndexCount[exeDevice],
transferCount[exeDevice]});
}
if (transferCount[exeDevice] > gpuMaxHwQueues) {
errors.push_back({ERR_WARN,
"DMA %d attempting %d parallel transfers, however "
"GPU_MAX_HW_QUEUES only set to %d",
exeDevice.exeIndex,
transferCount[exeDevice],
gpuMaxHwQueues});
}
char* enableSdma = getenv("HSA_ENABLE_SDMA");
if (enableSdma && !strcmp(enableSdma, "0")) {
errors.push_back({ERR_WARN,
"DMA functionality disabled due to environment variable "
"HSA_ENABLE_SDMA=0. "
"DMA %d copies will fallback to blit (GFX) kernels",
exeDevice.exeIndex});
}
break;
}
default: break;
}
}
// Check for fatal errors
for (auto const& err : errors) {
if (err.errType == ERR_FATAL) { return true; }
}
return false;
}
// Internal data structures
//========================================================================================
// Parameters for each SubExecutor
struct SubExecParam
{
// Inputs
size_t N; ///< Number of floats this subExecutor works on
int numSrcs; ///< Number of source arrays
int numDsts; ///< Number of destination arrays
float* src[MAX_SRCS]; ///< Source array pointers
float* dst[MAX_DSTS]; ///< Destination array pointers
int32_t preferredXccId; ///< XCC ID to execute on (GFX only)
// Prepared
int teamSize; ///< Size of the team this sub executor is part of
int teamIdx; ///< Index of this sub executor within its team
// Outputs
long long startCycle; ///< Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; ///< Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; ///< Hardware ID
uint32_t xccId; ///< XCC ID
};
// Internal resources allocated per Transfer
struct TransferResources
{
int transferIdx; ///< The associated Transfer
size_t numBytes; ///< Number of bytes to Transfer
vector<float*> srcMem; ///< Source memory
vector<float*> dstMem; ///< Destination memory
vector<SubExecParam> subExecParamCpu; ///< Defines subarrays for each subexecutor
vector<int> subExecIdx; ///< Indices into subExecParamGpu
int numaNode; ///< NUMA node to use for this Transfer
// For GFX executor
SubExecParam* subExecParamGpuPtr;
// For targeted-SDMA
// clang-format off
#if !defined(__NVCC__)
hsa_agent_t dstAgent; ///< DMA destination memory agent
hsa_agent_t srcAgent; ///< DMA source memory agent
hsa_signal_t signal; ///< HSA signal for completion
hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID
#endif
// clang-format on
// For IBV executor
// clang-format off
#ifdef NIC_EXEC_ENABLED
int srcNicIndex; ///< SRC NIC index
int dstNicIndex; ///< DST NIC index
ibv_context* srcContext; ///< Device context for SRC NIC
ibv_context* dstContext; ///< Device context for DST NIC
ibv_pd* srcProtect; ///< Protection domain for SRC NIC
ibv_pd* dstProtect; ///< Protection domain for DST NIC
ibv_cq* srcCompQueue; ///< Completion queue for SRC NIC
ibv_cq* dstCompQueue; ///< Completion queue for DST NIC
ibv_port_attr srcPortAttr; ///< Port attributes for SRC NIC
ibv_port_attr dstPortAttr; ///< Port attributes for DST NIC
ibv_gid srcGid; ///< GID handle for SRC NIC
ibv_gid dstGid; ///< GID handle for DST NIC
vector<ibv_qp*> srcQueuePairs; ///< Queue pairs for SRC NIC
vector<ibv_qp*> dstQueuePairs; ///< Queue pairs for DST NIC
ibv_mr* srcMemRegion; ///< Memory region for SRC
ibv_mr* dstMemRegion; ///< Memory region for DST
uint8_t qpCount; ///< Number of QPs to be used for transferring data
vector<ibv_sge> sgePerQueuePair; ///< Scatter-gather elements per queue pair
vector<ibv_send_wr> sendWorkRequests; ///< Send work requests per queue pair
#endif
// clang-format on
// Counters
double totalDurationMsec; ///< Total duration for all iterations for this Transfer
vector<double> perIterMsec; ///< Duration for each individual iteration
vector<set<pair<int, int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
};
// Internal resources allocated per Executor
struct ExeInfo
{
size_t totalBytes; ///< Total bytes this executor transfers
double totalDurationMsec; ///< Total duration for all iterations for this Executor
int totalSubExecs; ///< Total number of subExecutors to use
bool useSubIndices; ///< Use subexecutor indices
int numSubIndices; ///< Number of subindices this ExeDevice has
vector<SubExecParam> subExecParamCpu; ///< Subexecutor parameters for this executor
vector<TransferResources> resources; ///< Per-Transfer resources
// For GPU-Executors
SubExecParam* subExecParamGpu; ///< GPU copy of subExecutor parameters
vector<hipStream_t> streams; ///< HIP streams to launch on
vector<hipEvent_t> startEvents; ///< HIP start timing event
vector<hipEvent_t> stopEvents; ///< HIP stop timing event
int wallClockRate; ///< (GFX-only) Device wall clock rate
};
// Structure to track PCIe topology
struct PCIeNode
{
std::string address; ///< PCIe address for this PCIe node
std::string description; ///< Description for this PCIe node
std::set<PCIeNode> children; ///< Children PCIe nodes
// Default constructor
PCIeNode() : address(""), description("") {}
// Constructor
PCIeNode(std::string const& addr) : address(addr) {}
// Constructor
PCIeNode(std::string const& addr, std::string const& desc)
: address(addr), description(desc)
{}
// Comparison operator for std::set
bool operator<(PCIeNode const& other) const { return address < other.address; }
};
#ifdef NIC_EXEC_ENABLED
// Structure to track information about IBV devices
struct IbvDevice
{
ibv_device* devicePtr = nullptr;
std::string name;
std::string busId;
bool hasActivePort = false;
int numaNode = -1;
int gidIndex = -1;
std::string gidDescriptor;
bool isRoce = false;
};
#endif
#ifdef NIC_EXEC_ENABLED
// Function to collect information about IBV devices
//========================================================================================
static bool IsConfiguredGid(union ibv_gid const& gid)
{
const struct in6_addr* a = (struct in6_addr*)gid.raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid const& gid)
{
const struct in6_addr* a = (struct in6_addr*)gid.raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) { return true; }
return false;
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int& version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
snprintf(roceTypePath,
sizeof(roceTypePath),
"/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName,
portNum,
gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1) { return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath}; }
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1) { return {ERR_FATAL, "Failed while reading RoCE version"}; }
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0 ||
strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
version = 1;
} else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
version = 2;
}
}
return ERR_NONE;
}
static bool IsIPv4MappedIPv6(const union ibv_gid& gid)
{
// look for ::ffff:x.x.x.x format
// From Broadcom documentation
// https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html
// "The IPv4 address is really an IPv4 address mapped into the IPv6 address space.
// This can be identified by 80 “0” bits, followed by 16 “1” bits (“FFFF” in hexadecimal)
// followed by the original 32-bit IPv4 address."
return (gid.global.subnet_prefix == 0 && gid.raw[8] == 0 && gid.raw[9] == 0 &&
gid.raw[10] == 0xff && gid.raw[11] == 0xff);
}
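// Byte-layout example (illustrative): the IPv4 address 192.168.1.2 appears as
// the GID ::ffff:192.168.1.2, i.e.
//   raw[0..7]   = 0x00  (subnet_prefix == 0)
//   raw[8..9]   = 0x00
//   raw[10..11] = 0xff 0xff
//   raw[12..15] = 192 168 1 2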
static ErrResult GetGidIndex(struct ibv_context* context,
int const& gidTblLen,
int const& portNum,
std::pair<int, std::string>& gidInfo)
{
if (gidInfo.first >= 0) {
return ERR_NONE; // honor user choice
}
union ibv_gid gid;
GidPriority highestPriority = GidPriority::UNKNOWN;
int gidIndex = -1;
for (int i = 0; i < gidTblLen; ++i) {
IBV_CALL(ibv_query_gid, context, portNum, i, &gid);
if (!IsConfiguredGid(gid)) { continue; }
int gidCurrRoceVersion;
if (GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion).errType != ERR_NONE) {
continue;
}
GidPriority currPriority;
if (IsIPv4MappedIPv6(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4
: GidPriority::ROCEV1_IPV4;
} else if (!LinkLocalGid(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6
: GidPriority::ROCEV1_IPV6;
} else {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL
: GidPriority::ROCEV1_LINK_LOCAL;
}
if (currPriority > highestPriority) {
highestPriority = currPriority;
gidIndex = i;
}
}
if (highestPriority == GidPriority::UNKNOWN) {
gidInfo.first = -1;
return {ERR_FATAL,
"Failed to auto-detect a valid GID index. Try setting it manually through "
"IB_GID_INDEX"};
}
gidInfo.first = gidIndex;
gidInfo.second = GidPriorityStr[highestPriority];
return ERR_NONE;
}
static vector<IbvDevice>& GetIbvDeviceList()
{
static bool isInitialized = false;
static vector<IbvDevice> ibvDeviceList = {};
// Build list on first use
if (!isInitialized) {
// Query the number of IBV devices
int numIbvDevices = 0;
ibv_device** deviceList = ibv_get_device_list(&numIbvDevices);
if (deviceList && numIbvDevices > 0) {
// Loop over each device to collect information
for (int i = 0; i < numIbvDevices; i++) {
IbvDevice ibvDevice;
ibvDevice.devicePtr = deviceList[i];
ibvDevice.name = deviceList[i]->name;
ibvDevice.hasActivePort = false;
{
struct ibv_context* context = ibv_open_device(ibvDevice.devicePtr);
if (context) {
struct ibv_device_attr deviceAttr;
if (!ibv_query_device(context, &deviceAttr)) {
int activePort;
ibvDevice.gidIndex = -1;
for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
struct ibv_port_attr portAttr;
if (ibv_query_port(context, port, &portAttr)) { continue; }
if (portAttr.state == IBV_PORT_ACTIVE) {
activePort = port;
ibvDevice.hasActivePort = true;
if (portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) {
ibvDevice.isRoce = true;
std::pair<int, std::string> gidInfo(-1, "");
auto res = GetGidIndex(
context, portAttr.gid_tbl_len, activePort, gidInfo);
if (res.errType == ERR_NONE) {
ibvDevice.gidIndex = gidInfo.first;
ibvDevice.gidDescriptor = gidInfo.second;
}
}
break;
}
}
}
ibv_close_device(context);
}
}
ibvDevice.busId = "";
{
std::string device_path(ibvDevice.devicePtr->dev_path);
if (std::filesystem::exists(device_path)) {
std::string pciPath = std::filesystem::canonical(device_path + "/device")
.string();
std::size_t pos = pciPath.find_last_of('/');
if (pos != std::string::npos) { ibvDevice.busId = pciPath.substr(pos + 1); }
}
}
// Get nearest numa node for this device
ibvDevice.numaNode = -1;
std::filesystem::path devicePath = "/sys/bus/pci/devices/" + ibvDevice.busId +
"/numa_node";
if (std::filesystem::exists(devicePath)) {
std::string canonicalPath = std::filesystem::canonical(devicePath).string();
std::ifstream file(canonicalPath);
if (file.is_open()) {
std::string numaNodeStr;
std::getline(file, numaNodeStr);
int numaNodeVal;
if (sscanf(numaNodeStr.c_str(), "%d", &numaNodeVal) == 1) {
ibvDevice.numaNode = numaNodeVal;
}
file.close();
}
}
ibvDeviceList.push_back(ibvDevice);
}
}
ibv_free_device_list(deviceList);
isInitialized = true;
}
return ibvDeviceList;
}
#endif // NIC_EXEC_ENABLED
#ifdef NIC_EXEC_ENABLED
// PCIe-related functions
//========================================================================================
// Prints off PCIe tree
static void PrintPCIeTree(PCIeNode const& node, std::string const& prefix = "", bool isLast = true)
{
if (!node.address.empty()) {
printf("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str());
if (!node.description.empty()) { printf("(%s)", node.description.c_str()); }
printf("\n");
}
auto const& children = node.children;
for (auto it = children.begin(); it != children.end(); ++it) {
PrintPCIeTree(*it, prefix + (isLast ? " " : "│ "), std::next(it) == children.end());
}
}
// Inserts nodes along pcieAddress down a tree starting from root
static ErrResult InsertPCIePathToTree(std::string const& pcieAddress,
std::string const& description,
PCIeNode& root)
{
std::filesystem::path devicePath = "/sys/bus/pci/devices/" + pcieAddress;
if (!std::filesystem::exists(devicePath)) {
return {ERR_FATAL, "Device path %s does not exist", devicePath.c_str()};
}
std::string canonicalPath = std::filesystem::canonical(devicePath).string();
std::istringstream iss(canonicalPath);
std::string token;
PCIeNode* currNode = &root;
while (std::getline(iss, token, '/')) {
auto it = (currNode->children.insert(PCIeNode(token))).first;
currNode = const_cast<PCIeNode*>(&(*it));
}
currNode->description = description;
return ERR_NONE;
}
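// Worked example (illustrative, hypothetical sysfs layout): for the PCIe
// address "0000:c1:00.0" the canonical path might be
//   /sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0
// which inserts the chain of child nodes
//   "sys" -> "devices" -> "pci0000:c0" -> "0000:c0:01.1" -> "0000:c1:00.0"
// with the description attached to the final (leaf) node.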
// Returns root node for PCIe tree. Constructed on first use
static PCIeNode* GetPCIeTreeRoot()
{
static bool isInitialized = false;
static PCIeNode pcieRoot;
// Build PCIe tree on first use
if (!isInitialized) {
// Add NICs to the tree
auto const& ibvDeviceList = GetIbvDeviceList();
for (IbvDevice const& ibvDevice : ibvDeviceList) {
if (!ibvDevice.hasActivePort || ibvDevice.busId == "") { continue; }
InsertPCIePathToTree(ibvDevice.busId, ibvDevice.name, pcieRoot);
}
// Add GPUs to the tree
int numGpus = GetNumExecutors(EXE_GPU_GFX);
for (int i = 0; i < numGpus; ++i) {
char hipPciBusId[64];
if (hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i) == hipSuccess) {
InsertPCIePathToTree(hipPciBusId, "GPU " + std::to_string(i), pcieRoot);
}
}
#ifdef VERBS_DEBUG
PrintPCIeTree(pcieRoot);
#endif
isInitialized = true;
}
return &pcieRoot;
}
// Finds the lowest common ancestor in PCIe tree between two nodes
static PCIeNode const* GetLcaBetweenNodes(PCIeNode const* root,
std::string const& node1Address,
std::string const& node2Address)
{
if (!root || root->address == node1Address || root->address == node2Address) { return root; }
PCIeNode const* lcaFound1 = nullptr;
PCIeNode const* lcaFound2 = nullptr;
// Recursively iterate over children
for (auto const& child : root->children) {
PCIeNode const* lca = GetLcaBetweenNodes(&child, node1Address, node2Address);
if (!lca) { continue; }
if (!lcaFound1) {
// First time found
lcaFound1 = lca;
} else {
// Second time found
lcaFound2 = lca;
break;
}
}
// If two children were found, then current node is the lowest common ancestor
return (lcaFound1 && lcaFound2) ? root : lcaFound1;
}
// Gets the depth of an node in the PCIe tree
static int GetLcaDepth(std::string const& targetBusID, PCIeNode const* const& node, int depth = 0)
{
if (!node) { return -1; }
if (targetBusID == node->address) { return depth; }
for (auto const& child : node->children) {
int distance = GetLcaDepth(targetBusID, &child, depth + 1);
if (distance != -1) { return distance; }
}
return -1;
}
// Function to extract the bus number from a PCIe address (domain:bus:device.function)
static int ExtractBusNumber(std::string const& pcieAddress)
{
int domain, bus, device, function;
char delimiter;
std::istringstream iss(pcieAddress);
iss >> std::hex >> domain >> delimiter >> bus >> delimiter >> device >> delimiter >> function;
if (iss.fail()) {
#ifdef VERBS_DEBUG
printf("Invalid PCIe address format: %s\n", pcieAddress.c_str());
#endif
return -1;
}
return bus;
}
// Function to compute the distance between two bus IDs
static int GetBusIdDistance(std::string const& pcieAddress1, std::string const& pcieAddress2)
{
int bus1 = ExtractBusNumber(pcieAddress1);
int bus2 = ExtractBusNumber(pcieAddress2);
return (bus1 < 0 || bus2 < 0) ? -1 : std::abs(bus1 - bus2);
}
// Given a target busID and a set of candidate devices, returns a set of indices
// that is "closest" to the target
static std::set<int> GetNearestDevicesInTree(std::string const& targetBusId,
std::vector<std::string> const& candidateBusIdList)
{
int maxDepth = -1;
int minDistance = std::numeric_limits<int>::max();
std::set<int> matches = {};
// Loop over the candidates to find the ones with the lowest common ancestor (LCA)
for (int i = 0; i < static_cast<int>(candidateBusIdList.size()); i++) {
std::string const& candidateBusId = candidateBusIdList[i];
if (candidateBusId == "") { continue; }
PCIeNode const* lca = GetLcaBetweenNodes(GetPCIeTreeRoot(), targetBusId, candidateBusId);
if (!lca) { continue; }
int depth = GetLcaDepth(lca->address, GetPCIeTreeRoot());
int currDistance = GetBusIdDistance(targetBusId, candidateBusId);
// When more than one LCA match is found, choose the one with smallest busId difference
// NOTE: currDistance could be -1, which signals a parsing problem; however, it still
// remains a valid "closest" candidate, so it is included
if (depth > maxDepth || (depth == maxDepth && depth >= 0 && currDistance < minDistance)) {
maxDepth = depth;
matches.clear();
matches.insert(i);
minDistance = currDistance;
} else if (depth == maxDepth && depth >= 0 && currDistance == minDistance) {
matches.insert(i);
}
}
return matches;
}
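// Worked example (illustrative, hypothetical topology): given
//   root -> 0000:c0:01.1 -> { 0000:c1:00.0 (GPU 0), 0000:c2:00.0 (NIC A) }
//   root -> 0000:40:01.1 ->   0000:41:00.0 (NIC B)
// the LCA of GPU 0 and NIC A (0000:c0:01.1) is deeper than the LCA of GPU 0
// and NIC B (the root), so only NIC A's index is returned as nearest.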
#endif // NIC_EXEC_ENABLED
#ifdef NIC_EXEC_ENABLED
// IB Verbs-related functions
//========================================================================================
// Create a queue pair
static ErrResult CreateQueuePair(ConfigOptions const& cfg,
struct ibv_pd* pd,
struct ibv_cq* cq,
struct ibv_qp*& qp)
{
// Set queue pair attributes
struct ibv_qp_init_attr attr = {};
attr.qp_type = IBV_QPT_RC; // Set type to reliable connection
attr.send_cq = cq; // Send completion queue
attr.recv_cq = cq; // Recv completion queue
attr.cap.max_send_wr = cfg.nic.maxSendWorkReq; // Max send work requests
attr.cap.max_recv_wr = cfg.nic.maxRecvWorkReq; // Max recv work requests
attr.cap.max_send_sge = 1; // Max send scatter-gather entries
attr.cap.max_recv_sge = 1; // Max recv scatter-gather entries
qp = ibv_create_qp(pd, &attr);
if (qp == NULL) { return {ERR_FATAL, "Error while creating QP"}; }
return ERR_NONE;
}
// Initialize a queue pair
static ErrResult InitQueuePair(struct ibv_qp* qp, uint8_t port, unsigned flags)
{
struct ibv_qp_attr attr = {}; // Clear all attributes
attr.qp_state = IBV_QPS_INIT; // Set the QP state to INIT
attr.pkey_index = 0; // Set the partition key index to 0
attr.port_num = port; // Set the port number to the provided port
attr.qp_access_flags = flags; // Set the QP access flags to the provided flags
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE | // Modify the QP state
IBV_QP_PKEY_INDEX | // Modify the partition key index
IBV_QP_PORT | // Modify the port number
IBV_QP_ACCESS_FLAGS); // Modify the access flags
if (ret != 0) { return {ERR_FATAL, "Error during QP Init. IB Verbs Error code: %d", ret}; }
return ERR_NONE;
}
// Transition QueuePair to Ready to Receive State
static ErrResult TransitionQpToRtr(ibv_qp* qp,
uint16_t const& dlid,
uint32_t const& dqpn,
ibv_gid const& gid,
uint8_t const& gidIndex,
uint8_t const& port,
bool const& isRoCE,
ibv_mtu const& mtu)
{
// Prepare QP attributes
struct ibv_qp_attr attr = {};
attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = mtu;
attr.rq_psn = 0;
attr.max_dest_rd_atomic = 1;
attr.min_rnr_timer = 12;
if (isRoCE) {
attr.ah_attr.is_global = 1;
attr.ah_attr.grh.dgid.global.subnet_prefix = gid.global.subnet_prefix;
attr.ah_attr.grh.dgid.global.interface_id = gid.global.interface_id;
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.sgid_index = gidIndex;
attr.ah_attr.grh.hop_limit = 255;
} else {
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = dlid;
}
attr.ah_attr.sl = 0;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = port;
attr.dest_qp_num = dqpn;
// Modify the QP
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER);
if (ret != 0) { return {ERR_FATAL, "Error during QP RTR. IB Verbs Error code: %d", ret}; }
return ERR_NONE;
}
// Transition QueuePair to Ready to Send state
static ErrResult TransitionQpToRts(struct ibv_qp* qp)
{
struct ibv_qp_attr attr = {};
attr.qp_state = IBV_QPS_RTS;
attr.sq_psn = 0;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.rnr_retry = 7;
attr.max_rd_atomic = 1;
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
if (ret != 0) { return {ERR_FATAL, "Error during QP RTS. IB Verbs Error code: %d", ret}; }
return ERR_NONE;
}
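// Connection sequence sketch (summary of the calls made below): for each of
// the qpCount queue pairs, on both the SRC and DST NICs,
//   CreateQueuePair()   -> reliable-connection QP on the NIC's PD/CQ
//   InitQueuePair()     -> RESET to INIT with RDMA access flags
//   TransitionQpToRtr() -> INIT to RTR, wired to the peer QP number/LID/GID
//   TransitionQpToRts() -> RTR to RTS, ready to post RDMA_WRITE work requests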
static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
ExeDevice const& srcExeDevice,
Transfer const& t,
TransferResources& rss)
{
// Switch to the closest NUMA node to this NIC
int numaNode = GetIbvDeviceList()[srcExeDevice.exeIndex].numaNode;
if (numaNode != -1) { numa_run_on_node(numaNode); }
int const port = cfg.nic.ibPort;
// Figure out destination NIC (Accounts for possible remap due to use of EXE_NIC_NEAREST)
ExeDevice dstExeDevice;
ERR_CHECK(GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice));
rss.srcNicIndex = srcExeDevice.exeIndex;
rss.dstNicIndex = dstExeDevice.exeIndex;
rss.qpCount = t.numSubExecs;
// Check for valid NICs and active ports
int numNics = GetNumExecutors(EXE_NIC);
if (rss.srcNicIndex < 0 || rss.srcNicIndex >= numNics) {
return {ERR_FATAL, "SRC NIC index is out of range (%d)", rss.srcNicIndex};
}
if (rss.dstNicIndex < 0 || rss.dstNicIndex >= numNics) {
return {ERR_FATAL, "DST NIC index is out of range (%d)", rss.dstNicIndex};
}
if (!GetIbvDeviceList()[rss.srcNicIndex].hasActivePort) {
return {ERR_FATAL, "SRC NIC %d is not active\n", rss.srcNicIndex};
}
if (!GetIbvDeviceList()[rss.dstNicIndex].hasActivePort) {
return {ERR_FATAL, "DST NIC %d is not active\n", rss.dstNicIndex};
}
// Queue pair flags
unsigned int rdmaAccessFlags = (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC);
unsigned int rdmaMemRegFlags = rdmaAccessFlags;
if (cfg.nic.useRelaxedOrder) { rdmaMemRegFlags |= IBV_ACCESS_RELAXED_ORDERING; }
// Open NIC contexts
IBV_PTR_CALL(rss.srcContext, ibv_open_device, GetIbvDeviceList()[rss.srcNicIndex].devicePtr);
IBV_PTR_CALL(rss.dstContext, ibv_open_device, GetIbvDeviceList()[rss.dstNicIndex].devicePtr);
// Open protection domains
IBV_PTR_CALL(rss.srcProtect, ibv_alloc_pd, rss.srcContext);
IBV_PTR_CALL(rss.dstProtect, ibv_alloc_pd, rss.dstContext);
// Register memory region
IBV_PTR_CALL(
rss.srcMemRegion, ibv_reg_mr, rss.srcProtect, rss.srcMem[0], rss.numBytes, rdmaMemRegFlags);
IBV_PTR_CALL(
rss.dstMemRegion, ibv_reg_mr, rss.dstProtect, rss.dstMem[0], rss.numBytes, rdmaMemRegFlags);
// Create completion queues
IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, cfg.nic.queueSize, NULL, NULL, 0);
IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0);
// Get port attributes
IBV_CALL(ibv_query_port, rss.srcContext, port, &rss.srcPortAttr);
IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr);
if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer) {
return {ERR_FATAL,
"SRC NIC (%d) and DST NIC (%d) do not have the same link layer",
rss.srcNicIndex,
rss.dstNicIndex};
}
// Prepare GID index
int srcGidIndex = cfg.nic.ibGidIndex;
int dstGidIndex = cfg.nic.ibGidIndex;
// Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately
bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (isRoCE) {
// Try to auto-detect the GID index
std::pair<int, std::string> srcGidInfo(srcGidIndex, "");
std::pair<int, std::string> dstGidInfo(dstGidIndex, "");
ERR_CHECK(
GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, cfg.nic.ibPort, srcGidInfo));
ERR_CHECK(
GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, cfg.nic.ibPort, dstGidInfo));
srcGidIndex = srcGidInfo.first;
dstGidIndex = dstGidInfo.first;
IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
}
// Prepare queue pairs and send elements
rss.srcQueuePairs.resize(rss.qpCount);
rss.dstQueuePairs.resize(rss.qpCount);
rss.sgePerQueuePair.resize(rss.qpCount);
rss.sendWorkRequests.resize(rss.qpCount);
for (int i = 0; i < rss.qpCount; ++i) {
// Create scatter-gather element for the portion of memory assigned to this queue pair
ibv_sge sg = {};
sg.addr = (uint64_t)rss.subExecParamCpu[i].src[0];
sg.length = rss.subExecParamCpu[i].N * sizeof(float);
sg.lkey = rss.srcMemRegion->lkey;
rss.sgePerQueuePair[i] = sg;
// Create send work request
ibv_send_wr wr = {};
wr.wr_id = i;
wr.sg_list = &rss.sgePerQueuePair[i];
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
wr.send_flags = IBV_SEND_SIGNALED;
wr.wr.rdma.remote_addr = (uint64_t)rss.subExecParamCpu[i].dst[0];
wr.wr.rdma.rkey = rss.dstMemRegion->rkey;
rss.sendWorkRequests[i] = wr;
// Create SRC/DST queue pairs
ERR_CHECK(CreateQueuePair(cfg, rss.srcProtect, rss.srcCompQueue, rss.srcQueuePairs[i]));
ERR_CHECK(CreateQueuePair(cfg, rss.dstProtect, rss.dstCompQueue, rss.dstQueuePairs[i]));
// Initialize SRC/DST queue pairs
ERR_CHECK(InitQueuePair(rss.srcQueuePairs[i], port, rdmaAccessFlags));
ERR_CHECK(InitQueuePair(rss.dstQueuePairs[i], port, rdmaAccessFlags));
// Transition the SRC queue pair to ready to receive
ERR_CHECK(TransitionQpToRtr(rss.srcQueuePairs[i],
rss.dstPortAttr.lid,
rss.dstQueuePairs[i]->qp_num,
rss.dstGid,
dstGidIndex,
port,
isRoCE,
rss.srcPortAttr.active_mtu));
// Transition the SRC queue pair to ready to send
ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i]));
// Transition the DST queue pair to ready to receive
ERR_CHECK(TransitionQpToRtr(rss.dstQueuePairs[i],
rss.srcPortAttr.lid,
rss.srcQueuePairs[i]->qp_num,
rss.srcGid,
srcGidIndex,
port,
isRoCE,
rss.dstPortAttr.active_mtu));
// Transition the DST queue pair to ready to send
ERR_CHECK(TransitionQpToRts(rss.dstQueuePairs[i]));
}
return ERR_NONE;
}
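// Note: each Transfer receives one queue pair per subexecutor, and each queue pair posts a
// single RDMA_WRITE covering the subarray computed by PrepareSubExecParams. For example
// (illustrative): a 4 MiB Transfer with numSubExecs = 4 results in 4 QPs, each writing roughly
// 1 MiB of the source memory region to the matching offset in the destination region.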
static ErrResult TeardownNicTransferResources(TransferResources& rss)
{
// Deregister memory regions
IBV_CALL(ibv_dereg_mr, rss.srcMemRegion);
IBV_CALL(ibv_dereg_mr, rss.dstMemRegion);
// Destroy queue pairs
for (auto srcQueuePair : rss.srcQueuePairs) { IBV_CALL(ibv_destroy_qp, srcQueuePair); }
rss.srcQueuePairs.clear();
for (auto dstQueuePair : rss.dstQueuePairs) { IBV_CALL(ibv_destroy_qp, dstQueuePair); }
rss.dstQueuePairs.clear();
// Destroy completion queues
IBV_CALL(ibv_destroy_cq, rss.srcCompQueue);
IBV_CALL(ibv_destroy_cq, rss.dstCompQueue);
// Deallocate protection domains
IBV_CALL(ibv_dealloc_pd, rss.srcProtect);
IBV_CALL(ibv_dealloc_pd, rss.dstProtect);
// Destroy context
IBV_CALL(ibv_close_device, rss.srcContext);
IBV_CALL(ibv_close_device, rss.dstContext);
return ERR_NONE;
}
#endif // NIC_EXEC_ENABLED
// Data validation-related functions
//========================================================================================
// Pseudo-random formula for each element in array
static __host__ float PrepSrcValue(int srcBufferIdx, size_t idx)
{
return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
}
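// Worked example (illustrative): for srcBufferIdx = 0 and idx = 1:
//   ((1 % 383) * 517) % 383 + 31 = (517 % 383) + 31 = 134 + 31 = 165, scaled by (0 + 1) = 165.0f
// The pattern repeats with period 383 and is scaled per source buffer so that the sum over
// multiple sources yields a distinct, predictable expected value per element.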
// Fills a pre-sized buffer with a data pattern based on the source buffer index
// Note: Can also be used to generate the expected dst buffer
static void PrepareReference(ConfigOptions const& cfg, std::vector<float>& cpuBuffer, int bufferIdx)
{
size_t N = cpuBuffer.size();
if (!cfg.data.fillCompress.empty()) {
// 0 -> Random
// 1 -> 1B0 - The upper 1 byte of each aligned 2 bytes is 0
// 2 -> 2B0 - The upper 2 bytes of each aligned 4 bytes are 0
// 3 -> 4B0 - The upper 4 bytes of each aligned 8 bytes are 0
// 4 -> 32B0 - The upper 32 bytes of each 64-byte line are 0
// Fill buffer with random floats
std::mt19937 gen;
gen.seed(bufferIdx * 425);
std::uniform_real_distribution<float> dist(-100000.0f, +100000.0f);
for (size_t i = 0; i < N; i++) { cpuBuffer[i] = dist(gen); }
// Figure out distribution for lines based on the percentages given
size_t numLines = N / 16;
size_t leftover = numLines;
std::vector<size_t> lineCounts(5, 0);
std::set<std::pair<double, int>> remainder;
// Assign rounded down values first
std::vector<int> percentages = cfg.data.fillCompress;
while (percentages.size() < 5) { percentages.push_back(0); }
for (auto i = std::size_t(0); i < percentages.size(); i++) {
lineCounts[i] = (size_t)(numLines * (percentages[i] / 100.0));
leftover -= lineCounts[i];
remainder.insert(
std::make_pair(numLines * (percentages[i] / 100.0) - lineCounts[i], i));
}
// Assign leftovers based on largest remainder
while (leftover != 0) {
auto last = *remainder.rbegin();
lineCounts[last.second]++;
remainder.erase(last);
leftover--;
}
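        // Worked example of the largest-remainder assignment above (illustrative): with
        // numLines = 10 and percentages {50, 25, 25, 0, 0}, the exact shares are
        // {5, 2.5, 2.5, 0, 0}; rounding down gives lineCounts = {5, 2, 2, 0, 0} with one line
        // left over. The set orders (remainder, index) pairs, so the leftover goes to the
        // largest remainder (index 2 here, since ties resolve to the higher index), yielding
        // {5, 2, 3, 0, 0}.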
// Randomly decide which lines get assigned to which types
std::vector<int> lineTypes(numLines, 0);
int offset = lineCounts[0];
for (int i = 1; i < 5; i++) {
for (auto j = std::size_t(0); j < lineCounts[i]; j++) { lineTypes[offset++] = i; }
}
std::shuffle(lineTypes.begin(), lineTypes.end(), gen);
// Apply zero-ing
int dumpLines = getenv("DUMP_LINES") ? atoi(getenv("DUMP_LINES")) : 0;
if (dumpLines) {
printf("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx);
printf("Total lines: %lu\n", numLines);
printf("- 0: Random : %8lu (%8.3f%%)\n",
lineCounts[0],
100.0 * lineCounts[0] / (1.0 * numLines));
printf("- 1: 1B0 : %8lu (%8.3f%%)\n",
lineCounts[1],
100.0 * lineCounts[1] / (1.0 * numLines));
printf("- 2: 2B0 : %8lu (%8.3f%%)\n",
lineCounts[2],
100.0 * lineCounts[2] / (1.0 * numLines));
printf("- 3: 4B0 : %8lu (%8.3f%%)\n",
lineCounts[3],
100.0 * lineCounts[3] / (1.0 * numLines));
printf("- 4: 32B0 : %8lu (%8.3f%%)\n",
lineCounts[4],
100.0 * lineCounts[4] / (1.0 * numLines));
}
for (auto line = std::size_t(0); line < numLines; line++) {
unsigned char* linePtr = (unsigned char*)&cpuBuffer[line * 16];
switch (lineTypes[line]) {
case 1: // 1B0
for (int i = 0; i < 32; i++) { linePtr[2 * i + 1] = 0; }
break;
case 2: // 2B0
for (int i = 0; i < 16; i++) {
linePtr[4 * i + 2] = 0;
linePtr[4 * i + 3] = 0;
}
break;
case 3: // 4B0
for (int i = 0; i < 8; i++) {
linePtr[8 * i + 4] = 0;
linePtr[8 * i + 5] = 0;
linePtr[8 * i + 6] = 0;
linePtr[8 * i + 7] = 0;
}
break;
case 4: // 32B0
for (int i = 32; i < 64; i++) { linePtr[i] = 0; }
break;
}
if (line < static_cast<std::size_t>(dumpLines)) {
printf("Line %02zu [%d]: ", line, lineTypes[line]);
for (int j = 63; j >= 0; j--) {
printf("%02x ", linePtr[j]);
if (j % 16 == 0) { printf(" "); }
}
printf("\n");
}
}
} else {
// Use fill pattern if specified
size_t patternLen = cfg.data.fillPattern.size();
if (patternLen > 0) {
size_t copies = N / patternLen;
size_t leftOver = N % patternLen;
float* cpuBufferPtr = cpuBuffer.data();
for (auto i = std::size_t(0); i < copies; i++) {
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), patternLen * sizeof(float));
cpuBufferPtr += patternLen;
}
if (leftOver) {
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
}
} else {
// Fall back to pseudo-random
for (size_t i = 0; i < N; ++i) { cpuBuffer[i] = PrepSrcValue(bufferIdx, i); }
}
}
}
// Checks that destination buffers match expected values
static ErrResult ValidateAllTransfers(ConfigOptions const& cfg,
vector<Transfer> const& transfers,
vector<TransferResources*> const& transferResources,
vector<vector<float>> const& dstReference,
vector<float>& outputBuffer)
{
float* output;
size_t initOffset = cfg.data.byteOffset / sizeof(float);
for (auto rss : transferResources) {
int transferIdx = rss->transferIdx;
Transfer const& t = transfers[transferIdx];
size_t N = t.numBytes / sizeof(float);
float const* expected = dstReference[t.srcs.size()].data();
for (auto dstIdx = std::size_t(0); dstIdx < rss->dstMem.size(); dstIdx++) {
if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) {
output = (rss->dstMem[dstIdx]) + initOffset;
} else {
ERR_CHECK(hipMemcpy(outputBuffer.data(),
(rss->dstMem[dstIdx]) + initOffset,
t.numBytes,
hipMemcpyDefault));
ERR_CHECK(hipDeviceSynchronize());
output = outputBuffer.data();
}
if (memcmp(output, expected, t.numBytes)) {
// Difference found - find first error
for (size_t i = 0; i < N; i++) {
if (output[i] != expected[i]) {
                    return {ERR_FATAL,
                            "Transfer %d: Unexpected mismatch at index %zu of destination %zu: "
                            "Expected %10.5f "
                            "Actual: %10.5f",
                            transferIdx,
                            i,
                            dstIdx,
                            expected[i],
                            output[i]};
}
}
                return {ERR_FATAL,
                        "Transfer %d: Unexpected output mismatch for destination %zu",
                        transferIdx,
                        dstIdx};
}
}
}
return ERR_NONE;
}
// Preparation-related functions
//========================================================================================
// Prepares input parameters for each subexecutor
// Determines how sub-executors will split up the work
// Initializes counters
static ErrResult PrepareSubExecParams(ConfigOptions const& cfg,
Transfer const& transfer,
TransferResources& rss)
{
// Each subExecutor needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each subExecutor works on for this Transfer
    // - Partition N as evenly as possible, while keeping each subarray size a multiple of
    //   data.blockBytes (except possibly the last subarray) for alignment reasons
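    // Worked example (illustrative): N = 1000 floats, blockBytes = 256 (targetMultiple = 64),
    // numSubExecs = 4. Each pass rounds the remaining work up to a multiple of 64 and divides
    // by the subexecutors left, yielding subarray sizes 256, 256, 256, and 232 (the last
    // subexecutor takes the leftover, which need not be a multiple of 64).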
size_t const N = transfer.numBytes / sizeof(float);
int const initOffset = cfg.data.byteOffset / sizeof(float);
int const targetMultiple = cfg.data.blockBytes / sizeof(float);
    // In some cases, there may not be enough data for all subExecutors
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
(size_t)transfer.numSubExecs);
vector<SubExecParam>& subExecParam = rss.subExecParamCpu;
subExecParam.clear();
subExecParam.resize(transfer.numSubExecs);
size_t assigned = 0;
for (int i = 0; i < transfer.numSubExecs; ++i) {
SubExecParam& p = subExecParam[i];
p.numSrcs = rss.srcMem.size();
p.numDsts = rss.dstMem.size();
p.startCycle = 0;
p.stopCycle = 0;
p.hwId = 0;
p.xccId = 0;
// In single team mode, subexecutors stripe across the entire array
if (cfg.gfx.useSingleTeam && transfer.exeDevice.exeType == EXE_GPU_GFX) {
p.N = N;
p.teamSize = transfer.numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) {
p.src[iSrc] = rss.srcMem[iSrc] + initOffset;
}
for (int iDst = 0; iDst < p.numDsts; ++iDst) {
p.dst[iDst] = rss.dstMem[iDst] + initOffset;
}
} else {
// Otherwise, each subexecutor works on separate subarrays
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) {
p.src[iSrc] = rss.srcMem[iSrc] + initOffset + assigned;
}
for (int iDst = 0; iDst < p.numDsts; ++iDst) {
p.dst[iDst] = rss.dstMem[iDst] + initOffset + assigned;
}
assigned += p.N;
}
p.preferredXccId = transfer.exeSubIndex;
// Override if XCC table has been specified
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
if (transfer.exeDevice.exeType == EXE_GPU_GFX && transfer.exeSubIndex == -1 &&
!table.empty() && transfer.dsts.size() == 1 && IsGpuMemType(transfer.dsts[0].memType)) {
if (table.size() <= static_cast<std::size_t>(transfer.exeDevice.exeIndex) ||
table[transfer.exeDevice.exeIndex].size() <=
static_cast<std::size_t>(transfer.dsts[0].memIndex)) {
return {ERR_FATAL, "[gfx.xccPrefTable] is too small"};
}
p.preferredXccId = table[transfer.exeDevice.exeIndex][transfer.dsts[0].memIndex];
if (p.preferredXccId < 0 ||
p.preferredXccId >= GetNumExecutorSubIndices(transfer.exeDevice)) {
return {ERR_FATAL,
"[gfx.xccPrefTable] defines out-of-bound XCC index %d",
p.preferredXccId};
}
}
}
// Clear counters
rss.totalDurationMsec = 0.0;
return ERR_NONE;
}
// Prepare each executor
// Allocates memory for src/dst, prepares subexecutors, executor-specific data structures
static ErrResult PrepareExecutor(ConfigOptions const& cfg,
vector<Transfer> const& transfers,
ExeDevice const& exeDevice,
ExeInfo& exeInfo)
{
exeInfo.totalDurationMsec = 0.0;
// Loop over each transfer this executor is involved in
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
rss.numBytes = t.numBytes;
// Allocate source memory
rss.srcMem.resize(t.srcs.size());
for (auto iSrc = std::size_t(0); iSrc < t.srcs.size(); ++iSrc) {
MemDevice const& srcMemDevice = t.srcs[iSrc];
// Ensure executing GPU can access source memory
if (IsGpuExeType(exeDevice.exeType) && IsGpuMemType(srcMemDevice.memType) &&
srcMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex));
}
ERR_CHECK(AllocateMemory(
srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc]));
}
// Allocate destination memory
rss.dstMem.resize(t.dsts.size());
for (auto iDst = std::size_t(0); iDst < t.dsts.size(); ++iDst) {
MemDevice const& dstMemDevice = t.dsts[iDst];
// Ensure executing GPU can access destination memory
if (IsGpuExeType(exeDevice.exeType) && IsGpuMemType(dstMemDevice.memType) &&
dstMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex));
}
ERR_CHECK(AllocateMemory(
dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst]));
}
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
#if !defined(__NVCC__)
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL));
rss.dstAgent = info.agentOwner;
ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
rss.srcAgent = info.agentOwner;
// Create HSA completion signal
ERR_CHECK(hsa_signal_create(1, 0, NULL, &rss.signal));
if (t.exeSubIndex != -1) {
rss.sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
}
#endif
}
// Prepare subexecutor parameters
ERR_CHECK(PrepareSubExecParams(cfg, t, rss));
}
// Prepare additional requirements for GPU-based executors
if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
// Determine how many streams to use
int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA ||
(exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream))
? exeInfo.resources.size()
: 1;
exeInfo.streams.resize(numStreamsToUse);
// Create streams
for (int i = 0; i < numStreamsToUse; ++i) {
if (cfg.gfx.cuMask.size()) {
#if !defined(__NVCC__)
ERR_CHECK(hipExtStreamCreateWithCUMask(
&exeInfo.streams[i], cfg.gfx.cuMask.size(), cfg.gfx.cuMask.data()));
#else
return {ERR_FATAL, "CU Masking in not supported on NVIDIA hardware"};
#endif
} else {
ERR_CHECK(hipStreamCreate(&exeInfo.streams[i]));
}
}
if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
exeInfo.startEvents.resize(numStreamsToUse);
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i) {
ERR_CHECK(hipEventCreate(&exeInfo.startEvents[i]));
ERR_CHECK(hipEventCreate(&exeInfo.stopEvents[i]));
}
}
}
// Prepare for GPU GFX executor
if (exeDevice.exeType == EXE_GPU_GFX) {
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single
// stream
#if !defined(__NVCC__)
MemType memType = MEM_GPU; // AMD hardware can directly access GPU memory from host
#else
    MemType memType = MEM_MANAGED; // NVIDIA hardware requires managed memory to access from host
#endif
ERR_CHECK(AllocateMemory({memType, exeDevice.exeIndex},
exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu));
// Create subexecutor parameter array for entire executor
exeInfo.subExecParamCpu.clear();
exeInfo.numSubIndices = GetNumExecutorSubIndices(exeDevice);
#if defined(__NVCC__)
exeInfo.wallClockRate = 1000000;
#else
ERR_CHECK(hipDeviceGetAttribute(
&exeInfo.wallClockRate, hipDeviceAttributeWallClockRate, exeDevice.exeIndex));
#endif
int transferOffset = 0;
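        // Illustrative example of the threadblock orderings for two Transfers A (3 subexecs)
        // and B (2 subexecs):
        //   blockOrder 0 (sequential; also used for multi-stream): [A0 A1 A2 B0 B1]
        //   blockOrder 1 (interleaved)                           : [A0 B0 A1 B1 A2]
        //   blockOrder 2 (random)                                : a uniform shuffle of all five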
if (cfg.gfx.useMultiStream || cfg.gfx.blockOrder == 0) {
// Threadblocks are ordered sequentially one transfer at a time
for (auto& rss : exeInfo.resources) {
rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : rss.subExecParamCpu) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p);
transferOffset++;
}
}
} else if (cfg.gfx.blockOrder == 1) {
// Interleave threadblocks of different Transfers
for (int subExecIdx = 0;
exeInfo.subExecParamCpu.size() < static_cast<std::size_t>(exeInfo.totalSubExecs);
++subExecIdx) {
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
if (subExecIdx < t.numSubExecs) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[subExecIdx]);
}
}
}
} else if (cfg.gfx.blockOrder == 2) {
// Build randomized threadblock list
std::vector<std::pair<int, int>> indices;
for (auto i = std::size_t(0); i < exeInfo.resources.size(); i++) {
auto const& rss = exeInfo.resources[i];
Transfer const& t = transfers[rss.transferIdx];
for (int j = 0; j < t.numSubExecs; j++) { indices.push_back(std::make_pair(i, j)); }
}
std::random_device rd;
std::mt19937 gen(rd());
std::shuffle(indices.begin(), indices.end(), gen);
// Build randomized threadblock list
for (auto p : indices) {
auto& rss = exeInfo.resources[p.first];
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[p.second]);
}
}
// Copy sub executor parameters to GPU
ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
ERR_CHECK(hipMemcpy(exeInfo.subExecParamGpu,
exeInfo.subExecParamCpu.data(),
exeInfo.totalSubExecs * sizeof(SubExecParam),
hipMemcpyHostToDevice));
ERR_CHECK(hipDeviceSynchronize());
}
// Prepare for NIC-based executors
if (IsNicExeType(exeDevice.exeType)) {
#ifdef NIC_EXEC_ENABLED
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
ERR_CHECK(PrepareNicTransferResources(cfg, exeDevice, t, rss));
}
#else
return {ERR_FATAL, "RDMA executor is not supported"};
#endif
}
return ERR_NONE;
}
// Teardown-related functions
//========================================================================================
// Clean up all resources
static ErrResult TeardownExecutor(ConfigOptions const& cfg,
ExeDevice const& exeDevice,
vector<Transfer> const& transfers,
ExeInfo& exeInfo)
{
// Loop over each transfer this executor is involved in
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
// Deallocate source memory
for (auto iSrc = std::size_t(0); iSrc < t.srcs.size(); ++iSrc) {
ERR_CHECK(DeallocateMemory(
t.srcs[iSrc].memType, rss.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
}
// Deallocate destination memory
for (auto iDst = std::size_t(0); iDst < t.dsts.size(); ++iDst) {
ERR_CHECK(DeallocateMemory(
t.dsts[iDst].memType, rss.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
}
// Destroy HSA signal for DMA executor
#if !defined(__NVCC__)
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
ERR_CHECK(hsa_signal_destroy(rss.signal));
}
#endif
// Destroy NIC related resources
#ifdef NIC_EXEC_ENABLED
if (IsNicExeType(exeDevice.exeType)) { ERR_CHECK(TeardownNicTransferResources(rss)); }
#endif
}
// Teardown additional requirements for GPU-based executors
if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
for (auto stream : exeInfo.streams) { ERR_CHECK(hipStreamDestroy(stream)); }
if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
for (auto event : exeInfo.startEvents) { ERR_CHECK(hipEventDestroy(event)); }
for (auto event : exeInfo.stopEvents) { ERR_CHECK(hipEventDestroy(event)); }
}
}
if (exeDevice.exeType == EXE_GPU_GFX) {
#if !defined(__NVCC__)
MemType memType = MEM_GPU;
#else
MemType memType = MEM_MANAGED;
#endif
ERR_CHECK(DeallocateMemory(
memType, exeInfo.subExecParamGpu, exeInfo.totalSubExecs * sizeof(SubExecParam)));
}
return ERR_NONE;
}
// CPU Executor-related functions
//========================================================================================
// Kernel for CPU execution (run by a single subexecutor)
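// Behavior by src/dst counts (as implemented below): numSrcs == 0 fills every dst via memset;
// numSrcs == 1 with no dsts performs a read-only sum (kept live by the NaN check);
// numSrcs == 1 with dsts is a memcpy per dst; otherwise each element becomes the sum across
// all srcs, written to every dst.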
static void CpuReduceKernel(SubExecParam const& p, int numSubIterations)
{
if (p.N == 0) { return; }
int subIteration = 0;
do {
int const& numSrcs = p.numSrcs;
int const& numDsts = p.numDsts;
if (numSrcs == 0) {
for (int i = 0; i < numDsts; ++i) {
memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
// for (int j = 0; j < p.N; j++) p.dst[i][j] = MEMSET_VAL;
}
} else if (numSrcs == 1) {
float const* __restrict__ src = p.src[0];
if (numDsts == 0) {
float sum = 0.0;
for (auto j = std::size_t(0); j < p.N; j++) { sum += p.src[0][j]; }
// Add a dummy check to ensure the read is not optimized out
                if (sum != sum) { printf("[ERROR] NaN detected\n"); }
} else {
for (int i = 0; i < numDsts; ++i) { memcpy(p.dst[i], src, p.N * sizeof(float)); }
}
} else {
float sum = 0.0f;
for (auto j = std::size_t(0); j < p.N; j++) {
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) { sum += p.src[i][j]; }
for (int i = 0; i < numDsts; i++) { p.dst[i][j] = sum; }
}
}
} while (++subIteration != numSubIterations);
}
// Execution of a single CPU Transfer
static void ExecuteCpuTransfer(int const iteration,
ConfigOptions const& cfg,
[[maybe_unused]] int const exeIndex,
TransferResources& rss)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> childThreads;
for (auto const& subExecParam : rss.subExecParamCpu) {
childThreads.emplace_back(
std::thread(CpuReduceKernel, std::cref(subExecParam), cfg.general.numSubIterations));
}
for (auto& subExecThread : childThreads) { subExecThread.join(); }
childThreads.clear();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() *
1000.0) /
cfg.general.numSubIterations;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { rss.perIterMsec.push_back(deltaMsec); }
}
}
// Execution of a single CPU executor
static ErrResult RunCpuExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
numa_run_on_node(exeIndex);
auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> asyncTransfers;
for (auto& rss : exeInfo.resources) {
asyncTransfers.emplace_back(
std::thread(ExecuteCpuTransfer, iteration, std::cref(cfg), exeIndex, std::ref(rss)));
}
for (auto& asyncTransfer : asyncTransfers) { asyncTransfer.join(); }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() *
1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { exeInfo.totalDurationMsec += deltaMsec; }
return ERR_NONE;
}
#ifdef NIC_EXEC_ENABLED
// Execution of a single NIC Transfer
static ErrResult ExecuteNicTransfer([[maybe_unused]] int const iteration,
                                    [[maybe_unused]] ConfigOptions const& cfg,
                                    [[maybe_unused]] int const exeIndex,
                                    TransferResources& rss)
{
// Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
int error = ibv_post_send(
rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
if (error) {
            return {ERR_FATAL,
                    "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d",
                    rss.transferIdx,
                    qpIndex,
                    error};
}
}
return ERR_NONE;
}
// Execution of a single NIC executor
static ErrResult RunNicExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1) { numa_run_on_node(numaNode); }
}
auto transferCount = exeInfo.resources.size();
std::vector<double> totalTimeMsec(transferCount, 0.0);
int subIterations = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
do {
std::vector<uint8_t> receivedQPs(transferCount, 0);
// post the sends
        for (auto i = std::size_t(0); i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
}
// poll for completions
size_t completedTransfers = 0;
while (completedTransfers < transferCount) {
            for (auto i = std::size_t(0); i < transferCount; i++) {
if (receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i];
// Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is
// dedicated to this Transfer
ibv_wc wc;
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) {
receivedQPs[i]++;
if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL,
"Transfer %d: Received unsuccessful work completion",
rss.transferIdx};
}
} else if (nc < 0) {
return {ERR_FATAL,
"Transfer %d: Received negative work completion",
rss.transferIdx};
}
if (receivedQPs[i] == rss.qpCount) {
auto cpuDelta = std::chrono::high_resolution_clock::now() -
transferTimers[i];
                        double deltaMsec =
                            std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
                                .count() *
                            1000.0;
if (iteration >= 0) { totalTimeMsec[i] += deltaMsec; }
completedTransfers++;
}
}
}
}
} while (++subIterations < cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() *
1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
exeInfo.totalDurationMsec += deltaMsec;
        for (auto i = std::size_t(0); i < transferCount; i++) {
auto& rss = exeInfo.resources[i];
double transferTimeMsec = totalTimeMsec[i] / cfg.general.numSubIterations;
rss.totalDurationMsec += transferTimeMsec;
if (cfg.general.recordPerIteration) { rss.perIterMsec.push_back(transferTimeMsec); }
}
}
return ERR_NONE;
}
#endif
// GFX Executor-related functions
//========================================================================================
// Converts register value to a CU/SM index
static uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
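// Worked example (illustrative, per the CDNA2 HW_ID layout referenced above): a hwId with
// SE_ID = 2 (bits 13-14), SH_ID = 1 (bit 12), and CU_ID = 5 (bits 8-11) decodes to
// (1 << 5) + (5 << 2) + 2 = 54, a unique CU index within the device.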
// Device level timestamp function
__device__ int64_t GetTimestamp()
{
#if defined(__NVCC__)
int64_t result;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
return result;
#else
return wall_clock64();
#endif
}
// Helper function for memset
template<typename T>
__device__ __forceinline__ T MemsetVal();
template<>
__device__ __forceinline__ float MemsetVal()
{
return MEMSET_VAL;
}
template<>
__device__ __forceinline__ float2 MemsetVal()
{
return make_float2(MEMSET_VAL, MEMSET_VAL);
}
template<>
__device__ __forceinline__ float4 MemsetVal()
{
return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL);
}
// Helper function for temporal/non-temporal reads / writes
#define TEMPORAL_NONE 0
#define TEMPORAL_LOAD 1
#define TEMPORAL_STORE 2
#define TEMPORAL_BOTH 3
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float const* src, float& dst)
{
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst = __builtin_nontemporal_load(src);
#endif
} else {
dst = *src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float2 const* src, float2& dst)
{
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
#endif
} else {
dst = *src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float4 const* src, float4& dst)
{
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
dst.z = __builtin_nontemporal_load(&(src->z));
dst.w = __builtin_nontemporal_load(&(src->w));
#endif
} else {
dst = *src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float const& src, float* dst)
{
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src, dst);
#endif
} else {
*dst = src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float2 const& src, float2* dst)
{
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
#endif
} else {
*dst = src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float4 const& src, float4* dst)
{
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
__builtin_nontemporal_store(src.z, &(dst->z));
__builtin_nontemporal_store(src.w, &(dst->w));
#endif
} else {
*dst = src;
}
}
// Kernel for GFX execution
template<typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
int64_t startCycle;
if (threadIdx.x == 0) { startCycle = GetTimestamp(); }
SubExecParam& p = params[blockIdx.y];
// Filter by XCC
#if !defined(__NVCC__)
int32_t xccId;
GetXccId(xccId);
if (p.preferredXccId != -1 && xccId != p.preferredXccId) { return; }
#endif
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) { srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i]; }
for (int i = 0; i < numDsts; i++) { dstFloatPacked[i] = (PACKED_FLOAT*)p.dst[i]; }
// Operate on wavefront granularity
    int32_t const nTeams = p.teamSize;   // Number of threadblocks working together on this subarray
    int32_t const teamIdx = p.teamIdx;   // Index of this threadblock within the team
    int32_t const nWaves = BLOCKSIZE / warpSize;    // Number of wavefronts within this threadblock
    int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
    int32_t const tIdx = threadIdx.x % warpSize;    // Thread index within wavefront
size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT) / sizeof(float));
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder) {
case 0: /* U,W,C */
unrlStride = 1;
waveStride = UNROLL;
teamStride = UNROLL * nWaves;
teamStride2 = nWaves;
waveStride2 = 1;
break;
case 1: /* U,C,W */
unrlStride = 1;
teamStride = UNROLL;
waveStride = UNROLL * nTeams;
teamStride2 = 1;
waveStride2 = nTeams;
break;
case 2: /* W,U,C */
waveStride = 1;
unrlStride = nWaves;
teamStride = nWaves * UNROLL;
teamStride2 = nWaves;
waveStride2 = 1;
break;
case 3: /* W,C,U */
waveStride = 1;
teamStride = nWaves;
unrlStride = nWaves * nTeams;
teamStride2 = nWaves;
waveStride2 = 1;
break;
case 4: /* C,U,W */
teamStride = 1;
unrlStride = nTeams;
waveStride = nTeams * UNROLL;
teamStride2 = 1;
waveStride2 = nTeams;
break;
case 5: /* C,W,U */
teamStride = 1;
waveStride = nTeams;
unrlStride = nTeams * nWaves;
teamStride2 = 1;
waveStride2 = nTeams;
break;
}
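    // Illustrative example: waveOrder 0 ("U,W,C") with nTeams = 2, nWaves = 4, UNROLL = 2
    // gives unrlStride = 1, waveStride = 2, teamStride = 8 (in units of warpSize packed
    // elements), so each wavefront's UNROLL accesses are adjacent, wavefronts within a team
    // tile next, and teams are outermost. The *2 strides serve the same role for the
    // non-unrolled cleanup loops below.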
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{
PACKED_FLOAT val[UNROLL];
PACKED_FLOAT tmp[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) { val[u] = MemsetVal<PACKED_FLOAT>(); }
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx;
idx < loop1Limit;
idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx + u * unrlStride * warpSize],
val[u]);
}
for (int s = 1; s < numSrcs; s++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx + u * unrlStride * warpSize],
tmp[u]);
}
#pragma unroll
for (int u = 0; u < UNROLL; u++) { val[u] += tmp[u]; }
}
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) {
Store<TEMPORAL_MODE>(val[u],
&dstFloatPacked[d][idx + u * unrlStride * warpSize]);
}
}
}
}
// Second loop: Deal with remaining PACKED_FLOAT
{
if (loop1Limit < numPackedFloat) {
PACKED_FLOAT val, tmp;
if (numSrcs == 0) { val = MemsetVal<PACKED_FLOAT>(); }
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit +
(teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numPackedFloat;
idx += loop2Stride) {
if (numSrcs) {
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++) {
Store<TEMPORAL_MODE>(val, &dstFloatPacked[d][idx]);
}
}
}
}
        // Third loop: Deal with remaining floats
{
if (numPackedFloat * (sizeof(PACKED_FLOAT) / sizeof(float)) < p.N) {
float val, tmp;
if (numSrcs == 0) { val = MemsetVal<float>(); }
size_t const loop3Stride = nTeams * nWaves * warpSize;
for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT) / sizeof(float)) +
(teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < p.N;
idx += loop3Stride) {
if (numSrcs) {
Load<TEMPORAL_MODE>(&p.src[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&p.src[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++) { Store<TEMPORAL_MODE>(val, &p.dst[d][idx]); }
}
}
}
if (++subIterations == numSubIterations) { break; }
}
// Wait for all threads to finish
__syncthreads();
if (threadIdx.x == 0) {
__threadfence_system();
p.stopCycle = GetTimestamp();
p.startCycle = startCycle;
GetHwId(p.hwId);
GetXccId(p.xccId);
}
}
#define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
{ \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_STORE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_BOTH> \
}
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
{ \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float2), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float4) \
}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{ \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 2), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 3), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 4), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 5), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 6), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8) \
}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] = {
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
GPU_KERNEL_UNROLL_DECL(192),
GPU_KERNEL_UNROLL_DECL(256),
GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512),
GPU_KERNEL_UNROLL_DECL(576),
GPU_KERNEL_UNROLL_DECL(640),
GPU_KERNEL_UNROLL_DECL(704),
GPU_KERNEL_UNROLL_DECL(768),
GPU_KERNEL_UNROLL_DECL(832),
GPU_KERNEL_UNROLL_DECL(896),
GPU_KERNEL_UNROLL_DECL(960),
GPU_KERNEL_UNROLL_DECL(1024),
};
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL
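// Illustrative lookup (assuming wordSize = 4 selects float4, as in ExecuteGpuTransfer below):
// a launch with blockSize = 256, unrollFactor = 4, and non-temporal loads and stores resolves
// to GpuKernelTable[256 / 64 - 1][4 - 1][2][TEMPORAL_BOTH], i.e.
// GpuReduceKernel<float4, 256, 4, TEMPORAL_BOTH>.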
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration,
hipStream_t const stream,
hipEvent_t const startEvent,
hipEvent_t const stopEvent,
int const xccDim,
ConfigOptions const& cfg,
TransferResources& rss)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
int numSubExecs = rss.subExecParamCpu.size();
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 : cfg.gfx.wordSize == 2 ? 1 : 2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize / 64 - 1][cfg.gfx.unrollFactor - 1]
[wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (startEvent != NULL) { ERR_CHECK(hipEventRecord(startEvent, stream)); }
gpuKernel<<<gridSize, blockSize, 0, stream>>>(
rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL) { ERR_CHECK(hipEventRecord(stopEvent, stream)); }
#else
hipExtLaunchKernelGGL(gpuKernel,
gridSize,
blockSize,
0,
stream,
startEvent,
stopEvent,
0,
rss.subExecParamGpuPtr,
cfg.gfx.waveOrder,
cfg.general.numSubIterations);
#endif
ERR_CHECK(hipStreamSynchronize(stream));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (startEvent != NULL) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec);
std::set<std::pair<int, int>> CUs;
for (int i = 0; i < numSubExecs; i++) {
CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
GetId(rss.subExecParamGpuPtr[i].hwId)));
}
rss.perIterCUs.push_back(CUs);
}
}
return ERR_NONE;
}
// Execute a single GPU executor
static ErrResult RunGpuExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
ERR_CHECK(hipSetDevice(exeIndex));
int xccDim = exeInfo.useSubIndices ? exeInfo.numSubIndices : 1;
if (cfg.gfx.useMultiStream) {
// Launch each Transfer separately in its own stream
vector<std::future<ErrResult>> asyncTransfers;
for (auto i = std::size_t(0); i < exeInfo.streams.size(); i++) {
asyncTransfers.emplace_back(
std::async(std::launch::async,
ExecuteGpuTransfer,
iteration,
exeInfo.streams[i],
cfg.gfx.useHipEvents ? exeInfo.startEvents[i] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[i] : NULL,
xccDim,
std::cref(cfg),
std::ref(exeInfo.resources[i])));
}
for (auto& asyncTransfer : asyncTransfers) { ERR_CHECK(asyncTransfer.get()); }
} else {
// Combine all the Transfers into a single kernel launch
int numSubExecs = exeInfo.totalSubExecs;
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0];
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 : cfg.gfx.wordSize == 2 ? 1 : 2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize / 64 - 1][cfg.gfx.unrollFactor - 1]
[wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents) { ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream)); }
gpuKernel<<<gridSize, blockSize, 0, stream>>>(
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents) { ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream)); }
#else
hipExtLaunchKernelGGL(gpuKernel,
gridSize,
blockSize,
0,
stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL,
0,
exeInfo.subExecParamGpu,
cfg.gfx.waveOrder,
cfg.general.numSubIterations);
#endif
ERR_CHECK(hipStreamSynchronize(stream));
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
float gpuDeltaMsec;
ERR_CHECK(
hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
gpuDeltaMsec /= cfg.general.numSubIterations;
exeInfo.totalDurationMsec += gpuDeltaMsec;
} else {
exeInfo.totalDurationMsec += cpuDeltaMsec;
}
// Determine timing for each of the individual transfers that were part of this launch
if (!cfg.gfx.useMultiStream) {
for (auto i = std::size_t(0); i < exeInfo.resources.size(); i++) {
TransferResources& rss = exeInfo.resources[i];
long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<std::pair<int, int>> CUs;
for (auto subExecIdx : rss.subExecIdx) {
minStartCycle = std::min(minStartCycle,
exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle,
exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (cfg.general.recordPerIteration) {
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
}
}
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
deltaMsec /= cfg.general.numSubIterations;
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec);
rss.perIterCUs.push_back(CUs);
}
}
}
}
return ERR_NONE;
}
// DMA Executor-related functions
//========================================================================================
// Execute a single DMA Transfer
static ErrResult ExecuteDmaTransfer(int const iteration,
bool const useSubIndices,
hipStream_t const stream,
hipEvent_t const startEvent,
hipEvent_t const stopEvent,
ConfigOptions const& cfg,
TransferResources& resources)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
int subIterations = 0;
if (!useSubIndices && !cfg.dma.useHsaCopy) {
if (cfg.dma.useHipEvents) { ERR_CHECK(hipEventRecord(startEvent, stream)); }
// Use hipMemcpy
do {
ERR_CHECK(hipMemcpyAsync(resources.dstMem[0],
resources.srcMem[0],
resources.numBytes,
hipMemcpyDefault,
stream));
} while (++subIterations != cfg.general.numSubIterations);
if (cfg.dma.useHipEvents) { ERR_CHECK(hipEventRecord(stopEvent, stream)); }
ERR_CHECK(hipStreamSynchronize(stream));
} else {
#if defined(__NVCC__)
return {ERR_FATAL, "HSA copy not supported on NVIDIA hardware"};
#else
// Use HSA async copy
do {
hsa_signal_store_screlease(resources.signal, 1);
if (!useSubIndices) {
ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0],
resources.dstAgent,
resources.srcMem[0],
resources.srcAgent,
resources.numBytes,
0,
NULL,
resources.signal));
} else {
HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0],
resources.dstAgent,
resources.srcMem[0],
resources.srcAgent,
resources.numBytes,
0,
NULL,
resources.signal,
resources.sdmaEngineId,
true));
}
// Wait for SDMA transfer to complete
while (hsa_signal_wait_scacquire(resources.signal,
HSA_SIGNAL_CONDITION_LT,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations != cfg.general.numSubIterations);
#endif
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { resources.perIterMsec.push_back(deltaMsec); }
}
return ERR_NONE;
}
// Execute a single DMA executor
static ErrResult RunDmaExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
ERR_CHECK(hipSetDevice(exeIndex));
vector<std::future<ErrResult>> asyncTransfers;
for (auto i = std::size_t(0); i < exeInfo.resources.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteDmaTransfer,
iteration,
exeInfo.useSubIndices,
exeInfo.streams[i],
cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
cfg.dma.useHipEvents ? exeInfo.stopEvents[i] : NULL,
std::cref(cfg),
std::ref(exeInfo.resources[i])));
}
for (auto& asyncTransfer : asyncTransfers) { ERR_CHECK(asyncTransfer.get()); }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { exeInfo.totalDurationMsec += deltaMsec; }
return ERR_NONE;
}
// Executor-related functions
//========================================================================================
static ErrResult RunExecutor(int const iteration,
ConfigOptions const& cfg,
ExeDevice const& exeDevice,
ExeInfo& exeInfo)
{
switch (exeDevice.exeType) {
case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
#ifdef NIC_EXEC_ENABLED
case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
#endif
default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
}
}
} // End of anonymous namespace
//========================================================================================
/// @endcond
ErrResult::ErrResult(ErrType err) : errType(err), errMsg("") {}
ErrResult::ErrResult(hipError_t err)
{
if (err == hipSuccess) {
this->errType = ERR_NONE;
this->errMsg = "";
} else {
this->errType = ERR_FATAL;
this->errMsg = std::string("HIP Error: ") + hipGetErrorString(err);
}
}
#if !defined(__NVCC__)
ErrResult::ErrResult(hsa_status_t err)
{
if (err == HSA_STATUS_SUCCESS) {
this->errType = ERR_NONE;
this->errMsg = "";
} else {
const char* errString = NULL;
hsa_status_string(err, &errString);
this->errType = ERR_FATAL;
this->errMsg = std::string("HSA Error: ") + errString;
}
}
#endif
ErrResult::ErrResult(ErrType errType, const char* format, ...)
{
this->errType = errType;
va_list args, args_temp;
va_start(args, format);
va_copy(args_temp, args);
int len = vsnprintf(nullptr, 0, format, args);
if (len < 0) {
va_end(args_temp);
va_end(args);
} else {
this->errMsg.resize(len);
vsnprintf(this->errMsg.data(), len + 1, format, args_temp);
}
va_end(args_temp);
va_end(args);
}
bool RunTransfers(ConfigOptions const& cfg,
std::vector<Transfer> const& transfers,
TestResults& results)
{
    // Clear all errors
auto& errResults = results.errResults;
errResults.clear();
// Check for valid configuration
if (ConfigOptionsHaveErrors(cfg, errResults)) { return false; }
// Check for valid transfers
if (TransfersHaveErrors(cfg, transfers, errResults)) { return false; }
// Collect up transfers by executor
int minNumSrcs = MAX_SRCS + 1;
int maxNumSrcs = 0;
size_t maxNumBytes = 0;
std::map<ExeDevice, ExeInfo> executorMap;
for (auto i = std::size_t(0); i < transfers.size(); i++) {
Transfer const& t = transfers[i];
ExeDevice exeDevice;
ERR_APPEND(GetActualExecutor(cfg, t.exeDevice, exeDevice), errResults);
TransferResources resource = {};
resource.transferIdx = i;
ExeInfo& exeInfo = executorMap[exeDevice];
exeInfo.totalBytes += t.numBytes;
exeInfo.totalSubExecs += t.numSubExecs;
exeInfo.useSubIndices |= (t.exeSubIndex != -1 || (t.exeDevice.exeType == EXE_GPU_GFX &&
!cfg.gfx.prefXccTable.empty()));
exeInfo.resources.push_back(resource);
minNumSrcs = std::min(minNumSrcs, (int)t.srcs.size());
maxNumSrcs = std::max(maxNumSrcs, (int)t.srcs.size());
maxNumBytes = std::max(maxNumBytes, t.numBytes);
}
// Loop over each executor and prepare
// - Allocates memory for each Transfer
// - Set up work for subexecutors
vector<TransferResources*> transferResources;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
ERR_APPEND(PrepareExecutor(cfg, transfers, exeDevice, exeInfo), errResults);
for (auto& resource : exeInfo.resources) { transferResources.push_back(&resource); }
}
// Prepare reference src/dst arrays - only once for largest size
size_t maxN = maxNumBytes / sizeof(float);
vector<float> outputBuffer(maxN);
vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
{
size_t initOffset = cfg.data.byteOffset / sizeof(float);
vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes);
for (int numSrcs = 0; numSrcs < maxNumSrcs; numSrcs++) {
PrepareReference(cfg, srcReference[numSrcs], numSrcs);
for (auto i = std::size_t(0); i < maxN; i++) {
dstReference[numSrcs + 1][i] = (numSrcs == 0 ? 0 : dstReference[numSrcs][i]) +
srcReference[numSrcs][i];
}
}
        // Release unused partial sums
for (int numSrcs = 0; numSrcs < minNumSrcs; numSrcs++) { dstReference[numSrcs].clear(); }
// Initialize all src memory buffers
for (auto resource : transferResources) {
for (auto srcIdx = std::size_t(0); srcIdx < resource->srcMem.size(); srcIdx++) {
ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset,
srcReference[srcIdx].data(),
resource->numBytes,
hipMemcpyDefault),
errResults);
}
}
}
    // Pause before starting when running in interactive mode
if (cfg.general.useInteractive) {
printf("Memory prepared:\n");
for (auto i = std::size_t(0); i < transfers.size(); i++) {
ExeInfo const& exeInfo = executorMap[transfers[i].exeDevice];
printf("Transfer %03zu:\n", i);
for (auto iSrc = std::size_t(0); iSrc < transfers[i].srcs.size(); ++iSrc) {
printf(" SRC %0zu: %p\n",
iSrc,
static_cast<void*>(transferResources[i]->srcMem[iSrc]));
}
for (auto iDst = std::size_t(0); iDst < transfers[i].dsts.size(); ++iDst) {
printf(" DST %0zu: %p\n",
iDst,
static_cast<void*>(transferResources[i]->dstMem[iDst]));
}
}
printf("Hit <Enter> to continue: ");
        if (scanf("%*c") != 0) {
            printf("[ERROR] Unexpected input\n");
            return false;
        }
printf("\n");
}
// Perform iterations
size_t numTimedIterations = 0;
double totalCpuTimeSec = 0.0;
for (int iteration = -cfg.general.numWarmups;; iteration++) {
// Stop if number of iterations/seconds has reached limit
if (cfg.general.numIterations > 0 && iteration >= cfg.general.numIterations) { break; }
if (cfg.general.numIterations < 0 && totalCpuTimeSec > -cfg.general.numIterations) {
break;
}
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
std::vector<std::future<ErrResult>> asyncExecutors;
for (auto& exeInfoPair : executorMap) {
asyncExecutors.emplace_back(std::async(std::launch::async,
RunExecutor,
iteration,
std::cref(cfg),
std::cref(exeInfoPair.first),
std::ref(exeInfoPair.second)));
}
// Wait for all threads to finish
for (auto& asyncExecutor : asyncExecutors) { ERR_APPEND(asyncExecutor.get(), errResults); }
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() /
cfg.general.numSubIterations;
if (cfg.data.alwaysValidate) {
ERR_APPEND(
ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
errResults);
}
if (iteration >= 0) {
++numTimedIterations;
totalCpuTimeSec += deltaSec;
}
}
// Pause for interactive mode
if (cfg.general.useInteractive) {
printf("Transfers complete. Hit <Enter> to continue: ");
        if (scanf("%*c") != 0) {
            printf("[ERROR] Unexpected input\n");
            return false;
        }
printf("\n");
}
// Validate results
if (!cfg.data.alwaysValidate) {
ERR_APPEND(
ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
errResults);
}
// Prepare results
results.exeResults.clear();
results.tfrResults.clear();
results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
results.overheadMsec = results.avgTotalDurationMsec;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
// Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear();
results.totalBytesTransferred += exeInfo.totalBytes;
results.overheadMsec = std::min(results.overheadMsec,
(results.avgTotalDurationMsec - exeResult.avgDurationMsec));
// Copy over transfer results
for (auto const& rss : exeInfo.resources) {
int const transferIdx = rss.transferIdx;
exeResult.transferIdx.push_back(transferIdx);
TransferResult& tfrResult = results.tfrResults[transferIdx];
tfrResult.exeDevice = exeDevice;
#ifdef NIC_EXEC_ENABLED
tfrResult.exeDstDevice = {exeDevice.exeType, rss.dstNicIndex};
#else
tfrResult.exeDstDevice = exeDevice;
#endif
tfrResult.numBytes = rss.numBytes;
tfrResult.avgDurationMsec = rss.totalDurationMsec / numTimedIterations;
tfrResult.avgBandwidthGbPerSec = (rss.numBytes / 1.0e6) / tfrResult.avgDurationMsec;
if (cfg.general.recordPerIteration) {
tfrResult.perIterMsec = rss.perIterMsec;
tfrResult.perIterCUs = rss.perIterCUs;
}
exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec;
}
}
results.avgTotalBandwidthGbPerSec = (results.totalBytesTransferred / 1.0e6) /
results.avgTotalDurationMsec;
// Teardown executors
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
ERR_APPEND(TeardownExecutor(cfg, exeDevice, transfers, exeInfo), errResults);
}
return true;
}
int GetIntAttribute(IntAttribute attribute)
{
switch (attribute) {
case ATR_GFX_MAX_BLOCKSIZE: return MAX_BLOCKSIZE;
case ATR_GFX_MAX_UNROLL: return MAX_UNROLL;
default: return -1;
}
}
std::string GetStrAttribute(StrAttribute attribute)
{
switch (attribute) {
case ATR_SRC_PREP_DESCRIPTION:
return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
default: return "";
}
}
ErrResult ParseTransfers(std::string line, std::vector<Transfer>& transfers)
{
    // Replace round brackets, colons, and '->' with spaces
    // (start at index 1 so a leading '-' on the transfer count is preserved)
for (int i = 1; line[i]; i++) {
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == ':' ||
line[i] == '>') {
line[i] = ' ';
}
}
transfers.clear();
// Read in number of transfers
int numTransfers = 0;
std::istringstream iss(line);
iss >> numTransfers;
if (iss.fail()) { return ERR_NONE; }
// If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const advancedMode = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numSubExecs;
std::string srcStr, exeStr, dstStr, numBytesToken;
if (!advancedMode) {
iss >> numSubExecs;
if (numSubExecs < 0 || iss.fail()) {
return {ERR_FATAL,
"Parsing error: Number of blocks to use (%d) must be non-negative",
numSubExecs};
}
}
for (int i = 0; i < numTransfers; i++) {
Transfer transfer;
if (!advancedMode) {
iss >> srcStr >> exeStr >> dstStr;
transfer.numSubExecs = numSubExecs;
if (iss.fail()) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet",
i + 1};
}
transfer.numBytes = 0;
} else {
iss >> srcStr >> exeStr >> dstStr >> transfer.numSubExecs >> numBytesToken;
if (iss.fail()) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST $CU #Bytes) "
"tuple",
i + 1};
}
if (sscanf(numBytesToken.c_str(), "%lu", &transfer.numBytes) != 1) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) "
"tuple",
i + 1};
}
char units = numBytesToken.back();
// Apply binary suffix multipliers; cases intentionally fall through (G -> M -> K)
switch (toupper(units)) {
case 'G': transfer.numBytes *= 1024; [[fallthrough]];
case 'M': transfer.numBytes *= 1024; [[fallthrough]];
case 'K': transfer.numBytes *= 1024;
}
}
ERR_CHECK(ParseMemType(srcStr, transfer.srcs));
ERR_CHECK(ParseMemType(dstStr, transfer.dsts));
ERR_CHECK(ParseExeType(exeStr, transfer.exeDevice, transfer.exeSubIndex));
transfers.push_back(transfer);
}
return ERR_NONE;
}
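// Minimal usage sketch for ParseTransfers (hypothetical input lines, assuming the
// usual "C<n>"/"G<n>" memory tokens accepted by ParseMemType):
//
//   std::vector<Transfer> transfers;
//   // Simple mode: 1 Transfer using 4 subexecutors, reading from CPU 0 via
//   // GPU 0 and writing to GPU 1 (numBytes is filled in later by the caller)
//   ParseTransfers("1 4 (C0->G0->G1)", transfers);
//   // Advanced mode (negative count): explicit #CUs and #Bytes per Transfer
//   ParseTransfers("-1 (C0->G0->G1) 8 64M", transfers); // 64 MiB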
auto GetTransferBenchVersion() -> const std::string
{
auto tb_version = std::string(TB_HEADER_VERSION);
if (tb_version.empty()) { tb_version = std::string(TB_UNKNOWN_VERSION); }
return tb_version;
}
auto GetTransferBenchBranch() -> const std::string
{
auto tb_branch = std::string(TB_GIT_BRANCH);
if (tb_branch.empty()) { tb_branch = std::string(TB_UNKNOWN_VERSION); }
return tb_branch;
}
auto GetTransferBenchCommitHash([[maybe_unused]] bool is_long_commit) -> const std::string
{
constexpr auto TB_GIT_SHORT_COMMIT_SIZE = std::size_t(8);
auto tb_commit = std::string(TB_GIT_COMMIT);
if (tb_commit.empty()) {
tb_commit = std::string(TB_UNKNOWN_VERSION);
return tb_commit;
}
if (!is_long_commit && tb_commit.length() >= TB_GIT_SHORT_COMMIT_SIZE) {
tb_commit = tb_commit.substr(0, (TB_GIT_SHORT_COMMIT_SIZE - 1));
}
return tb_commit;
}
int GetNumExecutors(ExeType exeType)
{
switch (exeType) {
case EXE_CPU: return numa_num_configured_nodes();
case EXE_GPU_GFX:
case EXE_GPU_DMA: {
int numDetectedGpus = 0;
hipError_t status = hipGetDeviceCount(&numDetectedGpus);
if (status != hipSuccess) { numDetectedGpus = 0; }
return numDetectedGpus;
}
#ifdef NIC_EXEC_ENABLED
case EXE_NIC:
case EXE_NIC_NEAREST: {
return GetIbvDeviceList().size();
}
#endif
default: return 0;
}
}
int GetNumSubExecutors(ExeDevice exeDevice)
{
int const& exeIndex = exeDevice.exeIndex;
switch (exeDevice.exeType) {
case EXE_CPU: {
int numCores = 0;
for (int i = 0; i < numa_num_configured_cpus(); i++) {
if (numa_node_of_cpu(i) == exeIndex) { numCores++; }
}
return numCores;
}
case EXE_GPU_GFX: {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (exeIndex < 0 || numGpus <= exeIndex) { return 0; }
int numDeviceCUs = 0;
hipError_t status = hipDeviceGetAttribute(
&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex);
if (status != hipSuccess) { numDeviceCUs = 0; }
return numDeviceCUs;
}
case EXE_GPU_DMA: {
return 1;
}
default: return 0;
}
}
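// Usage sketch (illustrative): enumerate GFX executors and their subexecutor
// (CU) counts via the queries above.
//
//   int numGpus = GetNumExecutors(EXE_GPU_GFX);
//   for (int i = 0; i < numGpus; i++) {
//     printf("GPU %02d: %d CUs\n", i, GetNumSubExecutors({EXE_GPU_GFX, i}));
//   }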
int GetNumExecutorSubIndices(ExeDevice exeDevice)
{
// Executor subindices are not supported on NVIDIA hardware
#if defined(__NVCC__)
return 0;
#else
int const& exeIndex = exeDevice.exeIndex;
switch (exeDevice.exeType) {
case EXE_CPU: return 0;
case EXE_GPU_GFX: {
hsa_agent_t agent;
ErrResult err = GetHsaAgent(exeDevice, agent);
if (err.errType != ERR_NONE) { return 0; }
int numXccs = 1;
if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &numXccs) !=
HSA_STATUS_SUCCESS) {
return 1;
}
return numXccs;
}
case EXE_GPU_DMA: {
std::set<int> engineIds;
ErrResult err;
// Get HSA agent for this GPU
hsa_agent_t agent;
err = GetHsaAgent(exeDevice, agent);
if (err.errType != ERR_NONE) { return 0; }
int numTotalEngines = 0, numEnginesA = 0, numEnginesB = 0;
if (hsa_agent_get_info(agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_ENG,
&numEnginesA) == HSA_STATUS_SUCCESS) {
numTotalEngines += numEnginesA;
}
if (hsa_agent_get_info(agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG,
&numEnginesB) == HSA_STATUS_SUCCESS) {
numTotalEngines += numEnginesB;
}
return numTotalEngines;
}
default: return 0;
}
#endif
}
int GetClosestCpuNumaToGpu(int gpuIndex)
{
// Closest NUMA is not supported on NVIDIA hardware at this time
#if defined(__NVCC__)
return -1;
#else
hsa_agent_t gpuAgent;
ErrResult err = GetHsaAgent({EXE_GPU_GFX, gpuIndex}, gpuAgent);
if (err.errType != ERR_NONE) { return -1; }
hsa_agent_t closestCpuAgent;
if (hsa_agent_get_info(gpuAgent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&closestCpuAgent) == HSA_STATUS_SUCCESS) {
int numCpus = GetNumExecutors(EXE_CPU);
for (int i = 0; i < numCpus; i++) {
hsa_agent_t cpuAgent;
err = GetHsaAgent({EXE_CPU, i}, cpuAgent);
if (err.errType != ERR_NONE) { return -1; }
if (cpuAgent.handle == closestCpuAgent.handle) { return i; }
}
}
return -1;
#endif
}
int GetClosestCpuNumaToNic([[maybe_unused]] int nicIndex)
{
#ifdef NIC_EXEC_ENABLED
int numNics = GetNumExecutors(EXE_NIC);
if (nicIndex < 0 || nicIndex >= numNics) { return -1; }
return GetIbvDeviceList()[nicIndex].numaNode;
#else
return -1;
#endif
}
int GetClosestNicToGpu([[maybe_unused]] int gpuIndex)
{
#ifdef NIC_EXEC_ENABLED
static bool isInitialized = false;
static std::vector<int> closestNicId;
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (gpuIndex < 0 || gpuIndex >= numGpus) { return -1; }
// Build closest NICs per GPU on first use
if (!isInitialized) {
closestNicId.resize(numGpus, -1);
// Build up list of NIC bus addresses
std::vector<std::string> ibvAddressList;
auto const& ibvDeviceList = GetIbvDeviceList();
for (auto const& ibvDevice : ibvDeviceList) {
ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : "");
}
// Track how many times a device has been assigned as "closest"
// This distributes work across devices that use multiple ports (sharing the same busId)
// NOTE: This greedy assignment isn't necessarily optimal, but is likely to work in most
// multi-port cases. Counter-example:
//
// G0 prefers (N0,N1), picks N0
// G1 prefers (N1,N2), picks N1
// G2 prefers N0, picks N0
//
// instead of the optimal G0->N1, G1->N2, G2->N0
std::vector<int> assignedCount(ibvDeviceList.size(), 0);
// Loop over each GPU to find the closest NIC(s) based on PCIe address
for (int i = 0; i < numGpus; i++) {
// Collect PCIe address for the GPU
char hipPciBusId[64];
hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i);
if (err != hipSuccess) {
#ifdef VERBS_DEBUG
printf(
"Failed to get PCI Bus ID for HIP device %d: %s\n", i, hipGetErrorString(err));
#endif
closestNicId[i] = -1;
continue;
}
// Find closest NICs
std::set<int> closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList);
// Pick the least-used NIC to assign as closest
int closestIdx = -1;
for (auto idx : closestNicIdxs) {
if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx]) {
closestIdx = idx;
}
}
// Fall back to PCIe bus-ID distance to determine the closest NIC
// if the PCIe tree approach fails
if (closestIdx < 0) {
#ifdef VERBS_DEBUG
printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
#endif
int minDistance = std::numeric_limits<int>::max();
for (int j = 0; j < ibvDeviceList.size(); j++) {
if (ibvDeviceList[j].busId != "") {
int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[j].busId);
if (distance < minDistance && distance >= 0) {
minDistance = distance;
closestIdx = j;
}
}
}
}
closestNicId[i] = closestIdx;
if (closestIdx != -1) { assignedCount[closestIdx]++; }
}
isInitialized = true;
}
return closestNicId[gpuIndex];
#else
return -1;
#endif
}
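// Usage sketch (illustrative): report the closest NIC chosen for each GPU.
//
//   for (int g = 0; g < GetNumExecutors(EXE_GPU_GFX); g++) {
//     printf("GPU %02d -> NIC %d\n", g, GetClosestNicToGpu(g)); // -1 if none found
//   }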
// Undefine CUDA compatibility macros
#if defined(__NVCC__)
// clang-format off
// ROCm specific
#undef wall_clock64
#undef gcnArchName
// Datatypes
#undef hipDeviceProp_t
#undef hipError_t
#undef hipEvent_t
#undef hipStream_t
// Enumerations
#undef hipDeviceAttributeClockRate
#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
#undef hipDeviceAttributeMultiprocessorCount
#undef hipErrorPeerAccessAlreadyEnabled
#undef hipFuncCachePreferShared
#undef hipMemcpyDefault
#undef hipMemcpyDeviceToHost
#undef hipMemcpyHostToDevice
#undef hipSuccess
// Functions
#undef hipDeviceCanAccessPeer
#undef hipDeviceEnablePeerAccess
#undef hipDeviceGetAttribute
#undef hipDeviceGetPCIBusId
#undef hipDeviceSetCacheConfig
#undef hipDeviceSynchronize
#undef hipEventCreate
#undef hipEventDestroy
#undef hipEventElapsedTime
#undef hipEventRecord
#undef hipFree
#undef hipGetDeviceCount
#undef hipGetDeviceProperties
#undef hipGetErrorString
#undef hipHostFree
#undef hipHostMalloc
#undef hipMalloc
#undef hipMallocManaged
#undef hipMemcpy
#undef hipMemcpyAsync
#undef hipMemset
#undef hipMemsetAsync
#undef hipSetDevice
#undef hipStreamCreate
#undef hipStreamDestroy
#undef hipStreamSynchronize
#endif
// Kernel macros
#undef GetHwId
#undef GetXccId
// Undefine helper macros
#undef ERR_CHECK
#undef ERR_APPEND
// clang-format on
} // namespace TransferBench
/*
* SPDX-License-Identifier: MIT License
*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
/**
* Note: This file is used by shared/static library builds.
* Implementation that cannot live in the header (e.g., non-inline functions,
* non-template code, or code that must not be exposed to users) can be
* placed here.
*/
#define TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS
#include <TransferBench.hpp>
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <fstream>
int main(int argc, char **argv) {
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
// Apply binary suffix multipliers; cases intentionally fall through (G -> M -> K)
switch (units) {
case 'G': case 'g': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'M': case 'm': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'K': case 'k': numBytesPerTransfer *= 1024;
}
}
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) exit(0);
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++)
line += std::string(argv[i]) + " ";
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line))
lines.push_back(line);
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const &line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s\n", line.c_str());
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) continue;
// Check for variable sub-executors Transfers
int numVariableTransfers = 0;
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf("[ERROR] Variable number of subexecutors is only supported on GFX executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = std::max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != (int)transfers.size()) {
printf("[ERROR] Either all or none of the Transfers in a Test must use a variable number of subexecutors\n");
exit(1);
}
}
// Track which transfers have already numBytes specified
std::vector<bool> bytesSpecified(transfers.size());
bool hasUnspecified = false;
for (int i = 0; i < transfers.size(); i++) {
bytesSpecified[i] = (transfers[i].numBytes != 0);
if (transfers[i].numBytes == 0) hasUnspecified = true;
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (int i = 0; i < transfers.size(); i++) {
if (!bytesSpecified[i])
transfers[i].numBytes = currBytes;
}
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) / maxVarCount;
}
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec; numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) t.numSubExecs = numSubExecs;
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec > bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
}
}
}
void DisplayUsage(char const* cmdName)
{
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("========================================\n");
if (numa_available() == -1) {
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices) {
if (memDevices.empty()) return "N";
std::stringstream ss;
for (auto const& m : memDevices)
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
return ss.str();
}
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
// Loop over each Transfer handled by this executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %c%03d%s:%03d -> %s\n",
idx, sep,
r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep,
r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(),
TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
// Check that per-iteration information exists
if (r.perIterMsec.size() != numTimedIterations) {
printf("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
numTimedIterations, r.perIterMsec.size());
exit(1);
}
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) {
printf(" CUs:");
for (auto x : r.perIterCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred,
sep, results.overheadMsec);
}
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN:
printf("[WARN] %s\n", error.errMsg.c_str());
return;
case ERR_FATAL:
printf("[ERROR] %s\n", error.errMsg.c_str());
exit(1);
default:
break;
}
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) exit(1);
}
/*
Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ENVVARS_HPP
#define ENVVARS_HPP
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
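// Usage sketch (illustrative): wrap any HIP runtime call returning hipError_t, e.g.
//   HIP_CALL(hipSetDevice(0));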
#include <algorithm>
#include <iostream>
#include <numa.h>
#include <random>
#include <time.h>
#include "Client.hpp"
#include "TransferBench.hpp"
using namespace TransferBench;
// Redefinitions for CUDA compatibility
//==========================================================================================
#if defined(__NVCC__)
#define hipError_t cudaError_t
#define hipGetErrorString cudaGetErrorString
#define hipDeviceProp_t cudaDeviceProp
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipSuccess cudaSuccess
#define gcnArchName name
#define hipGetDeviceCount cudaGetDeviceCount
#endif
// This class manages environment variables that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_SAMPLING_FACTOR = 1;
// Environment variables
// General options
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop
// Data options
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
vector<float> fillPattern; // Pattern of floats used to fill source data
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
int validateSource; // Validate source GPU memory immediately after preparation
// DMA options
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
// GFX options
int gfxBlockOrder; // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering
int gfxWordSize; // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)
// Client options
int hideEnv; // Skip printing environment variables
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
// NIC options
int ibGidIndex; // GID Index for RoCE NICs
int roceVersion; // RoCE version number
int ipAddressFamily; // IP address family
uint8_t ibPort; // NIC port number to be used
int nicRelaxedOrder; // Use relaxed ordering for RDMA
std::string closestNicStr; // Holds the user-specified list of closest NICs
// Developer features
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Constructor that collects values
EnvVars()
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numDeviceCUs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0});
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, 0));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
// Different hardware prefers different GPU kernel unroll factors
// The performance difference is generally only noticeable when executing with fewer CUs
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numIterations = GetEnvVar("NUM_ITERATIONS" , 10);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , 3);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useHipEvents = GetEnvVar("USE_HIP_EVENTS" , 1);
useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 1);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
validateSource = GetEnvVar("VALIDATE_SOURCE" , 0);
ibGidIndex = GetEnvVar("IB_GID_INDEX" ,-1);
ibPort = GetEnvVar("IB_PORT_NUMBER" , 1);
roceVersion = GetEnvVar("ROCE_VERSION" , 2);
ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY" , 4);
nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER" , 1);
closestNicStr = GetEnvVar("CLOSEST_NIC" , "");
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) {
int patternLen = strlen(pattern);
if (patternLen % 2) {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n");
exit(1);
}
// Read in bytes
std::vector<unsigned char> bytes;
unsigned char val = 0;
for (int i = 0; i < patternLen; i++) {
if ('0' <= pattern[i] && pattern[i] <= '9')
val += (pattern[i] - '0');
else if ('A' <= pattern[i] && pattern[i] <= 'F')
val += (pattern[i] - 'A' + 10);
else if ('a' <= pattern[i] && pattern[i] <= 'f')
val += (pattern[i] - 'a' + 10);
else {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]);
exit(1);
}
if (i % 2 == 0)
val <<= 4;
else {
bytes.push_back(val);
val = 0;
}
}
// Reverse bytes (input is assumed to be given in big-endian)
std::reverse(bytes.begin(), bytes.end());
// Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
int copies;
switch (patternLen % 8) {
case 0: copies = 1; break;
case 4: copies = 2; break;
default: copies = 4; break;
}
// Fill floats
int numFloats = copies * patternLen / 8;
fillPattern.resize(numFloats);
unsigned char* rawData = (unsigned char*) fillPattern.data();
for (int i = 0; i < numFloats * 4; i++)
rawData[i] = bytes[i % bytes.size()];
}
else fillPattern.clear();
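// Worked example (illustrative): FILL_PATTERN=DEADBEEF parses to the bytes
// {0xDE, 0xAD, 0xBE, 0xEF}, which are reversed (big-endian input) to
// {0xEF, 0xBE, 0xAD, 0xDE}; 8 hex digits fill exactly one 4-byte float, so on a
// little-endian host fillPattern holds one float whose raw bits are 0xDEADBEEF.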
// Check for CU mask
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL) {
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
} else if (sscanf(token, "%d", &start) == 1) {
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(2 * numXccs, 0);
for (auto range : ranges) {
for (int i = range.first; i <= range.second; i++) {
for (int x = 0; x < numXccs; x++) {
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
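// Worked example (illustrative): CU_MASK=0-1 on a device with 2 XCCs selects
// CUs 0 and 1 on both XCCs, i.e. target bits 0..3, so cuMask[0] = 0xF and the
// remaining mask words stay 0.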
// Parse preferred XCC table (if provided)
char* prefXccStr = getenv("XCC_PREF_TABLE");
if (prefXccStr) {
prefXccTable.resize(numDetectedGpus);
for (int i = 0; i < numDetectedGpus; i++){
prefXccTable[i].resize(numDetectedGpus, -1);
}
char* token = strtok(prefXccStr, ",");
int tokenCount = 0;
while (token) {
int xccId;
if (sscanf(token, "%d", &xccId) == 1) {
int src = tokenCount / numDetectedGpus;
int dst = tokenCount % numDetectedGpus;
if (xccId < 0 || xccId >= numXccs) {
printf("[ERROR] XCC index (%d) out of bounds. Expect value less than %d\n", xccId, numXccs);
exit(1);
}
prefXccTable[src][dst] = xccId;
tokenCount++;
if (tokenCount == (numDetectedGpus * numDetectedGpus)) break;
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
}
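// Worked example (illustrative): with 2 detected GPUs, XCC_PREF_TABLE=0,1,1,0
// yields prefXccTable = {{0, 1}, {1, 0}}, where row = source GPU,
// column = destination GPU, and the value is the preferred XCC.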
}
static std::string ToStr(std::vector<int> const& values) {
std::string result = "";
bool isFirst = true;
for (int v : values) {
if (isFirst) isFirst = false;
else result += ",";
result += std::to_string(v);
}
return result;
}
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
#if NIC_EXEC_ENABLED
printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n");
#endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
#if NIC_EXEC_ENABLED
printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n");
printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n");
printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n");
#endif
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
#if NIC_EXEC_ENABLED
printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering");
#endif
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
#if NIC_EXEC_ENABLED
printf(" ROCE_VERSION - RoCE version (default=2)\n");
#endif
printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n");
}
void Print(std::string const& name, int32_t const value, const char* format, ...) const
{
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
void Print(std::string const& name, std::string const& value, const char* format, ...) const
{
printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(), outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
// Display env var settings
void DisplayEnvVars() const
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
if (!outputToCsv) {
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench Client v%s Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
if (hideEnv) return;
Print("ALWAYS_VALIDATE", alwaysValidate,
"Validating after %s", (alwaysValidate ? "each iteration" : "all iterations"));
Print("BLOCK_BYTES", blockBytes,
"Each CU gets a mulitple of %d bytes to copy", blockBytes);
Print("BYTE_OFFSET", byteOffset,
"Using byte offset of %d", byteOffset);
#if NIC_EXEC_ENABLED
Print("CLOSEST_NIC", (closestNicStr == "" ? "auto" : "user-input"),
"Per-GPU closest NIC is set as %s", (closestNicStr == "" ? "auto" : closestNicStr.c_str()));
#endif
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_ORDER", gfxBlockOrder,
"Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved");
Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_TEMPORAL", gfxTemporal,
"%s", (gfxTemporal == 0 ? "Not using non-temporal loads/stores" :
gfxTemporal == 1 ? "Using non-temporal loads" :
gfxTemporal == 2 ? "Using non-temporal stores" :
"Using non-temporal loads and stores"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
"Using GFX wave ordering of %s", (gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
Print("GFX_WORD_SIZE", gfxWordSize,
"Using GFX word size of %d (DWORDx%d)", gfxWordSize, gfxWordSize);
#if NIC_EXEC_ENABLED
Print("IP_ADDRESS_FAMILY", ipAddressFamily,
"IP address family is set to IPv%d", ipAddressFamily);
Print("IB_GID_INDEX", ibGidIndex,
"RoCE GID index is set to %s", (ibGidIndex < 0 ? "auto" : std::to_string(ibGidIndex).c_str()));
Print("IB_PORT_NUMBER", ibPort,
"IB port number is set to %d", ibPort);
#endif
Print("MIN_VAR_SUBEXEC", minNumVarSubExec,
"Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec);
Print("MAX_VAR_SUBEXEC", maxNumVarSubExec,
"Using up to %s subexecutors for variable subExec transfers",
maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available");
#if NIC_EXEC_ENABLED
Print("NIC_RELAX_ORDER", nicRelaxedOrder,
"Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict");
#endif
Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" :
"Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
Print("NUM_SUBITERATIONS", numSubIterations,
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups);
#if NIC_EXEC_ENABLED
Print("ROCE_VERSION", roceVersion,
"RoCE version is set to %d", roceVersion);
#endif
Print("SHOW_ITERATIONS", showIterations,
"%s per-iteration timing", showIterations ? "Showing" : "Hiding");
Print("USE_HIP_EVENTS", useHipEvents,
"Using %s for GFX/DMA Executor timing", useHipEvents ? "HIP events" : "CPU wall time");
Print("USE_HSA_DMA", useHsaDma,
"Using %s for DMA execution", useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync");
Print("USE_INTERACTIVE", useInteractive,
"Running in %s mode", useInteractive ? "interactive" : "non-interactive");
Print("USE_SINGLE_STREAM", useSingleStream,
"Using single stream per GFX %s", useSingleStream ? "device" : "Transfer");
if (getenv("XCC_PREF_TABLE")) {
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", "");
for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i);
printf(" (#XCCs)\n");
for (int i = 0; i < numGpuDevices; i++) {
printf("%36s: GPU %3d ", "", i);
for (int j = 0; j < numGpuDevices; j++)
printf(" %3d", prefXccTable[i][j]);
printf(" %3d\n", TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}));
}
}
Print("VALIDATE_DIRECT", validateDirect,
"Validate GPU destination memory %s", validateDirect ? "directly" : "via CPU staging buffer");
Print("VALIDATE_SOURCE", validateSource,
validateSource ? "Validate source after preparation" : "Do not perform source validation after prep");
printf("\n");
};
// Helper function that parses an environment variable or falls back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
}
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
if (getenv(varname.c_str())) {
char* rangeStr = getenv(varname.c_str());
std::set<int> values;
char* token = strtok(rangeStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
for (int i = start; i <= end; i++) values.insert(i);
} else if (sscanf(token, "%d", &start) == 1) {
values.insert(start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
std::vector<int> result;
for (auto v : values) result.push_back(v);
return result;
}
return defaultValue;
}
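// Worked example (illustrative): a value of "0-2,5" parses to {0, 1, 2, 5};
// duplicates collapse and values return in ascending order (std::set).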
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun) {
curr.second = (cuMask.size() * 32) / numXccs - 1;
runs.push_back(curr); // Don't drop a trailing run that extends to the final bit
}
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
{
if (i) result += ",";
if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
}
return result;
}
TransferBench::ConfigOptions ToConfigOptions()
{
TransferBench::ConfigOptions cfg;
cfg.general.numIterations = numIterations;
cfg.general.numSubIterations = numSubIterations;
cfg.general.numWarmups = numWarmups;
cfg.general.recordPerIteration = showIterations;
cfg.general.useInteractive = useInteractive;
cfg.data.alwaysValidate = alwaysValidate;
cfg.data.blockBytes = blockBytes;
cfg.data.byteOffset = byteOffset;
cfg.data.validateDirect = validateDirect;
cfg.data.validateSource = validateSource;
cfg.data.fillPattern = fillPattern;
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockOrder = gfxBlockOrder;
cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.temporalMode = gfxTemporal;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
cfg.gfx.wordSize = gfxWordSize;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
std::vector<int> closestNics;
if (closestNicStr != "") {
std::stringstream ss(closestNicStr);
std::string item;
while (std::getline(ss, item, ',')) {
try {
int nic = std::stoi(item);
closestNics.push_back(nic);
} catch (const std::exception&) { // std::invalid_argument or std::out_of_range
printf("[ERROR] Invalid NIC index (%s) specified by user in %s\n", item.c_str(), closestNicStr.c_str());
exit(1);
}
}
cfg.nic.closestNics = closestNics;
}
return cfg;
}
};
#endif
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2,
A2A_CUSTOM = 3,
};
char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
// Force to gfx unroll 2 unless explicitly set
ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
int numSrcs, numDsts;
int a2aMode = 0;
if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
a2aMode = A2A_CUSTOM;
} else {
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
exit(1);
}
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
}
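// Worked example (illustrative): A2A_MODE=2:1 selects the custom mode with
// numSrcs = 2 read(s) and numDsts = 1 write(s) per Transfer.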
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs);
ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
if (useDmaExec && (numSrcs != 1 || numDsts != 1)) {
printf("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n");
exit(1);
}
// Collect the number of GPU devices to use
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
// When using multiple destinations, the additional destinations are "local"
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfer.numSubExecs = numSubExecs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
// Create a ring using NICs
std::vector<int> nicTransferIdx(numGpus);
if (numQueuePairs > 0) {
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, (i+1) % numGpus});
transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, i};
transfer.exeSubIndex = (i+1) % numGpus;
transfer.numSubExecs = numQueuePairs;
nicTransferIdx[i] = transfers.size();
transfers.push_back(transfer);
}
}
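// Worked example (illustrative): with numGpus = 4 and NUM_QUEUE_PAIRS > 0, the
// ring above adds the NIC Transfers G0->G1, G1->G2, G2->G3, and G3->G0, each
// executed by the NIC nearest its source GPU.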
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs, numDsts);
printf("===========================================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
if (numQueuePairs > 0)
printf("%cNIC(%02d QP)", separator, numQueuePairs);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
if (numQueuePairs > 0) {
TransferBench::TransferResult const& r = results.tfrResults[nicTransferIdx[src]];
colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
}
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
if (numQueuePairs > 0) {
printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllRdmaPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Network Related]\n");
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, j});
transfer.exeDevice = {EXE_NIC_NEAREST, i};
transfer.exeSubIndex = j;
transfer.numSubExecs = numQueuePairs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-RDMA All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between all pairs of GPUs using %d QPs per Transfer (%lu Transfers)\n",
numBytesPerTransfer, numQueuePairs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
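    // Model the row as gated by its slowest link: treat every transfer in the
    // row as if it completed at the minimum observed per-pair rate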
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
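// AllToAllSweepPreset: sweeps GFX all-to-all bandwidth over combinations of
// CU count (NUM_CUS) and kernel unroll factor (UNROLLS), printing the slowest
// (and optionally fastest) executor bandwidth for each combination.
// Illustrative invocation (env var names come from this file; the
// binary/preset names here are assumptions, not confirmed):
//   A2A_MODE=2:1 NUM_CUS=8,16 UNROLLS=2,4 ./TransferBench a2a_sweep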
void AllToAllSweepPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2,
A2A_CUSTOM = 3,
};
char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
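  // e.g. A2A_MODE=2:1 requests two source reads and one destination write per Transfer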
int numSrcs, numDsts;
int a2aMode = 0;
if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
a2aMode = A2A_CUSTOM;
} else {
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
exit(1);
}
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
}
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
ev.Print("USE_SPRAY" , useSpray , "%s per CU", useSpray ? "All targets" : "One target");
ev.Print("VERBOSE" , verbose , verbose ? "Display test results" : "Display summary only");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
if (useSpray && numDsts > 1) {
printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
exit(1);
}
  // Prepare memory/executor types and build the Transfer list
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = EXE_GPU_GFX;
std::vector<Transfer> transfers;
int targetCount = 0;
if (!useSpray) {
// Each CU will work on just one target
for (int i = 0; i < numGpus; i++) {
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
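        // Keep only directly connected (single-hop) peers; this HIP topology
        // query is unavailable when compiling with NVCC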
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
targetCount++;
transfer.numBytes = numBytesPerTransfer;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
// When using multiple destinations, the additional destinations are "local"
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfers.push_back(transfer);
}
}
} else {
// Each CU will work on all targets
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.exeDevice = {exeType, i};
transfer.exeSubIndex = -1;
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to transfer to this GPU
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
targetCount++;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, useRemoteRead ? j : i});
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All Sweep benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all");
if (transfers.size() == 0) {
printf("[WARN} No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
return;
}
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  // Run the CU-count x unroll sweep, collecting results and printing a summary table
  std::map<std::pair<int, int>, TransferBench::TestResults> results;
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
if (!showMinOnly) printf(" %d(Max) ", u);
}
printf("\n");
for (int c : numCusList) {
printf(" %5d ", c); fflush(stdout);
for (int u : unrollList) {
ev.gfxUnroll = cfg.gfx.unrollFactor = u;
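      // In spray mode each Transfer spans all targets, so allocate c CUs per
      // target (c * targetCount sub-executors in total)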
for (auto& transfer : transfers)
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
double minBandwidth = std::numeric_limits<double>::max();
      double maxBandwidth = std::numeric_limits<double>::lowest();
TransferBench::TestResults result;
if (TransferBench::RunTransfers(cfg, transfers, result)) {
for (auto const& exeResult : result.exeResults) {
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
}
if (useSpray) {
minBandwidth *= targetCount;
maxBandwidth *= targetCount;
}
results[std::make_pair(c,u)] = result;
      } else {
        minBandwidth = 0.0;
        maxBandwidth = 0.0;
      }
printf(" %7.2f ", minBandwidth);
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
fflush(stdout);
}
printf("\n"); fflush(stdout);
}
if (verbose) {
int testNum = 0;
for (int c : numCusList) {
for (int u : unrollList) {
printf("CUs: %d Unroll %d\n", c, u);
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
}
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
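// HealthCheckPreset: quick pass/fail bandwidth screen for 8-GPU MI300X nodes.
// Runs unidirectional CPU-read, unidirectional CPU-write, bidirectional, and
// all-to-all XGMI tests against pass limits (overridable via LIMIT_UDIR /
// LIMIT_BDIR / LIMIT_A2A) and exits nonzero on any failure. Illustrative
// invocation (binary/preset names are assumptions, not confirmed):
//   LIMIT_A2A=40 ./TransferBench healthcheck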
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
TransferBench::TestResults results;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
}
// Pass limits
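  // Defaults appear to be 95% of nominal targets (48/96/45 GB/s), truncated
  // to whole GB/s by the (int) cast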
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
  // Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
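      // Read-only transfer: source is the nearest CPU NUMA node; empty dsts
      // means the data is read but never written anywhere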
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
          bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
          // Only consult per-transfer results after a successful run
          if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
            passed = true;
            break;
          }
        } else {
          PrintErrors(results.errResults);
        }
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
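      // Write-only transfer: empty srcs means nothing is read; data is written
      // to the nearest CPU NUMA node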
t.srcs = {};
t.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
          bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
          // Only consult per-transfer results after a successful run
          if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
            passed = true;
            break;
          }
        } else {
          PrintErrors(results.errResults);
        }
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
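      // Pair one read-only and one write-only transfer on the same GPU to
      // exercise both directions concurrently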
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
transfers.push_back(t);
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
transferIdx++;
}
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
}