Unverified commit 569287a7 authored by gilbertlee-amd, committed by GitHub

TransferBench v1.62.00 (#181)

* Adding non-temporal loads and stores via GFX_TEMPORAL
* Adding additional summary details to a2a preset
* Add SHOW_MIN_ONLY for a2asweep preset
* Adding new P CPU memory type which is indexed by closest GPU
parent fa0e717d
......@@ -3,6 +3,15 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.62.00
### Added
- Adding GFX_TEMPORAL to allow use of non-temporal loads/stores
- (0 = none [default], 1 = load, 2 = store, 3 = both)
- Adding "P" memory type which maps to CPU memory but is indexed by the closest GPU
- For example, P4 refers to CPU memory on NUMA node closest to GPU 4
### Modified
- Adding some additional summary details to a2a preset
## v1.61.00
### Added
- Added a2a_n preset which conducts alltoall GPU-to-GPU transfers over nearest NIC executors
......
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
......@@ -7,7 +7,7 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.59.00 LANGUAGES CXX)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX)
# Default GPU architectures to build
#==================================================================================================
......
......@@ -53,6 +53,7 @@
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - N: Null memory (index ignored)
# - P: Pinned host memory (on NUMA node closest to GPU device indexed from 0 to [# GPUs - 1])
# Examples:
# 1 4 (G0->G0->G1) Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
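# A hypothetical additional example for the new P memory type (not part of the original
# example list; it assumes the same single-Transfer syntax as the line above):
# 1 4 (P2->G2->G2) Uses 4 CUs on GPU2 to copy from pinned CPU memory on the NUMA node closest to GPU 2 into GPU 2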
......
......@@ -88,6 +88,7 @@ public:
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
......@@ -140,6 +141,7 @@ public:
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
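For context, the GetEnvVar calls above read integer-valued environment variables with a fallback default. A minimal sketch of that behavior follows; it is an assumption for illustration, not the project's actual helper, which may differ:

#include <cstdlib>

// Sketch only: parse an integer environment variable, falling back to a default.
static int GetIntEnvSketch(char const* name, int defaultValue)
{
  char const* s = std::getenv(name);       // nullptr when the variable is unset
  return s ? std::atoi(s) : defaultValue;  // e.g. GFX_TEMPORAL would default to 0
}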
......@@ -316,6 +318,7 @@ public:
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
......@@ -407,6 +410,12 @@ public:
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_TEMPORAL", gfxTemporal,
"%s", (gfxTemporal == 0 ? "Not using non-temporal loads/stores" :
gfxTemporal == 1 ? "Using non-temporal loads" :
gfxTemporal == 2 ? "Using non-temporal stores" :
"Using non-temporal loads and stores"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
......@@ -576,6 +585,7 @@ public:
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.temporalMode = gfxTemporal;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
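Callers that drive the library headers directly (rather than through environment variables) can set the same field on the gfx config block shown above. A minimal sketch follows; the top-level config type name is an assumption, only the gfx.temporalMode field is taken from this diff:

// Sketch only: ConfigOptions is an assumed type name; gfx.temporalMode is the new field.
TransferBench::ConfigOptions cfg;  // assumed top-level config struct
cfg.gfx.temporalMode = 3;          // 3 = non-temporal loads and stores (maps from GFX_TEMPORAL=3)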
......
......@@ -169,8 +169,9 @@ void AllToAllPreset(EnvVars& ev,
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("\nSummary: [%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs, numDsts);
printf("===========================================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
......
......@@ -44,6 +44,7 @@ void AllToAllSweepPreset(EnvVars& ev,
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
......@@ -76,6 +77,7 @@ void AllToAllSweepPreset(EnvVars& ev,
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
......@@ -181,7 +183,7 @@ void AllToAllSweepPreset(EnvVars& ev,
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
printf(" %d(Max) ", u);
if (!showMinOnly) printf(" %d(Max) ", u);
}
printf("\n");
for (int c : numCusList) {
......@@ -207,7 +209,9 @@ void AllToAllSweepPreset(EnvVars& ev,
} else {
minBandwidth = 0.0;
}
printf(" %7.2f %7.2f ", minBandwidth, maxBandwidth); fflush(stdout);
printf(" %7.2f ", minBandwidth);
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
fflush(stdout);
}
printf("\n"); fflush(stdout);
}
......
......@@ -66,13 +66,12 @@ namespace TransferBench
using std::set;
using std::vector;
constexpr char VERSION[] = "1.61";
constexpr char VERSION[] = "1.62";
/**
* Enumeration of supported Executor types
*
* @note The Executor is the device used to perform a Transfer
* @note IBVerbs executor is currently not implemented yet
*/
enum ExeType
{
......@@ -113,10 +112,11 @@ namespace TransferBench
MEM_GPU_FINE = 3, ///< Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, ///< Unpinned CPU memory
MEM_NULL = 5, ///< NULL memory - used for empty
MEM_MANAGED = 6 ///< Managed memory
MEM_MANAGED = 6, ///< Managed memory
MEM_CPU_CLOSEST = 7, ///< Coarse-grained pinned CPU memory indexed by closest GPU
};
char const MemTypeStr[8] = "CGBFUNM";
inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); }
char const MemTypeStr[9] = "CGBFUNMP";
inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED || m == MEM_CPU_CLOSEST); }
inline bool IsGpuMemType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
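A hypothetical helper (not part of this diff) illustrating how the extended MemTypeStr table maps the new 'P' character back to MEM_CPU_CLOSEST:

inline MemType MemTypeFromChar(char c)
{
  for (int i = 0; i < 8; ++i)                          // 8 entries: "CGBFUNMP"
    if (MemTypeStr[i] == c) return static_cast<MemType>(i);
  return MEM_NULL;                                     // unknown character; arbitrary fallback
}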
/**
......@@ -179,6 +179,7 @@ namespace TransferBench
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
int useMultiStream = 0; ///< Use multiple streams for GFX
......@@ -740,8 +741,14 @@ namespace {
MemType const& memType = memDevice.memType;
if (IsCpuMemType(memType)) {
// Set numa policy prior to call to hipHostMalloc
numa_set_preferred(memDevice.memIndex);
// Determine which NUMA device to use
int numaIdx = memDevice.memIndex;
if (memType == MEM_CPU_CLOSEST) {
numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
}
// Set NUMA policy prior to call to hipHostMalloc
numa_set_preferred(numaIdx);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE) {
......@@ -750,19 +757,19 @@ namespace {
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
} else if (memType == MEM_CPU) {
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined (__NVCC__)
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, 0));
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
#endif
} else if (memType == MEM_CPU_UNPINNED) {
*memPtr = numa_alloc_onnode(numBytes, memDevice.memIndex);
*memPtr = numa_alloc_onnode(numBytes, numaIdx);
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, memDevice.memIndex));
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx));
// Reset to default numa mem policy
numa_set_preferred(-1);
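A standalone sketch of the new MEM_CPU_CLOSEST allocation path above; GetClosestCpuNumaToGpu is the project helper referenced elsewhere in this diff, and error handling is omitted for brevity:

#include <numa.h>
#include <hip/hip_runtime.h>

int GetClosestCpuNumaToGpu(int gpuIndex);               // provided by TransferBench

void* AllocPinnedClosestToGpu(int gpuIndex, size_t numBytes)
{
  int const numaIdx = GetClosestCpuNumaToGpu(gpuIndex); // GPU index -> nearest NUMA node
  numa_set_preferred(numaIdx);                          // bias the upcoming host allocation
  void* ptr = nullptr;
  (void)hipHostMalloc(&ptr, numBytes,
                      hipHostMallocNumaUser | hipHostMallocNonCoherent);
  numa_set_preferred(-1);                               // restore the default NUMA policy
  return ptr;
}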
......@@ -801,7 +808,7 @@ namespace {
return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
switch (memType) {
case MEM_CPU: case MEM_CPU_FINE:
case MEM_CPU: case MEM_CPU_FINE: case MEM_CPU_CLOSEST:
{
ERR_CHECK(hipHostFree(memPtr));
break;
......@@ -928,7 +935,7 @@ namespace {
if (memDevice.memType == MEM_NULL)
return ERR_NONE;
if (IsCpuMemType(memDevice.memType)) {
if (IsCpuMemType(memDevice.memType) && memDevice.memType != MEM_CPU_CLOSEST) {
int numCpus = GetNumExecutors(EXE_CPU);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus)
return {ERR_FATAL,
......@@ -936,11 +943,16 @@ namespace {
return ERR_NONE;
}
if (IsGpuMemType(memDevice.memType)) {
if (IsGpuMemType(memDevice.memType) || memDevice.memType == MEM_CPU_CLOSEST) {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus)
return {ERR_FATAL,
"GPU index must be between 0 and %d (instead of %d)", numGpus - 1, memDevice.memIndex};
if (memDevice.memType == MEM_CPU_CLOSEST) {
if (GetClosestCpuNumaToGpu(memDevice.memIndex) == -1) {
return {ERR_FATAL, "Unable to determine closest NUMA node for GPU %d", memDevice.memIndex};
}
}
return ERR_NONE;
}
return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
......@@ -974,6 +986,16 @@ namespace {
"[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
gfxMaxBlockSize});
if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3)
errors.push_back({ERR_FATAL,
"[gfx.temporalMode] must be non-negative and less than or equal to 3"});
#if defined(__NVCC__)
if (cfg.gfx.temporalMode > 0)
errors.push_back({ERR_FATAL,
"[gfx.temporalMode] is not supported on NVIDIA hardware"});
#endif
int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll)
errors.push_back({ERR_FATAL,
......@@ -2760,8 +2782,89 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
MEMSET_VAL); }
// Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL>
// Helper functions for temporal/non-temporal loads and stores
#define TEMPORAL_NONE 0
#define TEMPORAL_LOAD 1
#define TEMPORAL_STORE 2
#define TEMPORAL_BOTH 3
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float const* src, float& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst = __builtin_nontemporal_load(src);
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float2 const* src, float2& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float4 const* src, float4& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
dst.z = __builtin_nontemporal_load(&(src->z));
dst.w = __builtin_nontemporal_load(&(src->w));
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float const& src, float* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src, dst);
#endif
} else {
*dst = src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float2 const& src, float2* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
#endif
} else {
*dst = src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float4 const& src, float4* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
__builtin_nontemporal_store(src.z, &(dst->z));
__builtin_nontemporal_store(src.w, &(dst->w));
#endif
} else {
*dst = src;
}
}
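Illustrative device-side usage of the helpers above; with TEMPORAL_BOTH both the load and the store take the non-temporal path on AMD hardware, while on NVCC builds the builtins are compiled out by the guards shown:

__device__ void CopyOneNonTemporal(float4 const* src, float4* dst)
{
  float4 v;
  Load<TEMPORAL_BOTH>(src, v);    // non-temporal load (TEMPORAL_LOAD bit set)
  Store<TEMPORAL_BOTH>(v, dst);   // non-temporal store (TEMPORAL_STORE bit set)
}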
// Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
......@@ -2811,6 +2914,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{
PACKED_FLOAT val[UNROLL];
PACKED_FLOAT tmp[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
......@@ -2820,18 +2924,25 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx + u * unrlStride * warpSize], val[u]);
for (int s = 1; s < numSrcs; s++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloatPacked[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx + u * unrlStride * warpSize], tmp[u]);
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloatPacked[s][idx + u * unrlStride * warpSize];
val[u] += tmp[u];
}
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloatPacked[d][idx + u * unrlStride * warpSize] = val[u];
Store<TEMPORAL_MODE>(val[u], &dstFloatPacked[d][idx + u * unrlStride * warpSize]);
}
}
}
......@@ -2839,19 +2950,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Second loop: Deal with remaining PACKED_FLOAT
{
if (loop1Limit < numPackedFloat) {
PACKED_FLOAT val;
PACKED_FLOAT val, tmp;
if (numSrcs == 0) val = MemsetVal<PACKED_FLOAT>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numPackedFloat; idx += loop2Stride) {
if (numSrcs) {
val = srcFloatPacked[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloatPacked[s][idx];
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++)
dstFloatPacked[d][idx] = val;
Store<TEMPORAL_MODE>(val, &dstFloatPacked[d][idx]);
}
}
}
......@@ -2859,19 +2972,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Third loop: Deal with remaining floats
{
if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
float val;
float val, tmp;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) {
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
Load<TEMPORAL_MODE>(&p.src[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&p.src[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
Store<TEMPORAL_MODE>(val, &p.dst[d][idx]);
}
}
}
......@@ -2890,10 +3005,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
}
#define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
{GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_STORE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_BOTH>}
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
{GpuReduceKernel<float, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float2, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float4, BLOCKSIZE, UNROLL>}
{GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float2), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float4)}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), \
......@@ -2907,7 +3028,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3] =
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] =
{
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
......@@ -2919,6 +3040,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_UNROLL_DECL(512)
};
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL
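For reference, an illustrative lookup into the expanded table, mirroring the indexing used in ExecuteGpuTransfer below (indices only, not a definitive call site):

// Illustrative only: blockSize=256 -> 256/64 - 1 = 3, unrollFactor=4 -> 3,
// GFX_WORD_SIZE=4 (float4) -> wordSizeIdx = 2, GFX_TEMPORAL=3 -> TEMPORAL_BOTH.
auto gpuKernel = GpuKernelTable[3][3][2][TEMPORAL_BOTH];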
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration,
......@@ -2938,7 +3061,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (startEvent != NULL)
......@@ -3014,7 +3137,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents)
......