Unverified commit 569287a7 authored by gilbertlee-amd, committed by GitHub

TransferBench v1.62.00 (#181)

* Adding non-temporal loads and stores via GFX_TEMPORAL
* Adding additional summary details to a2a preset
* Add SHOW_MIN_ONLY for a2asweep preset
* Adding new P CPU memory type which is indexed by closest GPU
parent fa0e717d
......@@ -3,6 +3,15 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.62.00
### Added
- Adding GFX_TEMPORAL to allow use of non-temporal loads/stores
- (0 = none [default], 1 = load, 2 = store, 3 = both)
- Adding "P" memory type which maps to CPU memory but is indexed by the closest GPU
- For example, P4 refers to CPU memory on NUMA node closest to GPU 4
### Modified
- Adding some additional summary details to a2a preset
## v1.61.00
### Added
- Added a2a_n preset which conducts alltoall GPU-to-GPU transfers over nearest NIC executors
......
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
......@@ -7,7 +7,7 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.59.00 LANGUAGES CXX)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX)
# Default GPU architectures to build
#==================================================================================================
......
......@@ -53,6 +53,7 @@
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - N: Null memory (index ignored)
# - P: Pinned host memory (on NUMA node closest to GPU device indexed from 0 to [# GPUs - 1])
# Examples:
# 1 4 (G0->G0->G1) Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
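# A hypothetical additional example for the new P memory type (not part of the original
# example list; it assumes the same single-Transfer syntax as the line above):
# 1 4 (P2->G2->G2) Uses 4 CUs on GPU2 to copy from pinned CPU memory on the NUMA node closest to GPU 2 into GPU 2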
......
......@@ -88,6 +88,7 @@ public:
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
......@@ -140,6 +141,7 @@ public:
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
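For context, the GetEnvVar calls above read integer-valued environment variables with a fallback default. A minimal sketch of that behavior follows; it is an assumption for illustration, not the project's actual helper, which may differ:

#include <cstdlib>

// Sketch only: parse an integer environment variable, falling back to a default.
static int GetIntEnvSketch(char const* name, int defaultValue)
{
  char const* s = std::getenv(name);       // nullptr when the variable is unset
  return s ? std::atoi(s) : defaultValue;  // e.g. GFX_TEMPORAL would default to 0
}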
......@@ -316,6 +318,7 @@ public:
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
......@@ -407,6 +410,12 @@ public:
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_TEMPORAL", gfxTemporal,
"%s", (gfxTemporal == 0 ? "Not using non-temporal loads/stores" :
gfxTemporal == 1 ? "Using non-temporal loads" :
gfxTemporal == 2 ? "Using non-temporal stores" :
"Using non-temporal loads and stores"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
......@@ -576,6 +585,7 @@ public:
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.temporalMode = gfxTemporal;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
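Callers that drive the library headers directly (rather than through environment variables) can set the same field on the gfx config block shown above. A minimal sketch follows; the top-level config type name is an assumption, only the gfx.temporalMode field is taken from this diff:

// Sketch only: ConfigOptions is an assumed type name; gfx.temporalMode is the new field.
TransferBench::ConfigOptions cfg;  // assumed top-level config struct
cfg.gfx.temporalMode = 3;          // 3 = non-temporal loads and stores (maps from GFX_TEMPORAL=3)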
......
......@@ -169,8 +169,9 @@ void AllToAllPreset(EnvVars& ev,
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("\nSummary: [%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs, numDsts);
printf("===========================================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
......
......@@ -44,6 +44,7 @@ void AllToAllSweepPreset(EnvVars& ev,
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
......@@ -76,6 +77,7 @@ void AllToAllSweepPreset(EnvVars& ev,
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
......@@ -181,7 +183,7 @@ void AllToAllSweepPreset(EnvVars& ev,
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
printf(" %d(Max) ", u);
if (!showMinOnly) printf(" %d(Max) ", u);
}
printf("\n");
for (int c : numCusList) {
......@@ -207,7 +209,9 @@ void AllToAllSweepPreset(EnvVars& ev,
} else {
minBandwidth = 0.0;
}
printf(" %7.2f %7.2f ", minBandwidth, maxBandwidth); fflush(stdout);
printf(" %7.2f ", minBandwidth);
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
fflush(stdout);
}
printf("\n"); fflush(stdout);
}
......
......@@ -66,13 +66,12 @@ namespace TransferBench
using std::set;
using std::vector;
constexpr char VERSION[] = "1.61";
constexpr char VERSION[] = "1.62";
/**
* Enumeration of supported Executor types
*
* @note The Executor is the device used to perform a Transfer
* @note IBVerbs executor is currently not implemented yet
*/
enum ExeType
{
......@@ -113,10 +112,11 @@ namespace TransferBench
MEM_GPU_FINE = 3, ///< Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, ///< Unpinned CPU memory
MEM_NULL = 5, ///< NULL memory - used for empty
MEM_MANAGED = 6 ///< Managed memory
MEM_MANAGED = 6, ///< Managed memory
MEM_CPU_CLOSEST = 7, ///< Coarse-grained pinned CPU memory indexed by closest GPU
};
char const MemTypeStr[8] = "CGBFUNM";
inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); }
char const MemTypeStr[9] = "CGBFUNMP";
inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED || m == MEM_CPU_CLOSEST); }
inline bool IsGpuMemType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
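A hypothetical helper (not part of this diff) illustrating how the extended MemTypeStr table maps the new 'P' character back to MEM_CPU_CLOSEST:

inline MemType MemTypeFromChar(char c)
{
  for (int i = 0; i < 8; ++i)                          // 8 entries: "CGBFUNMP"
    if (MemTypeStr[i] == c) return static_cast<MemType>(i);
  return MEM_NULL;                                     // unknown character; arbitrary fallback
}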
/**
......@@ -179,6 +179,7 @@ namespace TransferBench
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
int useMultiStream = 0; ///< Use multiple streams for GFX
......@@ -740,8 +741,14 @@ namespace {
MemType const& memType = memDevice.memType;
if (IsCpuMemType(memType)) {
// Set numa policy prior to call to hipHostMalloc
numa_set_preferred(memDevice.memIndex);
// Determine which NUMA device to use
int numaIdx = memDevice.memIndex;
if (memType == MEM_CPU_CLOSEST) {
numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
}
// Set NUMA policy prior to call to hipHostMalloc
numa_set_preferred(numaIdx);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE) {
......@@ -750,19 +757,19 @@ namespace {
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
} else if (memType == MEM_CPU) {
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined (__NVCC__)
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, 0));
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
#endif
} else if (memType == MEM_CPU_UNPINNED) {
*memPtr = numa_alloc_onnode(numBytes, memDevice.memIndex);
*memPtr = numa_alloc_onnode(numBytes, numaIdx);
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, memDevice.memIndex));
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx));
// Reset to default numa mem policy
numa_set_preferred(-1);
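A standalone sketch of the new MEM_CPU_CLOSEST allocation path above; GetClosestCpuNumaToGpu is the project helper referenced elsewhere in this diff, and error handling is omitted for brevity:

#include <numa.h>
#include <hip/hip_runtime.h>

int GetClosestCpuNumaToGpu(int gpuIndex);               // provided by TransferBench

void* AllocPinnedClosestToGpu(int gpuIndex, size_t numBytes)
{
  int const numaIdx = GetClosestCpuNumaToGpu(gpuIndex); // GPU index -> nearest NUMA node
  numa_set_preferred(numaIdx);                          // bias the upcoming host allocation
  void* ptr = nullptr;
  (void)hipHostMalloc(&ptr, numBytes,
                      hipHostMallocNumaUser | hipHostMallocNonCoherent);
  numa_set_preferred(-1);                               // restore the default NUMA policy
  return ptr;
}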
......@@ -801,7 +808,7 @@ namespace {
return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
switch (memType) {
case MEM_CPU: case MEM_CPU_FINE:
case MEM_CPU: case MEM_CPU_FINE: case MEM_CPU_CLOSEST:
{
ERR_CHECK(hipHostFree(memPtr));
break;
......@@ -928,7 +935,7 @@ namespace {
if (memDevice.memType == MEM_NULL)
return ERR_NONE;
if (IsCpuMemType(memDevice.memType)) {
if (IsCpuMemType(memDevice.memType) && memDevice.memType != MEM_CPU_CLOSEST) {
int numCpus = GetNumExecutors(EXE_CPU);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus)
return {ERR_FATAL,
......@@ -936,11 +943,16 @@ namespace {
return ERR_NONE;
}
if (IsGpuMemType(memDevice.memType)) {
if (IsGpuMemType(memDevice.memType) || memDevice.memType == MEM_CPU_CLOSEST) {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus)
return {ERR_FATAL,
"GPU index must be between 0 and %d (instead of %d)", numGpus - 1, memDevice.memIndex};
if (memDevice.memType == MEM_CPU_CLOSEST) {
if (GetClosestCpuNumaToGpu(memDevice.memIndex) == -1) {
return {ERR_FATAL, "Unable to determine closest NUMA node for GPU %d", memDevice.memIndex};
}
}
return ERR_NONE;
}
return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
......@@ -974,6 +986,16 @@ namespace {
"[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
gfxMaxBlockSize});
if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3)
errors.push_back({ERR_FATAL,
"[gfx.temporalMode] must be non-negative and less than or equal to 3"});
#if defined(__NVCC__)
if (cfg.gfx.temporalMode > 0)
errors.push_back({ERR_FATAL,
"[gfx.temporalMode] is not supported on NVIDIA hardware"});
#endif
int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll)
errors.push_back({ERR_FATAL,
......@@ -2760,8 +2782,89 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
MEMSET_VAL); }
// Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL>
// Helper functions for temporal/non-temporal loads and stores
#define TEMPORAL_NONE 0
#define TEMPORAL_LOAD 1
#define TEMPORAL_STORE 2
#define TEMPORAL_BOTH 3
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float const* src, float& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst = __builtin_nontemporal_load(src);
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float2 const* src, float2& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float4 const* src, float4& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
dst.z = __builtin_nontemporal_load(&(src->z));
dst.w = __builtin_nontemporal_load(&(src->w));
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float const& src, float* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src, dst);
#endif
} else {
*dst = src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float2 const& src, float2* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
#endif
} else {
*dst = src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float4 const& src, float4* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
__builtin_nontemporal_store(src.z, &(dst->z));
__builtin_nontemporal_store(src.w, &(dst->w));
#endif
} else {
*dst = src;
}
}
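Illustrative device-side usage of the helpers above; with TEMPORAL_BOTH both the load and the store take the non-temporal path on AMD hardware, while on NVCC builds the builtins are compiled out by the guards shown:

__device__ void CopyOneNonTemporal(float4 const* src, float4* dst)
{
  float4 v;
  Load<TEMPORAL_BOTH>(src, v);    // non-temporal load (TEMPORAL_LOAD bit set)
  Store<TEMPORAL_BOTH>(v, dst);   // non-temporal store (TEMPORAL_STORE bit set)
}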
// Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
......@@ -2811,6 +2914,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{
PACKED_FLOAT val[UNROLL];
PACKED_FLOAT tmp[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
......@@ -2820,18 +2924,25 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx + u * unrlStride * warpSize], val[u]);
for (int s = 1; s < numSrcs; s++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloatPacked[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx + u * unrlStride * warpSize], tmp[u]);
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloatPacked[s][idx + u * unrlStride * warpSize];
val[u] += tmp[u];
}
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloatPacked[d][idx + u * unrlStride * warpSize] = val[u];
Store<TEMPORAL_MODE>(val[u], &dstFloatPacked[d][idx + u * unrlStride * warpSize]);
}
}
}
......@@ -2839,19 +2950,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Second loop: Deal with remaining PACKED_FLOAT
{
if (loop1Limit < numPackedFloat) {
PACKED_FLOAT val;
PACKED_FLOAT val, tmp;
if (numSrcs == 0) val = MemsetVal<PACKED_FLOAT>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numPackedFloat; idx += loop2Stride) {
if (numSrcs) {
val = srcFloatPacked[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloatPacked[s][idx];
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++)
dstFloatPacked[d][idx] = val;
Store<TEMPORAL_MODE>(val, &dstFloatPacked[d][idx]);
}
}
}
......@@ -2859,19 +2972,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Third loop: Deal with remaining floats
{
if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
float val;
float val, tmp;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) {
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
Load<TEMPORAL_MODE>(&p.src[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&p.src[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
Store<TEMPORAL_MODE>(val, &p.dst[d][idx]);
}
}
}
......@@ -2890,10 +3005,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
}
#define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
{GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_STORE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_BOTH>}
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
{GpuReduceKernel<float, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float2, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float4, BLOCKSIZE, UNROLL>}
{GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float2), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float4)}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), \
......@@ -2907,7 +3028,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3] =
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] =
{
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
......@@ -2919,6 +3040,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_UNROLL_DECL(512)
};
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL
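For reference, an illustrative lookup into the expanded table, mirroring the indexing used in ExecuteGpuTransfer below (indices only, not a definitive call site):

// Illustrative only: blockSize=256 -> 256/64 - 1 = 3, unrollFactor=4 -> 3,
// GFX_WORD_SIZE=4 (float4) -> wordSizeIdx = 2, GFX_TEMPORAL=3 -> TEMPORAL_BOTH.
auto gpuKernel = GpuKernelTable[3][3][2][TEMPORAL_BOTH];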
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration,
......@@ -2938,7 +3061,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (startEvent != NULL)
......@@ -3014,7 +3137,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents)
......