Unverified Commit 569287a7 authored by gilbertlee-amd, committed by GitHub

TransferBench v1.62.00 (#181)

* Adding non-temporal loads and stores via GFX_TEMPORAL
* Adding additional summary details to a2a preset
* Add SHOW_MIN_ONLY for a2asweep preset
* Adding new P CPU memory type which is indexed by closest GPU
parent fa0e717d
@@ -3,6 +3,15 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.62.00
### Added
- Adding GFX_TEMPORAL to allow for use of non-temporal loads/stores
- (0 = none [default], 1 = load, 2 = store, 3 = both)
- Addding "P" memory type which maps to CPU memory but is indexed by closest GPU
- For example, P4 refers to CPU memory on NUMA node closest to GPU 4
### Modified
- Adding some additional summary details to a2a preset
## v1.61.00
### Added
- Added a2a_n preset which conducts alltoall GPU-to-GPU transfers over nearest NIC executors
...
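To make the GFX_TEMPORAL addition concrete, here is a hypothetical invocation (it assumes the usual TransferBench pattern of naming a preset as the first argument; only the environment variable itself comes from this release):

```bash
# Hypothetical run of the a2a preset with non-temporal stores enabled
# GFX_TEMPORAL: 0 = none (default), 1 = non-temporal loads, 2 = non-temporal stores, 3 = both
GFX_TEMPORAL=2 ./TransferBench a2a
```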
-# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
@@ -7,7 +7,7 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)
-project(TransferBench VERSION 1.59.00 LANGUAGES CXX)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX)
# Default GPU architectures to build
#==================================================================================================
...
@@ -53,6 +53,7 @@
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - N: Null memory (index ignored)
# - P: Pinned host memory (on NUMA node closest to the GPU device indexed from 0 to [# GPUs - 1])
# Examples:
# 1 4 (G0->G0->G1) Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
...
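Building on the example line above, a hedged sketch of a Transfer that uses the new "P" memory type (the numbers are illustrative):

```
# 1 Transfer using 4 CUs on GPU 0, copying from GPU 0 device memory to
# pinned host memory on the NUMA node closest to GPU 1
1 4 (G0->G0->P1)
```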
@@ -88,6 +88,7 @@ public:
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
@@ -140,6 +141,7 @@ public:
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
@@ -316,6 +318,7 @@ public:
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
@@ -407,6 +410,12 @@ public:
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_TEMPORAL", gfxTemporal,
"%s", (gfxTemporal == 0 ? "Not using non-temporal loads/stores" :
gfxTemporal == 1 ? "Using non-temporal loads" :
gfxTemporal == 2 ? "Using non-temporal stores" :
"Using non-temporal loads and stores"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
@@ -576,6 +585,7 @@ public:
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.temporalMode = gfxTemporal;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
...
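When GFX_TEMPORAL is set, the environment-variable report produced by the Print calls above includes a line roughly like the following (column alignment is approximate, not taken from actual output):

```
GFX_TEMPORAL    = 2 : Using non-temporal stores
```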
@@ -169,8 +169,9 @@ void AllToAllPreset(EnvVars& ev,
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
-printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
-printf("==========================================================\n");
printf("\nSummary: [%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs, numDsts);
printf("===========================================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
...
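The reworked summary header now also reports the executor type, the sub-executor count, and the per-Transfer read/write counts. An illustrative rendering with made-up numbers (only the format strings come from the code above):

```
Summary: [268435456 bytes per Transfer] [GFX:8] [1 Read(s) 1 Write(s)]
===========================================================================
SRC\DST  GPU 00  GPU 01  GPU 02  GPU 03
```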
@@ -44,6 +44,7 @@ void AllToAllSweepPreset(EnvVars& ev,
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
@@ -76,6 +77,7 @@ void AllToAllSweepPreset(EnvVars& ev,
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
@@ -181,7 +183,7 @@ void AllToAllSweepPreset(EnvVars& ev,
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
-printf(" %d(Max) ", u);
if (!showMinOnly) printf(" %d(Max) ", u);
}
printf("\n");
for (int c : numCusList) {
@@ -207,7 +209,9 @@ void AllToAllSweepPreset(EnvVars& ev,
} else {
minBandwidth = 0.0;
}
-printf(" %7.2f %7.2f ", minBandwidth, maxBandwidth); fflush(stdout);
printf(" %7.2f ", minBandwidth);
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
fflush(stdout);
}
printf("\n"); fflush(stdout);
}
...
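With SHOW_MIN_ONLY at its default of 1, each unroll factor contributes a single Min column to the sweep table; setting SHOW_MIN_ONLY=0 restores the paired Min/Max columns. A sketch with invented bandwidth values:

```
#CUs\Unroll   2(Min)    4(Min)    6(Min)
8              42.10     44.75     45.02
16             78.33     81.40     80.97
```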
@@ -66,13 +66,12 @@ namespace TransferBench
using std::set;
using std::vector;
-constexpr char VERSION[] = "1.61";
constexpr char VERSION[] = "1.62";
/**
* Enumeration of supported Executor types
*
* @note The Executor is the device used to perform a Transfer
-* @note IBVerbs executor is currently not implemented yet
*/
enum ExeType
{
@@ -113,10 +112,11 @@ namespace TransferBench
MEM_GPU_FINE = 3, ///< Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, ///< Unpinned CPU memory
MEM_NULL = 5, ///< NULL memory - used for empty
-MEM_MANAGED = 6 ///< Managed memory
MEM_MANAGED = 6, ///< Managed memory
MEM_CPU_CLOSEST = 7, ///< Coarse-grained pinned CPU memory indexed by closest GPU
};
-char const MemTypeStr[8] = "CGBFUNM";
char const MemTypeStr[9] = "CGBFUNMP";
-inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); }
inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED || m == MEM_CPU_CLOSEST); }
inline bool IsGpuMemType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
/**
@@ -179,6 +179,7 @@ namespace TransferBench
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
int useMultiStream = 0; ///< Use multiple streams for GFX
@@ -740,8 +741,14 @@ namespace {
MemType const& memType = memDevice.memType;
if (IsCpuMemType(memType)) {
-// Set numa policy prior to call to hipHostMalloc
-numa_set_preferred(memDevice.memIndex);
// Determine which NUMA device to use
int numaIdx = memDevice.memIndex;
if (memType == MEM_CPU_CLOSEST) {
numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
}
// Set NUMA policy prior to call to hipHostMalloc
numa_set_preferred(numaIdx);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE) {
@@ -750,19 +757,19 @@ namespace {
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
-} else if (memType == MEM_CPU) {
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined (__NVCC__)
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, 0));
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
#endif
} else if (memType == MEM_CPU_UNPINNED) {
-*memPtr = numa_alloc_onnode(numBytes, memDevice.memIndex);
*memPtr = numa_alloc_onnode(numBytes, numaIdx);
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
-ERR_CHECK(CheckPages((char*)*memPtr, numBytes, memDevice.memIndex));
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx));
// Reset to default numa mem policy
numa_set_preferred(-1);
@@ -801,7 +808,7 @@ namespace {
return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
switch (memType) {
-case MEM_CPU: case MEM_CPU_FINE:
case MEM_CPU: case MEM_CPU_FINE: case MEM_CPU_CLOSEST:
{
ERR_CHECK(hipHostFree(memPtr));
break;
@@ -928,7 +935,7 @@ namespace {
if (memDevice.memType == MEM_NULL)
return ERR_NONE;
-if (IsCpuMemType(memDevice.memType)) {
if (IsCpuMemType(memDevice.memType) && memDevice.memType != MEM_CPU_CLOSEST) {
int numCpus = GetNumExecutors(EXE_CPU);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus)
return {ERR_FATAL,
@@ -936,11 +943,16 @@ namespace {
return ERR_NONE;
}
-if (IsGpuMemType(memDevice.memType)) {
if (IsGpuMemType(memDevice.memType) || memDevice.memType == MEM_CPU_CLOSEST) {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus)
return {ERR_FATAL,
"GPU index must be between 0 and %d (instead of %d)", numGpus - 1, memDevice.memIndex};
if (memDevice.memType == MEM_CPU_CLOSEST) {
if (GetClosestCpuNumaToGpu(memDevice.memIndex) == -1) {
return {ERR_FATAL, "Unable to determine closest NUMA node for GPU %d", memDevice.memIndex};
}
}
return ERR_NONE;
}
return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
@@ -974,6 +986,16 @@ namespace {
"[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
gfxMaxBlockSize});
if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3)
errors.push_back({ERR_FATAL,
"[gfx.temporalMode] must be non-negative and less than or equal to 3"});
#if defined(__NVCC__)
if (cfg.gfx.temporalMode > 0)
errors.push_back({ERR_FATAL,
"[gfx.temporalMode] is not supported on NVIDIA hardware"});
#endif
int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll)
errors.push_back({ERR_FATAL,
@@ -2760,8 +2782,89 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
MEMSET_VAL); }
-// Kernel for GFX execution
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL>
// Helper function for temporal/non-temporal reads / writes
#define TEMPORAL_NONE 0
#define TEMPORAL_LOAD 1
#define TEMPORAL_STORE 2
#define TEMPORAL_BOTH 3
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float const* src, float& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst = __builtin_nontemporal_load(src);
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float2 const* src, float2& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float4 const* src, float4& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
dst.z = __builtin_nontemporal_load(&(src->z));
dst.w = __builtin_nontemporal_load(&(src->w));
#endif
} else {
dst = *src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float const& src, float* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src, dst);
#endif
} else {
*dst = src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float2 const& src, float2* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
#endif
} else {
*dst = src;
}
}
template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float4 const& src, float4* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
__builtin_nontemporal_store(src.z, &(dst->z));
__builtin_nontemporal_store(src.w, &(dst->w));
#endif
} else {
*dst = src;
}
}
// Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
@@ -2811,6 +2914,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{
PACKED_FLOAT val[UNROLL];
PACKED_FLOAT tmp[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
@@ -2820,18 +2924,25 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
-for (int u = 0; u < UNROLL; u++)
-val[u] = srcFloatPacked[0][idx + u * unrlStride * warpSize];
-for (int s = 1; s < numSrcs; s++)
-for (int u = 0; u < UNROLL; u++)
-val[u] += srcFloatPacked[s][idx + u * unrlStride * warpSize];
#pragma unroll
for (int u = 0; u < UNROLL; u++)
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx + u * unrlStride * warpSize], val[u]);
for (int s = 1; s < numSrcs; s++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx + u * unrlStride * warpSize], tmp[u]);
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] += tmp[u];
}
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
-dstFloatPacked[d][idx + u * unrlStride * warpSize] = val[u];
Store<TEMPORAL_MODE>(val[u], &dstFloatPacked[d][idx + u * unrlStride * warpSize]);
}
}
}
@@ -2839,19 +2950,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Second loop: Deal with remaining PACKED_FLOAT
{
if (loop1Limit < numPackedFloat) {
-PACKED_FLOAT val;
PACKED_FLOAT val, tmp;
if (numSrcs == 0) val = MemsetVal<PACKED_FLOAT>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numPackedFloat; idx += loop2Stride) {
if (numSrcs) {
-val = srcFloatPacked[0][idx];
-for (int s = 1; s < numSrcs; s++)
-val += srcFloatPacked[s][idx];
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++)
-dstFloatPacked[d][idx] = val;
Store<TEMPORAL_MODE>(val, &dstFloatPacked[d][idx]);
}
}
}
@@ -2859,19 +2972,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Third loop; Deal with remaining floats
{
if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
-float val;
float val, tmp;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) {
-val = p.src[0][idx];
-for (int s = 1; s < numSrcs; s++)
-val += p.src[s][idx];
Load<TEMPORAL_MODE>(&p.src[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&p.src[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++)
-p.dst[d][idx] = val;
Store<TEMPORAL_MODE>(val, &p.dst[d][idx]);
}
}
}
@@ -2890,10 +3005,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
}
#define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
{GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_STORE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_BOTH>}
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
-{GpuReduceKernel<float, BLOCKSIZE, UNROLL>, \
-GpuReduceKernel<float2, BLOCKSIZE, UNROLL>, \
-GpuReduceKernel<float4, BLOCKSIZE, UNROLL>}
{GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float2), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float4)}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), \
@@ -2907,7 +3028,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
-GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3] =
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] =
{
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
@@ -2919,6 +3040,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_UNROLL_DECL(512)
};
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration,
@@ -2938,7 +3061,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
-auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (startEvent != NULL)
@@ -3014,7 +3137,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
-auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents)
...
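To tie the GFX_TEMPORAL pieces together: the environment variable flows into cfg.gfx.temporalMode, which becomes the last index into GpuKernelTable and therefore the TEMPORAL_MODE template argument of GpuReduceKernel; inside the kernel, Load<> and Store<> test the load and store bits of that value. A minimal, self-contained sketch of just that bit logic (host-only illustration, not part of the commit):

```cpp
#include <cstdio>

// Same constants as the TEMPORAL_* defines in the kernel code above
constexpr int TEMPORAL_LOAD  = 1;
constexpr int TEMPORAL_STORE = 2;

int main() {
  for (int mode = 0; mode <= 3; ++mode) {  // valid GFX_TEMPORAL values
    bool ntLoad  = mode & TEMPORAL_LOAD;   // same test as Load<TEMPORAL_MODE>
    bool ntStore = mode & TEMPORAL_STORE;  // same test as Store<TEMPORAL_MODE>
    std::printf("GFX_TEMPORAL=%d -> non-temporal loads: %s, non-temporal stores: %s\n",
                mode, ntLoad ? "yes" : "no", ntStore ? "yes" : "no");
  }
  return 0;
}
```

On AMD builds the non-temporal paths lower to __builtin_nontemporal_load/__builtin_nontemporal_store; on NVIDIA builds any non-zero temporalMode is rejected by the validation shown earlier.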