Unverified Commit f33c7fd9 authored by gilbertlee-amd, committed by GitHub

V1.45 candidate (#78)

* v1.45 New GFX kernel
parent 33a5435c
......@@ -3,6 +3,16 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.45
### Additions
* Adding A2A_MODE to a2a preset (0 = copy, 1 = read-only, 2 = write-only)
* Adding GFX_UNROLL to modify GFX kernel's unroll factor
* Adding GFX_WAVE_ORDER to modify order in which wavefronts process data
### Modifications
* Rewrote the GFX reduction kernel to support new wave ordering
## v1.44
### Additions
......
......@@ -158,7 +158,7 @@ int main(int argc, char **argv)
}
ev.DisplayRemoteWriteEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : std::min(ev.numGpuDevices - 1, 3));
......@@ -611,16 +611,26 @@ void ExecuteTransfers(EnvVars const& ev,
transfer->executorBandwidth = exeBandwidthGbs;
totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!verbose) continue;
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth,
transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
......@@ -668,10 +678,10 @@ void ExecuteTransfers(EnvVars const& ev,
}
else
{
printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n",
printf("%d,%d,%lu,%s,%c%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
MemTypeStr[transfer->exeType], transfer->exeIndex,
MemTypeStr[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
......@@ -699,14 +709,24 @@ void ExecuteTransfers(EnvVars const& ev,
transfer->executorBandwidth = transfer->transferBandwidth;
maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
if (!verbose) continue;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth, transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
......@@ -753,10 +773,10 @@ void ExecuteTransfers(EnvVars const& ev,
}
else
{
printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n",
printf("%d,%d,%lu,%s,%s%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
......@@ -1436,15 +1456,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
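// The GFX kernel variant is selected from GpuKernelTable, indexed by wavefronts per
// threadblock and by unroll factor: GpuKernelTable[gfxBlockSize/warpSize - 1][gfxUnroll - 1].
// For example, GFX_BLOCK_SIZE=256 with warpSize=64 and GFX_UNROLL=4 selects entry [3][3];
// GFX_WAVE_ORDER is passed to the kernel as a runtime argument rather than compiled in.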
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1]
<<<numBlocksToRun, ev.gfxBlockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.waveOrder);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1],
dim3(numXCCs, numBlocksToRun, 1),
dim3(ev.blockSize, 1, 1),
dim3(ev.gfxBlockSize, 1, 1),
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr);
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
......@@ -1947,8 +1968,8 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.numSubExecs = numSubExecs;
transfer.numSrcs = 1;
transfer.numDsts = 1;
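// A2A_MODE: 0 = copy (one src, one dst), 1 = read-only (no dst), 2 = write-only (no src)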
transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
transfer.exeType = EXE_GPU_GFX;
transfer.exeSubIndex = -1;
transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
......@@ -2065,22 +2086,35 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
size_t assigned = 0;
for (int i = 0; i < this->numSubExecs; ++i)
{
SubExecParam& p = this->subExecParam[i];
p.numSrcs = this->numSrcs;
p.numDsts = this->numDsts;
if (ev.gfxSingleTeam && this->exeType == EXE_GPU_GFX)
{
p.N = N;
p.teamSize = this->numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset;
}
else
{
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
SubExecParam& p = this->subExecParam[i];
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
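// Example: with leftover = 1000 elements, targetMultiple = 64 and subExecLeft = 4,
// roundedN = 16 and p.N = min(1000, (16/4)*64) = 256 elements for this subexecutor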
p.numSrcs = this->numSrcs;
p.numDsts = this->numDsts;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc)
p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset;
for (int iDst = 0; iDst < this->numDsts; ++iDst)
p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset + assigned;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset + assigned;
p.preferredXccId = -1;
assigned += p.N;
}
p.preferredXccId = -1;
if (ev.useXccFilter && this->exeType == EXE_GPU_GFX)
{
std::uniform_int_distribution<int> distribution(0, ev.xccIdsPerDevice[this->exeIndex].size() - 1);
......@@ -2109,7 +2143,6 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
p.startCycle = 0;
p.stopCycle = 0;
assigned += p.N;
}
this->transferTime = 0.0;
......@@ -2178,7 +2211,7 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
if (ev.usePrepSrcKernel)
PrepSrcDataKernel<<<32, ev.blockSize>>>(srcPtr, N, srcIdx);
PrepSrcDataKernel<<<32, ev.gfxBlockSize>>>(srcPtr, N, srcIdx);
else
HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
......@@ -2424,17 +2457,22 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
char memType = ev.useFineGrain ? 'F' : 'G';
printf("Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %3d ", i);
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
......@@ -2469,11 +2507,12 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f ", transfers[counter++].transferBandwidth);
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" ");
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (N0 G%d %c%d)", srcIdx, MemTypeStr[transfers[i].dstType[0]], transfers[i].dstIndex[0]);
......@@ -2481,9 +2520,7 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
printf("\n");
}
}
printf("\n");
}
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.44"
#define TB_VERSION "1.45"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -75,11 +75,14 @@ public:
// Environment variables
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockSize; // Size of each threadblock (must be multiple of 64)
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int blockOrder; // How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
int byteOffset; // Byte-offset for memory allocations
int continueOnError; // Continue tests even after mismatch detected
int gfxBlockSize; // Size of each GFX-kernel threadblock (must be multiple of 64)
int gfxSingleTeam; // Have all subExecutors work as a single team across the data array
int gfxUnroll; // GFX-kernel unroll factor
int gfxWaveOrder; // GFX-kernel wavefront ordering
int hideEnv; // Skip printing environment variables
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
......@@ -123,10 +126,10 @@ public:
// Environment variables only for A2A preset
int a2aDirect; // Only execute on links that are directly connected
int a2aMode; // Perform 0=copy, 1=read-only, 2=write-only
// Developer features
int enableDebug; // Enable debug output
int gpuKernel; // Which GPU kernel to use
// Used to track current configuration mode
ConfigModeEnum configMode;
......@@ -167,19 +170,22 @@ public:
// Different hardware prefers different default GFX unroll factors
// This performance difference is generally only noticeable when executing with fewer CUs
int defaultGpuKernel = 0;
if (archName == "gfx906") defaultGpuKernel = 13;
else if (archName == "gfx90a") defaultGpuKernel = 9;
else if (archName == "gfx940") defaultGpuKernel = 6;
else if (archName == "gfx941") defaultGpuKernel = 6;
else if (archName == "gfx942") defaultGpuKernel = 3;
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 13;
else if (archName == "gfx90a") defaultGfxUnroll = 9;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockSize = GetEnvVar("BLOCK_SIZE" , 256);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
blockOrder = GetEnvVar("BLOCK_ORDER" , 0);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
......@@ -196,7 +202,6 @@ public:
useXccFilter = GetEnvVar("USE_XCC_FILTER" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug = GetEnvVar("DEBUG" , 0);
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
// P2P Benchmark related
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); // Needed for numGpuSubExec
......@@ -221,6 +226,7 @@ public:
// A2A Benchmark related
a2aDirect = GetEnvVar("A2A_DIRECT" , 1);
a2aMode = GetEnvVar("A2A_MODE" , 0);
// Determine random seed
char *sweepSeedStr = getenv("SWEEP_SEED");
......@@ -401,14 +407,14 @@ public:
printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
exit(1);
}
if (blockSize % 64)
if (gfxBlockSize % 64)
{
printf("[ERROR] BLOCK_SIZE (%d) must be a multiple of 64\n", blockSize);
printf("[ERROR] GFX_BLOCK_SIZE (%d) must be a multiple of 64\n", gfxBlockSize);
exit(1);
}
if (blockSize > MAX_BLOCKSIZE)
if (gfxBlockSize > MAX_BLOCKSIZE)
{
printf("[ERROR] BLOCK_SIZE (%d) must be less than %d\n", blockSize, MAX_BLOCKSIZE);
printf("[ERROR] GFX_BLOCK_SIZE (%d) must not exceed %d\n", gfxBlockSize, MAX_BLOCKSIZE);
exit(1);
}
if (byteOffset % sizeof(float))
......@@ -494,9 +500,22 @@ public:
exit(1);
}
}
if (gpuKernel < 0 || gpuKernel > NUM_GPU_KERNELS)
if (a2aMode < 0 || a2aMode > 2)
{
printf("[ERROR] A2A_MODE must be between 0 and 2\n");
exit(1);
}
if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL)
{
printf("[ERROR] GFX_UNROLL must be between 1 and %d\n", MAX_UNROLL);
exit(1);
}
if (gfxWaveOrder < 0 || gfxWaveOrder >= 6)
{
printf("[ERROR] GPU kernel must be between 0 and %d\n", NUM_GPU_KERNELS);
printf("[ERROR] GFX_WAVE_ORDER must be between 0 and 5\n");
exit(1);
}
......@@ -533,6 +552,12 @@ public:
exit(1);
}
if (getenv("GPU_KERNEL"))
{
printf("[WARN] GPU_KERNEL has been deprecated and replaced by GFX_UNROLL and GFX_WAVE_ORDER\n");
exit(1);
}
char* enableSdma = getenv("HSA_ENABLE_SDMA");
if (enableSdma && !strcmp(enableSdma, "0"))
{
......@@ -553,6 +578,9 @@ public:
printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
printf(" CU_MASK - CU mask for streams specified in hex digits (0-9,a-f,A-F)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (1 to %d)\n", MAX_UNROLL);
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on the full array instead of on individual disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
......@@ -589,10 +617,9 @@ public:
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
if (hideEnv) return;
PRINT_EV("ALWAYS_VALIDATE", alwaysValidate,
std::string("Validating after ") + (alwaysValidate ? "each iteration" : "all iterations"));
PRINT_EV("BLOCK_SIZE", blockSize,
std::string("Threadblock size of " + std::to_string(blockSize)));
PRINT_EV("BLOCK_BYTES", blockBytes,
std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
PRINT_EV("BLOCK_ORDER", blockOrder,
......@@ -607,8 +634,20 @@ public:
(cuMask.size() ? GetCuMaskDesc() : "All"));
PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
(fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
PRINT_EV("GPU_KERNEL", gpuKernel,
std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]");
PRINT_EV("GFX_BLOCK_SIZE", gfxBlockSize,
std::string("Threadblock size of " + std::to_string(gfxBlockSize)));
PRINT_EV("GFX_SINGLE_TEAM", gfxSingleTeam,
(gfxSingleTeam ? std::string("Combining CUs to work across the entire data array") :
std::string("Each CU operates on its own disjoint subarray")));
PRINT_EV("GFX_UNROLL", gfxUnroll,
std::string("Using GFX unroll factor of ") + std::to_string(gfxUnroll));
PRINT_EV("GFX_WAVE_ORDER", gfxWaveOrder,
std::string("Using GFX wave ordering of ") + std::string((gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")));
PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
......@@ -722,6 +761,10 @@ public:
printf("[AllToAll Related]\n");
PRINT_EV("A2A_DIRECT", a2aDirect,
std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
PRINT_EV("A2A_MODE", a2aMode,
std::string(a2aMode == 0 ? "Perform copy" :
a2aMode == 1 ? "Perform read-only" :
"Perform write-only"));
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
......
......@@ -23,12 +23,16 @@ THE SOFTWARE.
#pragma once
#define PackedFloat_t float4
#define WARP_SIZE 64
#define MAX_BLOCKSIZE 512
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
#define MAX_WAVEGROUPS (MAX_BLOCKSIZE / warpSize)
#define MAX_UNROLL 8
#define NUM_WAVEORDERS 6
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
......@@ -42,6 +46,10 @@ struct SubExecParam
float* dst[MAX_DSTS]; // Destination array pointers
uint32_t preferredXccId; // XCC ID to execute on
// Prepared
int teamSize; // Size of the team this subexecutor is part of
int teamIdx; // Index of this subexecutor within the team
// Outputs
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
......@@ -130,321 +138,161 @@ template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
// GPU reduction kernel - 3 loops: unrolled float4s, remaining float4s, remaining floats
template <int LOOP1_UNROLL>
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel(SubExecParam* params)
template <int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = wall_clock64();
SubExecParam& p = params[blockIdx.y];
// Filter by XCC if desired
int xccId;
// (Experimental) Filter by XCC if desired
int32_t xccId;
GetXccId(xccId);
if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
float4 const* __restrict__ srcFloat4[MAX_SRCS];
float4* __restrict__ dstFloat4[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
// Operate on wavefront granularity
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
// 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration
// Determine the number of packed floats processed by the first loop
size_t Nrem = p.N;
size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
size_t const loop1Inc = blockDim.x * LOOP1_UNROLL;
size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
while (loop1Offset < loop1Npack)
{
PackedFloat_t vals[LOOP1_UNROLL] = {};
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team
int32_t const nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
if (numSrcs == 0)
{
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal<float4>();
}
else
{
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u)
vals[u] += *(packedSrc + u * WARP_SIZE);
}
}
size_t const numFloat4 = p.N / 4;
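// p.N counts floats; numFloat4 is the number of packed float4 elements.
// Any tail of (p.N % 4) floats is handled by the third loop below.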
int32_t const nFlt4PerWave = warpSize * 4;
for (int i = 0; i < numDsts; ++i)
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u];
case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams; teamStride2 = 1; waveStride2 = nTeams; break;
case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL; teamStride2 = 1; waveStride2 = nTeams; break;
case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1; waveStride2 = nTeams; break;
}
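// The three strides interleave work across unroll iterations (U), wavefronts (W) and
// threadblocks/CUs in the team (C); waveOrder picks which dimension advances fastest.
// Example for waveOrder 0 (U,W,C) with UNROLL=4 and nWaves=4: unrlStride=1, waveStride=4,
// teamStride=16, where each stride unit spans warpSize float4s.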
loop1Offset += loop1Inc;
}
Nrem -= loop1Nelem;
if (Nrem > 0)
{
// 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
// NOTE: Using int32_t due to smaller size requirements
int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
int32_t const loop2Inc = blockDim.x;
int32_t loop2Offset = threadIdx.x;
while (loop2Offset < loop2Npack)
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
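// Example: numFloat4 = 10000, nTeams = 1, nWaves = 4, UNROLL = 4, warpSize = 64
// gives loop1Stride = 1024 and loop1Limit = 9216; the remaining 784 float4s fall to loop 2.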
{
PackedFloat_t val;
float4 val[UNROLL];
if (numSrcs == 0)
{
val = MemsetVal<float4>();
}
else
{
val = {};
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset;
val += *packedSrc;
}
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
}
for (int i = 0; i < numDsts; ++i)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset;
*packedDst = val;
}
loop2Offset += loop2Inc;
// Read sources into memory and accumulate in registers
if (numSrcs)
{
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
Nrem -= loop2Nelem;
// Deal with leftovers (less than FLOATS_PER_PACK)
if (threadIdx.x < Nrem)
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
{
int offset = loop1Nelem + loop2Nelem + threadIdx.x;
float val = 0;
if (numSrcs == 0)
{
val = MEMSET_VAL;
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
}
else
{
for (int i = 0; i < numSrcs; ++i)
val += p.src[i][offset];
}
for (int i = 0; i < numDsts; ++i)
p.dst[i][offset] = val;
}
}
__syncthreads();
if (threadIdx.x == 0)
// Second loop: Deal with remaining float4s
{
__threadfence_system();
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
p.xccId = xccId;
__trace_hwreg();
}
}
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
{
int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
size_t constexpr loopPackInc = blockDim.x * UNROLL_FACTOR;
size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
size_t const numPacksDone = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? N : (N / (FLOATS_PER_PACK * numPacksPerWave)) * numPacksPerWave;
size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack;
size_t loopPackOffset = waveId * numPacksPerWave + threadId;
while (loopPackOffset < numPacksDone)
if (loop1Limit < numFloat4)
{
FLOAT_TYPE vals[UNROLL_FACTOR];
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal<FLOAT_TYPE>();
}
else
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
{
FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] = *(src0Ptr + u * WARP_SIZE);
for (int i = 1; i < numSrcs; ++i)
if (numSrcs)
{
FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] += *(srcPtr + u * WARP_SIZE);
}
val = srcFloat4[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
}
for (int i = 0; i < numDsts; ++i)
{
FLOAT_TYPE* __restrict__ dstPtr = (FLOAT_TYPE*)(p.dst[i + offset]) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
*(dstPtr + u * WARP_SIZE) = vals[u];
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
}
loopPackOffset += loopPackInc;
}
return numFloatsLeft;
}
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N)
{
// Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float);
size_t const numFloatsPerInnerLoop = blockDim.x * numFloatsPerRead;
size_t const numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
size_t const numFloatsDone = N - numFloatsLeft;
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop)
{
FLOAT_TYPE tmp[UNROLL_FACTOR];
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = MemsetVal<FLOAT_TYPE>();
}
else
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop]));
for (int i = 1; i < numSrcs; ++i)
// Third loop: Deal with remaining floats
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop]));
}
}
if (numFloat4 * 4 < p.N)
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
for (int i = 0; i < numDsts; ++i)
size_t const loop3Stride = nTeams * nWaves * warpSize;
for (size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
{
for (int u = 0; u < UNROLL_FACTOR; ++u)
if (numSrcs)
{
*((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u];
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
}
}
return numFloatsLeft;
}
template <typename FLOAT_TYPE>
__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
{
switch (unroll)
{
case 1: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
case 2: return GpuReduceFuncImpl<FLOAT_TYPE, 2>(p, offset, N);
case 3: return GpuReduceFuncImpl<FLOAT_TYPE, 3>(p, offset, N);
case 4: return GpuReduceFuncImpl<FLOAT_TYPE, 4>(p, offset, N);
case 5: return GpuReduceFuncImpl<FLOAT_TYPE, 5>(p, offset, N);
case 6: return GpuReduceFuncImpl<FLOAT_TYPE, 6>(p, offset, N);
case 7: return GpuReduceFuncImpl<FLOAT_TYPE, 7>(p, offset, N);
case 8: return GpuReduceFuncImpl<FLOAT_TYPE, 8>(p, offset, N);
case 9: return GpuReduceFuncImpl<FLOAT_TYPE, 9>(p, offset, N);
case 10: return GpuReduceFuncImpl<FLOAT_TYPE, 10>(p, offset, N);
case 11: return GpuReduceFuncImpl<FLOAT_TYPE, 11>(p, offset, N);
case 12: return GpuReduceFuncImpl<FLOAT_TYPE, 12>(p, offset, N);
case 13: return GpuReduceFuncImpl<FLOAT_TYPE, 13>(p, offset, N);
case 14: return GpuReduceFuncImpl<FLOAT_TYPE, 14>(p, offset, N);
case 15: return GpuReduceFuncImpl<FLOAT_TYPE, 15>(p, offset, N);
case 16: return GpuReduceFuncImpl<FLOAT_TYPE, 16>(p, offset, N);
default: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
}
}
// GPU copy kernel
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel2(SubExecParam* params)
{
int64_t startCycle = wall_clock64();
SubExecParam& p = params[blockIdx.y];
size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
if (numFloatsLeft)
numFloatsLeft = GpuReduceFunc<float4>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
if (numFloatsLeft)
GpuReduceFunc<float>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
__threadfence_system();
// Wait for all threads to finish
__syncthreads();
if (threadIdx.x == 0)
{
p.startCycle = startCycle;
__threadfence_system();
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
p.xccId = xccId;
__trace_hwreg();
}
}
#define NUM_GPU_KERNELS 18
typedef void (*GpuKernelFuncPtr)(SubExecParam*);
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int);
GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] =
{
GpuReduceKernel<8>,
GpuReduceKernel<1>,
GpuReduceKernel<2>,
GpuReduceKernel<3>,
GpuReduceKernel<4>,
GpuReduceKernel<5>,
GpuReduceKernel<6>,
GpuReduceKernel<7>,
GpuReduceKernel<8>,
GpuReduceKernel<9>,
GpuReduceKernel<10>,
GpuReduceKernel<11>,
GpuReduceKernel<12>,
GpuReduceKernel<13>,
GpuReduceKernel<14>,
GpuReduceKernel<15>,
GpuReduceKernel<16>,
GpuReduceKernel2
};
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
GpuReduceKernel<BLOCKSIZE, 2>, \
GpuReduceKernel<BLOCKSIZE, 3>, \
GpuReduceKernel<BLOCKSIZE, 4>, \
GpuReduceKernel<BLOCKSIZE, 5>, \
GpuReduceKernel<BLOCKSIZE, 6>, \
GpuReduceKernel<BLOCKSIZE, 7>, \
GpuReduceKernel<BLOCKSIZE, 8>}
std::string GpuKernelNames[NUM_GPU_KERNELS] =
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
{
"Default - 8xUnroll",
"Unroll x1",
"Unroll x2",
"Unroll x3",
"Unroll x4",
"Unroll x5",
"Unroll x6",
"Unroll x7",
"Unroll x8",
"Unroll x9",
"Unroll x10",
"Unroll x11",
"Unroll x12",
"Unroll x13",
"Unroll x14",
"Unroll x15",
"Unroll x16",
"8xUnrollB",
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
GPU_KERNEL_UNROLL_DECL(192),
GPU_KERNEL_UNROLL_DECL(256),
GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512)
};
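// GpuKernelTable[w][u] holds the kernel compiled for (w+1) wavefronts per threadblock
// (i.e. a block size of (w+1)*64 threads) and an unroll factor of u+1; the launch code
// indexes it as GpuKernelTable[gfxBlockSize/warpSize - 1][gfxUnroll - 1].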