Unverified Commit f33c7fd9 authored by gilbertlee-amd, committed by GitHub

V1.45 candidate (#78)

* v1.45 New GFX kernel
parent 33a5435c
@@ -3,6 +3,16 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.45
+### Additions
+* Adding A2A_MODE to a2a preset (0 = copy, 1 = read-only, 2 = write-only)
+* Adding GFX_UNROLL to modify GFX kernel's unroll factor
+* Adding GFX_WAVE_ORDER to modify order in which wavefronts process data
+### Modifications
+* Rewrote the GFX reduction kernel to support new wave ordering
+
 ## v1.44
 ### Additions
...
...@@ -158,7 +158,7 @@ int main(int argc, char **argv) ...@@ -158,7 +158,7 @@ int main(int argc, char **argv)
} }
ev.DisplayRemoteWriteEnvVars(); ev.DisplayRemoteWriteEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8); int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0); int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1); int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : std::min(ev.numGpuDevices - 1, 3)); int maxGpus = (argc > 6 ? atoi(argv[6]) : std::min(ev.numGpuDevices - 1, 3));
@@ -611,16 +611,26 @@ void ExecuteTransfers(EnvVars const& ev,
       transfer->executorBandwidth = exeBandwidthGbs;
       totalCUs += transfer->numSubExecs;
 
+      char exeSubIndexStr[32] = "";
+      if (ev.useXccFilter)
+      {
+        if (transfer->exeSubIndex == -1)
+          sprintf(exeSubIndexStr, ".*");
+        else
+          sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
+      }
+
       if (!verbose) continue;
       if (!ev.outputToCsv)
       {
-        printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
+        printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
                transfer->transferIndex,
                transfer->transferBandwidth,
                transfer->transferTime,
                transfer->numBytesActual,
                transfer->SrcToStr().c_str(),
                ExeTypeName[transfer->exeType], transfer->exeIndex,
+               exeSubIndexStr,
                transfer->numSubExecs,
                transfer->DstToStr().c_str());
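The extra `%s` conversion carries an optional XCC sub-index suffix after the executor index. A minimal standalone sketch of how the suffix composes (the values here are invented for illustration):

```cpp
#include <cstdio>

int main()
{
  int  exeSubIndex  = 2;     // -1 means "any XCC" and prints ".*"
  bool useXccFilter = true;  // suffix only appears when XCC filtering is on

  char exeSubIndexStr[32] = "";
  if (useXccFilter)
  {
    if (exeSubIndex == -1)
      snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".*");
    else
      snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".%d", exeSubIndex);
  }
  // Mirrors the patched "%s%02d%s:%03d" fragment: prints "G04.2:008"
  printf("%s%02d%s:%03d\n", "G", 4, exeSubIndexStr, 8);
  return 0;
}
```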
@@ -668,10 +678,10 @@ void ExecuteTransfers(EnvVars const& ev,
       }
       else
       {
-        printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n",
+        printf("%d,%d,%lu,%s,%c%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
                testNum, transfer->transferIndex, transfer->numBytesActual,
                transfer->SrcToStr().c_str(),
-               MemTypeStr[transfer->exeType], transfer->exeIndex,
+               MemTypeStr[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
                transfer->DstToStr().c_str(),
                transfer->numSubExecs,
                transfer->transferBandwidth, transfer->transferTime,
@@ -699,14 +709,24 @@ void ExecuteTransfers(EnvVars const& ev,
       transfer->executorBandwidth = transfer->transferBandwidth;
       maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
       if (!verbose) continue;
 
+      char exeSubIndexStr[32] = "";
+      if (ev.useXccFilter)
+      {
+        if (transfer->exeSubIndex == -1)
+          sprintf(exeSubIndexStr, ".*");
+        else
+          sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
+      }
+
       if (!ev.outputToCsv)
       {
-        printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
+        printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
                transfer->transferIndex,
                transfer->transferBandwidth, transfer->transferTime,
                transfer->numBytesActual,
                transfer->SrcToStr().c_str(),
-               ExeTypeName[transfer->exeType], transfer->exeIndex,
+               ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
                transfer->numSubExecs,
                transfer->DstToStr().c_str());
@@ -753,10 +773,10 @@ void ExecuteTransfers(EnvVars const& ev,
       }
       else
       {
-        printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n",
+        printf("%d,%d,%lu,%s,%s%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
                testNum, transfer->transferIndex, transfer->numBytesActual,
                transfer->SrcToStr().c_str(),
-               ExeTypeName[transfer->exeType], transfer->exeIndex,
+               ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
                transfer->DstToStr().c_str(),
                transfer->numSubExecs,
                transfer->transferBandwidth, transfer->transferTime,
@@ -1436,15 +1456,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
 #if defined(__NVCC__)
       HIP_CALL(hipEventRecord(startEvent, stream));
-      GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
+      GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1]
+        <<<numBlocksToRun, ev.gfxBlockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
       HIP_CALL(hipEventRecord(stopEvent, stream));
 #else
-      hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
+      hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1],
                             dim3(numXCCs, numBlocksToRun, 1),
-                            dim3(ev.blockSize, 1, 1),
+                            dim3(ev.gfxBlockSize, 1, 1),
                             ev.sharedMemBytes, stream,
                             startEvent, stopEvent,
-                            0, transfer->subExecParamGpuPtr);
+                            0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
 #endif
       // Synchronize per iteration, unless in single sync mode, in which case
       // synchronize during last warmup / last actual iteration
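The launch now picks a kernel instantiation from a two-dimensional table indexed by wavefront count and unroll factor. A small sketch of the index arithmetic, assuming warpSize is 64 (as on AMD GCN/CDNA GPUs) and the defaults GFX_BLOCK_SIZE=256, GFX_UNROLL=4:

```cpp
#include <cstdio>

int main()
{
  int const assumedWarpSize = 64;   // warpSize on AMD GCN/CDNA GPUs
  int gfxBlockSize = 256;           // GFX_BLOCK_SIZE (multiple of 64, up to 512)
  int gfxUnroll    = 4;             // GFX_UNROLL (1 to 8)

  int row = gfxBlockSize / assumedWarpSize - 1; // 3 -> kernels built for 4 wavefronts
  int col = gfxUnroll - 1;                      // 3 -> UNROLL == 4
  // GpuKernelTable[3][3] corresponds to GpuReduceKernel<256, 4>
  printf("GpuKernelTable[%d][%d] -> GpuReduceKernel<%d, %d>\n",
         row, col, gfxBlockSize, gfxUnroll);
  return 0;
}
```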
@@ -1947,8 +1968,8 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
       Transfer transfer;
       transfer.numBytes = numBytesPerTransfer;
       transfer.numSubExecs = numSubExecs;
-      transfer.numSrcs = 1;
-      transfer.numDsts = 1;
+      transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
+      transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
       transfer.exeType = EXE_GPU_GFX;
       transfer.exeSubIndex = -1;
       transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
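A sketch of the A2A_MODE mapping used above: read-only (1) drops the destination so the kernel only streams reads, while write-only (2) drops the source so it only streams writes (a memset-style fill).

```cpp
#include <cstdio>

int main()
{
  for (int a2aMode = 0; a2aMode <= 2; ++a2aMode)
  {
    int numSrcs = (a2aMode == 2) ? 0 : 1; // write-only: nothing to read
    int numDsts = (a2aMode == 1) ? 0 : 1; // read-only: nothing to write
    printf("A2A_MODE=%d -> numSrcs=%d numDsts=%d\n", a2aMode, numSrcs, numDsts);
  }
  return 0;
}
```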
@@ -2066,21 +2087,34 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
     size_t assigned = 0;
     for (int i = 0; i < this->numSubExecs; ++i)
     {
-      int    const subExecLeft = std::max(0, maxSubExecToUse - i);
-      size_t const leftover    = N - assigned;
-      size_t const roundedN    = (leftover + targetMultiple - 1) / targetMultiple;
       SubExecParam& p = this->subExecParam[i];
-      p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
       p.numSrcs = this->numSrcs;
       p.numDsts = this->numDsts;
-      for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc)
-        p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset;
-      for (int iDst = 0; iDst < this->numDsts; ++iDst)
-        p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset;
+      if (ev.gfxSingleTeam && this->exeType == EXE_GPU_GFX)
+      {
+        p.N        = N;
+        p.teamSize = this->numSubExecs;
+        p.teamIdx  = i;
+        for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset;
+        for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset;
+      }
+      else
+      {
+        int    const subExecLeft = std::max(0, maxSubExecToUse - i);
+        size_t const leftover    = N - assigned;
+        size_t const roundedN    = (leftover + targetMultiple - 1) / targetMultiple;
+        p.N        = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
+        p.teamSize = 1;
+        p.teamIdx  = 0;
+        for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset + assigned;
+        for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset + assigned;
+        assigned += p.N;
+      }
       p.preferredXccId = -1;
       if (ev.useXccFilter && this->exeType == EXE_GPU_GFX)
       {
         std::uniform_int_distribution<int> distribution(0, ev.xccIdsPerDevice[this->exeIndex].size() - 1);
@@ -2109,7 +2143,6 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
       p.startCycle = 0;
       p.stopCycle  = 0;
-      assigned += p.N;
     }
 
     this->transferTime = 0.0;
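A worked host-side example of the two assignment strategies. All values here are invented for illustration; in the real code `maxSubExecToUse` and `targetMultiple` are derived from the transfer and BLOCK_BYTES.

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
  size_t const N              = 1000; // elements in the Transfer
  int    const numSubExecs    = 3;
  size_t const targetMultiple = 64;   // rounding granularity (from BLOCK_BYTES)

  // Default (GFX_SINGLE_TEAM=0): disjoint contiguous chunks, rounded to targetMultiple
  size_t assigned = 0;
  for (int i = 0; i < numSubExecs; ++i)
  {
    int    const subExecLeft = std::max(0, numSubExecs - i);
    size_t const leftover    = N - assigned;
    size_t const roundedN    = (leftover + targetMultiple - 1) / targetMultiple;
    size_t const pN = subExecLeft ? std::min(leftover, (roundedN / subExecLeft) * targetMultiple) : 0;
    printf("subExec %d: offset %zu, N=%zu\n", i, assigned, pN); // 0/320, 320/320, 640/360
    assigned += pN;
  }
  // GFX_SINGLE_TEAM=1: every subexecutor instead receives the full N plus
  // (teamIdx, teamSize) = (i, numSubExecs) and strides through it cooperatively.
  return 0;
}
```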
@@ -2178,7 +2211,7 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
     int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
     HIP_CALL(hipSetDevice(deviceIdx));
     if (ev.usePrepSrcKernel)
-      PrepSrcDataKernel<<<32, ev.blockSize>>>(srcPtr, N, srcIdx);
+      PrepSrcDataKernel<<<32, ev.gfxBlockSize>>>(srcPtr, N, srcIdx);
     else
       HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
     HIP_CALL(hipDeviceSynchronize());
@@ -2424,17 +2457,22 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
   char memType = ev.useFineGrain ? 'F' : 'G';
   printf("Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
 
+  char sep = (ev.outputToCsv ? ',' : ' ');
   for (int i = 0; i < ev.numGpuDevices; i++)
   {
     if (i == srcIdx) continue;
-    printf(" GPU %3d ", i);
+    printf(" GPU %-3d %c", i, sep);
   }
   printf("\n");
-  for (int i = 0; i < ev.numGpuDevices-1; i++)
+  if (!ev.outputToCsv)
   {
-    printf("-------------");
+    for (int i = 0; i < ev.numGpuDevices-1; i++)
+    {
+      printf("-------------");
+    }
+    printf("\n");
   }
-  printf("\n");
 
   for (int p = minGpus; p <= maxGpus; p++)
   {
@@ -2469,11 +2507,12 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
       for (int i = 0; i < ev.numGpuDevices; i++)
       {
         if (bitmask & (1<<i))
-          printf(" %8.3f ", transfers[counter++].transferBandwidth);
+          printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
         else if (i != srcIdx)
-          printf(" ");
+          printf(" %c", sep);
       }
+      printf(" %d %d", p, numSubExecs);
       for (auto i = 0; i < transfers.size(); i++)
       {
         printf(" (N0 G%d %c%d)", srcIdx, MemTypeStr[transfers[i].dstType[0]], transfers[i].dstIndex[0]);
@@ -2481,9 +2520,7 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
         printf("\n");
       }
     }
-    printf("\n");
   }
 }
 
 void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
...
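A tiny sketch of the new separator behavior: the same cell-printing code now emits spaces on the console and commas in CSV mode (the bandwidth numbers are invented):

```cpp
#include <cstdio>

int main()
{
  for (int outputToCsv = 0; outputToCsv <= 1; ++outputToCsv)
  {
    char sep = outputToCsv ? ',' : ' ';
    printf(" %8.3f %c %8.3f %c\n", 123.456, sep, 78.901, sep);
  }
  return 0;
}
```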
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.44" #define TB_VERSION "1.45"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -57,9 +57,9 @@ class EnvVars ...@@ -57,9 +57,9 @@ class EnvVars
{ {
public: public:
// Default configuration values // Default configuration values
int const DEFAULT_NUM_WARMUPS = 3; int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10; int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1; int const DEFAULT_SAMPLING_FACTOR = 1;
// Peer-to-peer Benchmark preset defaults // Peer-to-peer Benchmark preset defaults
int const DEFAULT_P2P_NUM_CPU_SE = 4; int const DEFAULT_P2P_NUM_CPU_SE = 4;
@@ -75,11 +75,14 @@ public:
   // Environment variables
   int alwaysValidate;   // Validate after each iteration instead of once after all iterations
-  int blockSize;        // Size of each threadblock (must be multiple of 64)
-  int blockBytes;       // Each CU, except the last, gets a multiple of this many bytes to copy
+  int blockBytes;       // Each subexecutor, except the last, gets a multiple of this many bytes to copy
   int blockOrder;       // How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
   int byteOffset;       // Byte-offset for memory allocations
   int continueOnError;  // Continue tests even after mismatch detected
+  int gfxBlockSize;     // Size of each threadblock (must be multiple of 64)
+  int gfxSingleTeam;    // Have all subExecutors work as one team across the data array
+  int gfxUnroll;        // GFX-kernel unroll factor
+  int gfxWaveOrder;     // GFX-kernel wavefront ordering
   int hideEnv;          // Skip printing environment variables
   int numCpuDevices;    // Number of CPU devices to use (defaults to # NUMA nodes detected)
   int numGpuDevices;    // Number of GPU devices to use (defaults to # HIP devices detected)
@@ -123,10 +126,10 @@ public:
   // Environment variables only for A2A preset
   int a2aDirect;        // Only execute on links that are directly connected
+  int a2aMode;          // Perform 0 = copy, 1 = read-only, 2 = write-only
 
   // Developer features
   int enableDebug;      // Enable debug output
-  int gpuKernel;        // Which GPU kernel to use
 
   // Used to track current configuration mode
   ConfigModeEnum configMode;
@@ -167,19 +170,22 @@ public:
     // Different hardware pick different GPU kernels
     // This performance difference is generally only noticeable when executing fewer CUs
-    int defaultGpuKernel = 0;
-    if      (archName == "gfx906") defaultGpuKernel = 13;
-    else if (archName == "gfx90a") defaultGpuKernel = 9;
-    else if (archName == "gfx940") defaultGpuKernel = 6;
-    else if (archName == "gfx941") defaultGpuKernel = 6;
-    else if (archName == "gfx942") defaultGpuKernel = 3;
+    int defaultGfxUnroll = 4;
+    if      (archName == "gfx906") defaultGfxUnroll = 13;
+    else if (archName == "gfx90a") defaultGfxUnroll = 9;
+    else if (archName == "gfx940") defaultGfxUnroll = 6;
+    else if (archName == "gfx941") defaultGfxUnroll = 6;
+    else if (archName == "gfx942") defaultGfxUnroll = 4;
 
     alwaysValidate    = GetEnvVar("ALWAYS_VALIDATE"  , 0);
-    blockSize         = GetEnvVar("BLOCK_SIZE"       , 256);
     blockBytes        = GetEnvVar("BLOCK_BYTES"      , 256);
     blockOrder        = GetEnvVar("BLOCK_ORDER"      , 0);
     byteOffset        = GetEnvVar("BYTE_OFFSET"      , 0);
     continueOnError   = GetEnvVar("CONTINUE_ON_ERROR", 0);
+    gfxBlockSize      = GetEnvVar("GFX_BLOCK_SIZE"   , 256);
+    gfxSingleTeam     = GetEnvVar("GFX_SINGLE_TEAM"  , 0);
+    gfxUnroll         = GetEnvVar("GFX_UNROLL"       , defaultGfxUnroll);
+    gfxWaveOrder      = GetEnvVar("GFX_WAVE_ORDER"   , 0);
     hideEnv           = GetEnvVar("HIDE_ENV"         , 0);
     numCpuDevices     = GetEnvVar("NUM_CPU_DEVICES"  , numDetectedCpus);
     numGpuDevices     = GetEnvVar("NUM_GPU_DEVICES"  , numDetectedGpus);
@@ -196,7 +202,6 @@ public:
     useXccFilter      = GetEnvVar("USE_XCC_FILTER"   , 0);
     validateDirect    = GetEnvVar("VALIDATE_DIRECT"  , 0);
     enableDebug       = GetEnvVar("DEBUG"            , 0);
-    gpuKernel         = GetEnvVar("GPU_KERNEL"       , defaultGpuKernel);
 
     // P2P Benchmark related
     useDmaCopy        = GetEnvVar("USE_GPU_DMA"      , 0); // Needed for numGpuSubExec
@@ -221,6 +226,7 @@ public:
     // A2A Benchmark related
     a2aDirect         = GetEnvVar("A2A_DIRECT"       , 1);
+    a2aMode           = GetEnvVar("A2A_MODE"         , 0);
 
     // Determine random seed
     char *sweepSeedStr = getenv("SWEEP_SEED");
@@ -401,14 +407,14 @@ public:
       printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
       exit(1);
     }
-    if (blockSize % 64)
+    if (gfxBlockSize % 64)
     {
-      printf("[ERROR] BLOCK_SIZE (%d) must be a multiple of 64\n", blockSize);
+      printf("[ERROR] GFX_BLOCK_SIZE (%d) must be a multiple of 64\n", gfxBlockSize);
       exit(1);
     }
-    if (blockSize > MAX_BLOCKSIZE)
+    if (gfxBlockSize > MAX_BLOCKSIZE)
     {
-      printf("[ERROR] BLOCK_SIZE (%d) must be less than %d\n", blockSize, MAX_BLOCKSIZE);
+      printf("[ERROR] GFX_BLOCK_SIZE (%d) must not exceed %d\n", gfxBlockSize, MAX_BLOCKSIZE);
       exit(1);
     }
     if (byteOffset % sizeof(float))
@@ -494,9 +500,22 @@ public:
         exit(1);
       }
     }
-    if (gpuKernel < 0 || gpuKernel > NUM_GPU_KERNELS)
+    if (a2aMode < 0 || a2aMode > 2)
+    {
+      printf("[ERROR] A2A_MODE must be between 0 and 2\n");
+      exit(1);
+    }
+    if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL)
+    {
+      printf("[ERROR] GFX kernel unroll factor must be between 1 and %d\n", MAX_UNROLL);
+      exit(1);
+    }
+    if (gfxWaveOrder < 0 || gfxWaveOrder >= 6)
     {
-      printf("[ERROR] GPU kernel must be between 0 and %d\n", NUM_GPU_KERNELS);
+      printf("[ERROR] GFX wave order must be between 0 and 5\n");
       exit(1);
     }
@@ -533,6 +552,12 @@ public:
       exit(1);
     }
 
+    if (getenv("GPU_KERNEL"))
+    {
+      printf("[ERROR] GPU_KERNEL has been deprecated and replaced by GFX_BLOCK_SIZE and GFX_UNROLL\n");
+      exit(1);
+    }
+
     char* enableSdma = getenv("HSA_ENABLE_SDMA");
     if (enableSdma && !strcmp(enableSdma, "0"))
     {
@@ -553,6 +578,9 @@ public:
     printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
     printf(" CU_MASK           - CU mask for streams specified in hex digits (0-9,a-f,A-F)\n");
     printf(" FILL_PATTERN=STR  - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
+    printf(" GFX_UNROLL        - Unroll factor for GFX kernel, must be between 1 and %d\n", MAX_UNROLL);
+    printf(" GFX_SINGLE_TEAM   - Have subexecutors work together on full array instead of working on individual disjoint subarrays\n");
+    printf(" GFX_WAVE_ORDER    - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
     printf(" HIDE_ENV          - Hide environment variable value listing\n");
     printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
     printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
@@ -589,10 +617,9 @@ public:
     else if (!hideEnv)
       printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
     if (hideEnv) return;
 
     PRINT_EV("ALWAYS_VALIDATE", alwaysValidate,
              std::string("Validating after ") + (alwaysValidate ? "each iteration" : "all iterations"));
-    PRINT_EV("BLOCK_SIZE", blockSize,
-             std::string("Threadblock size of " + std::to_string(blockSize)));
     PRINT_EV("BLOCK_BYTES", blockBytes,
              std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
     PRINT_EV("BLOCK_ORDER", blockOrder,
@@ -607,8 +634,20 @@ public:
              (cuMask.size() ? GetCuMaskDesc() : "All"));
     PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
              (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
-    PRINT_EV("GPU_KERNEL", gpuKernel,
-             std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]");
+    PRINT_EV("GFX_BLOCK_SIZE", gfxBlockSize,
+             std::string("Threadblock size of " + std::to_string(gfxBlockSize)));
+    PRINT_EV("GFX_SINGLE_TEAM", gfxSingleTeam,
+             (gfxSingleTeam ? std::string("Combining CUs to work across entire data array") :
+                              std::string("Each CU operates on its own disjoint subarray")));
+    PRINT_EV("GFX_UNROLL", gfxUnroll,
+             std::string("Using GFX unroll factor of ") + std::to_string(gfxUnroll));
+    PRINT_EV("GFX_WAVE_ORDER", gfxWaveOrder,
+             std::string("Using GFX wave ordering of ") + std::string((gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
+                                                                       gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
+                                                                       gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
+                                                                       gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
+                                                                       gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
+                                                                                           "CU,Wavefront,Unroll")));
     PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
              std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
     PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
@@ -722,6 +761,10 @@ public:
     printf("[AllToAll Related]\n");
     PRINT_EV("A2A_DIRECT", a2aDirect,
              std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
+    PRINT_EV("A2A_MODE", a2aMode,
+             std::string(a2aMode == 0 ? "Perform copy" :
+                         a2aMode == 1 ? "Perform read-only" :
+                                        "Perform write-only"));
     PRINT_EV("USE_FINE_GRAIN", useFineGrain,
              std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
     PRINT_EV("USE_REMOTE_READ", useRemoteRead,
...
...@@ -23,12 +23,16 @@ THE SOFTWARE. ...@@ -23,12 +23,16 @@ THE SOFTWARE.
#pragma once #pragma once
#define PackedFloat_t float4 #define PackedFloat_t float4
#define WARP_SIZE 64
#define MAX_BLOCKSIZE 512 #define MAX_BLOCKSIZE 512
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float)) #define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define MEMSET_CHAR 75 #define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f #define MEMSET_VAL 13323083.0f
#define MAX_WAVEGROUPS MAX_BLOCKSIZE / warpSize
#define MAX_UNROLL 8
#define NUM_WAVEORDERS 6
// Each subExecutor is provided with subarrays to work on // Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16 #define MAX_SRCS 16
#define MAX_DSTS 16 #define MAX_DSTS 16
...@@ -42,6 +46,10 @@ struct SubExecParam ...@@ -42,6 +46,10 @@ struct SubExecParam
float* dst[MAX_DSTS]; // Destination array pointers float* dst[MAX_DSTS]; // Destination array pointers
uint32_t preferredXccId; // XCC ID to execute on uint32_t preferredXccId; // XCC ID to execute on
// Prepared
int teamSize; // Index of this sub executor amongst team
int teamIdx; // Size of team this sub executor is part of
// Outputs // Outputs
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor) long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor) long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
@@ -130,321 +138,161 @@ template <typename T> __device__ __forceinline__ T MemsetVal();
 template <> __device__ __forceinline__ float  MemsetVal(){ return MEMSET_VAL; };
 template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
 
-// GPU copy kernel 0: 3 loops: unroll float4, float4s, floats
-template <int LOOP1_UNROLL>
-__global__ void __launch_bounds__(MAX_BLOCKSIZE)
-GpuReduceKernel(SubExecParam* params)
-{
-  int64_t startCycle;
-  if (threadIdx.x == 0) startCycle = wall_clock64();
-
-  SubExecParam& p = params[blockIdx.y];
-
-  // Filter by XCC if desired
-  int xccId;
-  GetXccId(xccId);
-  if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
-
-  // Operate on wavefront granularity
-  int const numSrcs  = p.numSrcs;
-  int const numDsts  = p.numDsts;
-  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
-  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
-
-  // 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration
-  // Determine the number of packed floats processed by the first loop
-  size_t Nrem = p.N;
-  size_t const loop1Npack  = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
-  size_t const loop1Nelem  = loop1Npack * FLOATS_PER_PACK;
-  size_t const loop1Inc    = blockDim.x * LOOP1_UNROLL;
-  size_t loop1Offset       = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
-
-  while (loop1Offset < loop1Npack)
-  {
-    PackedFloat_t vals[LOOP1_UNROLL] = {};
-    if (numSrcs == 0)
-    {
-      #pragma unroll
-      for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal<float4>();
-    }
-    else
-    {
-      for (int i = 0; i < numSrcs; ++i)
-      {
-        PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset;
-        #pragma unroll
-        for (int u = 0; u < LOOP1_UNROLL; ++u)
-          vals[u] += *(packedSrc + u * WARP_SIZE);
-      }
-    }
-    for (int i = 0; i < numDsts; ++i)
-    {
-      PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset;
-      #pragma unroll
-      for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u];
-    }
-    loop1Offset += loop1Inc;
-  }
-  Nrem -= loop1Nelem;
-
-  if (Nrem > 0)
-  {
-    // 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
-    // NOTE: Using int32_t due to smaller size requirements
-    int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
-    int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
-    int32_t const loop2Inc   = blockDim.x;
-    int32_t loop2Offset      = threadIdx.x;
-
-    while (loop2Offset < loop2Npack)
-    {
-      PackedFloat_t val;
-      if (numSrcs == 0)
-      {
-        val = MemsetVal<float4>();
-      }
-      else
-      {
-        val = {};
-        for (int i = 0; i < numSrcs; ++i)
-        {
-          PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset;
-          val += *packedSrc;
-        }
-      }
-      for (int i = 0; i < numDsts; ++i)
-      {
-        PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset;
-        *packedDst = val;
-      }
-      loop2Offset += loop2Inc;
-    }
-    Nrem -= loop2Nelem;
-
-    // Deal with leftovers less than FLOATS_PER_PACK
-    if (threadIdx.x < Nrem)
-    {
-      int offset = loop1Nelem + loop2Nelem + threadIdx.x;
-      float val = 0;
-      if (numSrcs == 0)
-      {
-        val = MEMSET_VAL;
-      }
-      else
-      {
-        for (int i = 0; i < numSrcs; ++i)
-          val += p.src[i][offset];
-      }
-      for (int i = 0; i < numDsts; ++i)
-        p.dst[i][offset] = val;
-    }
-  }
-
-  __syncthreads();
-  if (threadIdx.x == 0)
-  {
-    __threadfence_system();
-    p.stopCycle  = wall_clock64();
-    p.startCycle = startCycle;
-    p.xccId      = xccId;
-    __trace_hwreg();
-  }
-}
-
-template <typename FLOAT_TYPE, int UNROLL_FACTOR>
-__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
-{
-  int    constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
-  size_t constexpr loopPackInc      = blockDim.x * UNROLL_FACTOR;
-  size_t constexpr numPacksPerWave  = WARP_SIZE * UNROLL_FACTOR;
-  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
-  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
-  int const numSrcs  = p.numSrcs;
-  int const numDsts  = p.numDsts;
-  size_t const numPacksDone  = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? N : (N / (FLOATS_PER_PACK * numPacksPerWave)) * numPacksPerWave;
-  size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack;
-  size_t loopPackOffset = waveId * numPacksPerWave + threadId;
-  while (loopPackOffset < numPacksDone)
-  {
-    FLOAT_TYPE vals[UNROLL_FACTOR];
-    if (numSrcs == 0)
-    {
-      #pragma unroll UNROLL_FACTOR
-      for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal<FLOAT_TYPE>();
-    }
-    else
-    {
-      FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset;
-      #pragma unroll UNROLL_FACTOR
-      for (int u = 0; u < UNROLL_FACTOR; ++u)
-        vals[u] = *(src0Ptr + u * WARP_SIZE);
-      for (int i = 1; i < numSrcs; ++i)
-      {
-        FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset;
-        #pragma unroll UNROLL_FACTOR
-        for (int u = 0; u < UNROLL_FACTOR; ++u)
-          vals[u] += *(srcPtr + u * WARP_SIZE);
-      }
-    }
-    for (int i = 0; i < numDsts; ++i)
-    {
-      FLOAT_TYPE* __restrict__ dstPtr = (FLOAT_TYPE*)(p.dst[i + offset]) + loopPackOffset;
-      #pragma unroll UNROLL_FACTOR
-      for (int u = 0; u < UNROLL_FACTOR; ++u)
-        *(dstPtr + u * WARP_SIZE) = vals[u];
-    }
-    loopPackOffset += loopPackInc;
-  }
-  return numFloatsLeft;
-}
-
-template <typename FLOAT_TYPE, int UNROLL_FACTOR>
-__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N)
-{
-  // Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
-  int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float);
-  size_t const numFloatsPerInnerLoop = blockDim.x * numFloatsPerRead;
-  size_t const numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
-  size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
-  size_t const numFloatsDone = N - numFloatsLeft;
-  int const numSrcs = p.numSrcs;
-  int const numDsts = p.numDsts;
-  for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop)
-  {
-    FLOAT_TYPE tmp[UNROLL_FACTOR];
-    if (numSrcs == 0)
-    {
-      #pragma unroll UNROLL_FACTOR
-      for (int u = 0; u < UNROLL_FACTOR; ++u)
-        tmp[u] = MemsetVal<FLOAT_TYPE>();
-    }
-    else
-    {
-      #pragma unroll UNROLL_FACTOR
-      for (int u = 0; u < UNROLL_FACTOR; ++u)
-        tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop]));
-      for (int i = 1; i < numSrcs; ++i)
-      {
-        #pragma unroll UNROLL_FACTOR
-        for (int u = 0; u < UNROLL_FACTOR; ++u)
-          tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop]));
-      }
-    }
-    for (int i = 0; i < numDsts; ++i)
-    {
-      for (int u = 0; u < UNROLL_FACTOR; ++u)
-      {
-        *((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u];
-      }
-    }
-  }
-  return numFloatsLeft;
-}
-
-template <typename FLOAT_TYPE>
-__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
-{
-  switch (unroll)
-  {
-  case  1: return GpuReduceFuncImpl<FLOAT_TYPE,  1>(p, offset, N);
-  case  2: return GpuReduceFuncImpl<FLOAT_TYPE,  2>(p, offset, N);
-  case  3: return GpuReduceFuncImpl<FLOAT_TYPE,  3>(p, offset, N);
-  case  4: return GpuReduceFuncImpl<FLOAT_TYPE,  4>(p, offset, N);
-  case  5: return GpuReduceFuncImpl<FLOAT_TYPE,  5>(p, offset, N);
-  case  6: return GpuReduceFuncImpl<FLOAT_TYPE,  6>(p, offset, N);
-  case  7: return GpuReduceFuncImpl<FLOAT_TYPE,  7>(p, offset, N);
-  case  8: return GpuReduceFuncImpl<FLOAT_TYPE,  8>(p, offset, N);
-  case  9: return GpuReduceFuncImpl<FLOAT_TYPE,  9>(p, offset, N);
-  case 10: return GpuReduceFuncImpl<FLOAT_TYPE, 10>(p, offset, N);
-  case 11: return GpuReduceFuncImpl<FLOAT_TYPE, 11>(p, offset, N);
-  case 12: return GpuReduceFuncImpl<FLOAT_TYPE, 12>(p, offset, N);
-  case 13: return GpuReduceFuncImpl<FLOAT_TYPE, 13>(p, offset, N);
-  case 14: return GpuReduceFuncImpl<FLOAT_TYPE, 14>(p, offset, N);
-  case 15: return GpuReduceFuncImpl<FLOAT_TYPE, 15>(p, offset, N);
-  case 16: return GpuReduceFuncImpl<FLOAT_TYPE, 16>(p, offset, N);
-  default: return GpuReduceFuncImpl<FLOAT_TYPE,  1>(p, offset, N);
-  }
-}
-
-// GPU copy kernel
-__global__ void __launch_bounds__(MAX_BLOCKSIZE)
-GpuReduceKernel2(SubExecParam* params)
-{
-  int64_t startCycle = wall_clock64();
-  SubExecParam& p = params[blockIdx.y];
-  size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
-  if (numFloatsLeft)
-    numFloatsLeft = GpuReduceFunc<float4>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
-  if (numFloatsLeft)
-    GpuReduceFunc<float>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
-  __threadfence_system(); // Wait for all threads to finish
-  __syncthreads();
-  if (threadIdx.x == 0)
-  {
-    p.startCycle = startCycle;
-    p.stopCycle  = wall_clock64();
-  }
-}
-
-#define NUM_GPU_KERNELS 18
-typedef void (*GpuKernelFuncPtr)(SubExecParam*);
-
-GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] =
-{
-  GpuReduceKernel<8>,
-  GpuReduceKernel<1>,
-  GpuReduceKernel<2>,
-  GpuReduceKernel<3>,
-  GpuReduceKernel<4>,
-  GpuReduceKernel<5>,
-  GpuReduceKernel<6>,
-  GpuReduceKernel<7>,
-  GpuReduceKernel<8>,
-  GpuReduceKernel<9>,
-  GpuReduceKernel<10>,
-  GpuReduceKernel<11>,
-  GpuReduceKernel<12>,
-  GpuReduceKernel<13>,
-  GpuReduceKernel<14>,
-  GpuReduceKernel<15>,
-  GpuReduceKernel<16>,
-  GpuReduceKernel2
-};
-
-std::string GpuKernelNames[NUM_GPU_KERNELS] =
-{
-  "Default - 8xUnroll",
-  "Unroll x1",
-  "Unroll x2",
-  "Unroll x3",
-  "Unroll x4",
-  "Unroll x5",
-  "Unroll x6",
-  "Unroll x7",
-  "Unroll x8",
-  "Unroll x9",
-  "Unroll x10",
-  "Unroll x11",
-  "Unroll x12",
-  "Unroll x13",
-  "Unroll x14",
-  "Unroll x15",
-  "Unroll x16",
-  "8xUnrollB",
-};
+template <int BLOCKSIZE, int UNROLL>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params, int waveOrder)
+{
+  int64_t startCycle;
+  if (threadIdx.x == 0) startCycle = wall_clock64();
+
+  SubExecParam& p = params[blockIdx.y];
+
+  // (Experimental) Filter by XCC if desired
+  int32_t xccId;
+  GetXccId(xccId);
+  if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
+
+  // Collect data information
+  int32_t const numSrcs = p.numSrcs;
+  int32_t const numDsts = p.numDsts;
+  float4 const* __restrict__ srcFloat4[MAX_SRCS];
+  float4*       __restrict__ dstFloat4[MAX_DSTS];
+  for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
+  for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
+
+  // Operate on wavefront granularity
+  int32_t const nTeams  = p.teamSize;             // Number of threadblocks working together on this subarray
+  int32_t const teamIdx = p.teamIdx;              // Index of this threadblock within the team
+  int32_t const nWaves  = BLOCKSIZE / warpSize;   // Number of wavefronts within this threadblock
+  int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
+  int32_t const tIdx    = threadIdx.x % warpSize; // Thread index within wavefront
+
+  size_t  const numFloat4    = p.N / 4;
+  int32_t const nFlt4PerWave = warpSize * 4;
+
+  int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
+  switch (waveOrder)
+  {
+  case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves; teamStride2 = nWaves; waveStride2 = 1;      break;
+  case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams; teamStride2 = 1;      waveStride2 = nTeams; break;
+  case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL; teamStride2 = nWaves; waveStride2 = 1;      break;
+  case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams; teamStride2 = nWaves; waveStride2 = 1;      break;
+  case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL; teamStride2 = 1;      waveStride2 = nTeams; break;
+  case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1;      waveStride2 = nTeams; break;
+  }
+
+  // First loop: Each wavefront in the team works on UNROLL float4s per thread
+  size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
+  size_t const loop1Limit  = numFloat4 / loop1Stride * loop1Stride;
+  {
+    float4 val[UNROLL];
+    if (numSrcs == 0)
+    {
+      #pragma unroll
+      for (int u = 0; u < UNROLL; u++)
+        val[u] = MemsetVal<float4>();
+    }
+
+    for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
+    {
+      // Read sources into memory and accumulate in registers
+      if (numSrcs)
+      {
+        for (int u = 0; u < UNROLL; u++)
+          val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
+        for (int s = 1; s < numSrcs; s++)
+          for (int u = 0; u < UNROLL; u++)
+            val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
+      }
+
+      // Write accumulation to all outputs
+      for (int d = 0; d < numDsts; d++)
+      {
+        #pragma unroll
+        for (int u = 0; u < UNROLL; u++)
+          dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
+      }
+    }
+  }
+
+  // Second loop: Deal with remaining float4s
+  if (loop1Limit < numFloat4)
+  {
+    float4 val;
+    if (numSrcs == 0) val = MemsetVal<float4>();
+
+    size_t const loop2Stride = nTeams * nWaves * warpSize;
+    for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
+    {
+      if (numSrcs)
+      {
+        val = srcFloat4[0][idx];
+        for (int s = 1; s < numSrcs; s++)
+          val += srcFloat4[s][idx];
+      }
+
+      for (int d = 0; d < numDsts; d++)
+        dstFloat4[d][idx] = val;
+    }
+  }
+
+  // Third loop: Deal with remaining floats
+  if (numFloat4 * 4 < p.N)
+  {
+    float val;
+    if (numSrcs == 0) val = MemsetVal<float>();
+
+    size_t const loop3Stride = nTeams * nWaves * warpSize;
+    for (size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
+    {
+      if (numSrcs)
+      {
+        val = p.src[0][idx];
+        for (int s = 1; s < numSrcs; s++)
+          val += p.src[s][idx];
+      }
+
+      for (int d = 0; d < numDsts; d++)
+        p.dst[d][idx] = val;
+    }
+  }
+
+  __syncthreads();
+  if (threadIdx.x == 0)
+  {
+    __threadfence_system();
+    p.stopCycle  = wall_clock64();
+    p.startCycle = startCycle;
+    p.xccId      = xccId;
+    __trace_hwreg();
+  }
+}
+
+typedef void (*GpuKernelFuncPtr)(SubExecParam*, int);
+
+#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
+  {GpuReduceKernel<BLOCKSIZE, 1>, \
+   GpuReduceKernel<BLOCKSIZE, 2>, \
+   GpuReduceKernel<BLOCKSIZE, 3>, \
+   GpuReduceKernel<BLOCKSIZE, 4>, \
+   GpuReduceKernel<BLOCKSIZE, 5>, \
+   GpuReduceKernel<BLOCKSIZE, 6>, \
+   GpuReduceKernel<BLOCKSIZE, 7>, \
+   GpuReduceKernel<BLOCKSIZE, 8>}
+
+GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
+{
+  GPU_KERNEL_UNROLL_DECL(64),
+  GPU_KERNEL_UNROLL_DECL(128),
+  GPU_KERNEL_UNROLL_DECL(192),
+  GPU_KERNEL_UNROLL_DECL(256),
+  GPU_KERNEL_UNROLL_DECL(320),
+  GPU_KERNEL_UNROLL_DECL(384),
+  GPU_KERNEL_UNROLL_DECL(448),
+  GPU_KERNEL_UNROLL_DECL(512)
+};
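To see what the three-level ordering means, here is a host-side sketch that replays the first-loop stride arithmetic for waveOrder 0 (Unroll varies fastest, then Wavefront, then CU/team). Sizes are shrunk for readability, and offsets are in units of warpSize-wide float4 packs:

```cpp
#include <cstdio>

int main()
{
  int const nTeams = 2, nWaves = 2, UNROLL = 2;
  // Strides copied from the waveOrder == 0 (U,W,C) case above
  int const unrlStride = 1;
  int const waveStride = UNROLL;
  int const teamStride = UNROLL * nWaves;

  for (int team = 0; team < nTeams; ++team)
    for (int wave = 0; wave < nWaves; ++wave)
      for (int u = 0; u < UNROLL; ++u)
        printf("team %d wave %d unroll %d -> pack %d\n", team, wave, u,
               team * teamStride + wave * waveStride + u * unrlStride);
  // Packs 0..7 are each touched exactly once per stride iteration:
  // unroll index varies fastest, then wavefront, then team (CU).
  return 0;
}
```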