Unverified commit e7cfab75 authored by gilbertlee-amd, committed by GitHub

Adding BLOCK_SIZE and BLOCK_ORDER (#57)

parent 0b29707e
# Changelog for TransferBench
## v1.30
### Added
- BLOCK_SIZE added to control threadblock size (must be a multiple of 64, up to 512)
- BLOCK_ORDER added to control how work is ordered for GFX executors running with USE_SINGLE_STREAM=1
  - 0 - Threadblocks for Transfers are ordered sequentially (default)
  - 1 - Threadblocks for Transfers are interleaved
  - 2 - Threadblocks for Transfers are ordered randomly
## v1.29
### Added
- a2a preset config now responds to USE_REMOTE_READ
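To make the new BLOCK_ORDER values concrete, here is a minimal standalone C++ sketch (an illustration only, not part of the patch) that lays out the threadblocks of two hypothetical Transfers, A with 3 threadblocks and B with 2, in the three orders this commit introduces:

```cpp
// Toy illustration of the three BLOCK_ORDER layouts; Transfers A/B are hypothetical.
#include <algorithm>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

int main()
{
  std::vector<int> numSubExecs = {3, 2};          // A has 3 threadblocks, B has 2
  int const total = 3 + 2;
  std::vector<std::pair<int,int>> order;          // (transfer, threadblock) pairs

  // BLOCK_ORDER=0 (sequential): all of A's blocks, then all of B's -> A0 A1 A2 B0 B1
  for (int t = 0; t < (int)numSubExecs.size(); ++t)
    for (int b = 0; b < numSubExecs[t]; ++b)
      order.push_back({t, b});
  for (auto [t, b] : order) printf("%c%d ", 'A' + t, b);
  printf("\n");

  // BLOCK_ORDER=1 (interleaved): round-robin across Transfers -> A0 B0 A1 B1 A2
  order.clear();
  for (int b = 0; (int)order.size() < total; ++b)
    for (int t = 0; t < (int)numSubExecs.size(); ++t)
      if (b < numSubExecs[t]) order.push_back({t, b});
  for (auto [t, b] : order) printf("%c%d ", 'A' + t, b);
  printf("\n");

  // BLOCK_ORDER=2 (random): shuffle the sequential layout
  std::shuffle(order.begin(), order.end(), std::mt19937(42));
  for (auto [t, b] : order) printf("%c%d ", 'A' + t, b);
  printf("\n");
}
```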
...
@@ -336,8 +336,6 @@ void ExecuteTransfers(EnvVars const& ev,
   int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
   exeInfo.totalBytes = 0;
-  int transferOffset = 0;
   for (int i = 0; i < exeInfo.transfers.size(); ++i)
   {
     // Prepare subarrays each threadblock works on and fill src memory with patterned data
@@ -345,20 +343,75 @@ void ExecuteTransfers(EnvVars const& ev,
     transfer->PrepareSubExecParams(ev);
     isSrcCorrect &= transfer->PrepareSrc(ev);
     exeInfo.totalBytes += transfer->numBytesActual;
+  }
 
-    // Copy block parameters to GPU for GPU executors
-    if (transfer->exeType == EXE_GPU_GFX)
-    {
-      exeInfo.transfers[i]->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
-      HIP_CALL(hipSetDevice(exeIndex));
-      HIP_CALL(hipMemcpy(&exeInfo.subExecParamGpu[transferOffset],
-                         transfer->subExecParam.data(),
-                         transfer->subExecParam.size() * sizeof(SubExecParam),
-                         hipMemcpyHostToDevice));
-      HIP_CALL(hipDeviceSynchronize());
-      transferOffset += transfer->subExecParam.size();
+  // Copy block parameters to GPU for GPU executors
+  if (exeType == EXE_GPU_GFX)
+  {
+    std::vector<SubExecParam> tempSubExecParam;
+
+    if (!ev.useSingleStream || (ev.blockOrder == ORDER_SEQUENTIAL))
+    {
+      // Assign Transfers to sequential threadblocks
+      int transferOffset = 0;
+      for (Transfer* transfer : exeInfo.transfers)
+      {
+        transfer->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
+        transfer->subExecIdx.clear();
+        for (int subExecIdx = 0; subExecIdx < transfer->subExecParam.size(); subExecIdx++)
+        {
+          transfer->subExecIdx.push_back(transferOffset + subExecIdx);
+          tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
+        }
+        transferOffset += transfer->numSubExecs;
+      }
+    }
+    else if (ev.blockOrder == ORDER_INTERLEAVED)
+    {
+      // Interleave threadblocks of different Transfers
+      exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
+      for (int subExecIdx = 0; tempSubExecParam.size() < exeInfo.totalSubExecs; ++subExecIdx)
+      {
+        for (Transfer* transfer : exeInfo.transfers)
+        {
+          if (subExecIdx < transfer->numSubExecs)
+          {
+            transfer->subExecIdx.push_back(tempSubExecParam.size());
+            tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
+          }
+        }
+      }
+    }
+    else if (ev.blockOrder == ORDER_RANDOM)
+    {
+      std::vector<std::pair<int,int>> indices;
+      exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
+
+      // Build up a list of (transfer, subExecParam) indices, then randomly shuffle them
+      for (int i = 0; i < exeInfo.transfers.size(); i++)
+      {
+        Transfer* transfer = exeInfo.transfers[i];
+        for (int subExecIdx = 0; subExecIdx < transfer->numSubExecs; subExecIdx++)
+          indices.push_back(std::make_pair(i, subExecIdx));
+      }
+      std::shuffle(indices.begin(), indices.end(), *ev.generator);
+
+      // Build randomized threadblock list
+      for (auto p : indices)
+      {
+        Transfer* transfer = exeInfo.transfers[p.first];
+        transfer->subExecIdx.push_back(tempSubExecParam.size());
+        tempSubExecParam.push_back(transfer->subExecParam[p.second]);
+      }
+    }
+
+    HIP_CALL(hipSetDevice(exeIndex));
+    HIP_CALL(hipMemcpy(exeInfo.subExecParamGpu,
+                       tempSubExecParam.data(),
+                       tempSubExecParam.size() * sizeof(SubExecParam),
+                       hipMemcpyDefault));
+    HIP_CALL(hipDeviceSynchronize());
   }
 }
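Compared with the old per-Transfer hipMemcpy this hunk removes, the rewritten path first stages every SubExecParam in a host-side vector in the chosen order and then uploads the whole array with a single copy. A minimal sketch of that stage-then-upload pattern, assuming a HIP toolchain (the Param struct and counts here are hypothetical stand-ins):

```cpp
// Stage-then-upload: gather all parameter blocks on the host, copy to the GPU once.
#include <hip/hip_runtime.h>
#include <vector>

struct Param { int data[16]; }; // hypothetical stand-in for SubExecParam

int main()
{
  std::vector<Param> staged;
  for (int i = 0; i < 8; ++i)   // collect params from all "transfers" in launch order
    staged.push_back(Param{});

  Param* gpuParams = nullptr;
  hipMalloc((void**)&gpuParams, staged.size() * sizeof(Param));
  hipMemcpy(gpuParams, staged.data(), staged.size() * sizeof(Param),
            hipMemcpyHostToDevice); // one transfer instead of one per entry
  hipDeviceSynchronize();
  hipFree(gpuParams);
}
```

Staging also keeps each Transfer's subExecIdx list consistent with wherever its threadblocks landed in the shared array, which the timing code below relies on.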
@@ -602,7 +655,7 @@ void ExecuteTransfers(EnvVars const& ev,
       for (auto x : transfer->perIterationCUs[t.second - 1])
         printf(" %2d", x);
     }
     printf("\n");
   }
   printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
 }
@@ -1281,12 +1334,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
   int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
 #if defined(__NVCC__)
   HIP_CALL(hipEventRecord(startEvent, stream));
-  GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, BLOCKSIZE, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
+  GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
   HIP_CALL(hipEventRecord(stopEvent, stream));
 #else
   hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
                         dim3(numBlocksToRun, 1, 1),
-                        dim3(BLOCKSIZE, 1, 1),
+                        dim3(ev.blockSize, 1, 1),
                         ev.sharedMemBytes, stream,
                         startEvent, stopEvent,
                         0, transfer->subExecParamGpuPtr);
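Because BLOCK_SIZE is now chosen at runtime, the launches above pass ev.blockSize instead of the old compile-time BLOCKSIZE constant, while the kernels (shown later in this diff) are compiled with __launch_bounds__(MAX_BLOCKSIZE) as an upper bound and index with blockDim.x. A minimal HIP sketch of that pattern (the kernel and names here are hypothetical, not TransferBench's):

```cpp
#include <hip/hip_runtime.h>

#define MAX_BLOCKSIZE 512 // compile-time upper bound used only as an occupancy hint

// Grid-stride fill kernel: valid for any launch-time block size <= MAX_BLOCKSIZE
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
FillKernel(float* dst, size_t N)
{
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += (size_t)gridDim.x * blockDim.x) // blockDim.x replaces the old BLOCKSIZE macro
    dst[i] = 1.0f;
}

int main()
{
  int const blockSize = 256; // runtime-chosen, e.g. read from the BLOCK_SIZE env var
  size_t const N = 1 << 20;
  float* dst = nullptr;
  hipMalloc((void**)&dst, N * sizeof(float));
  hipLaunchKernelGGL(FillKernel, dim3(64), dim3(blockSize), 0, 0, dst, N);
  hipDeviceSynchronize();
  hipFree(dst);
}
```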
@@ -1306,12 +1359,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
   // Figure out individual timings for Transfers that were all launched together
   for (Transfer* currTransfer : exeInfo.transfers)
   {
-    long long minStartCycle = currTransfer->subExecParamGpuPtr[0].startCycle;
-    long long maxStopCycle  = currTransfer->subExecParamGpuPtr[0].stopCycle;
-    for (int i = 1; i < currTransfer->numSubExecs; i++)
+    long long minStartCycle = std::numeric_limits<long long>::max();
+    long long maxStopCycle  = std::numeric_limits<long long>::min();
+
+    std::set<int> CUs;
+    for (auto subExecIdx : currTransfer->subExecIdx)
     {
-      minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle);
-      maxStopCycle  = std::max(maxStopCycle,  currTransfer->subExecParamGpuPtr[i].stopCycle);
+      minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
+      maxStopCycle  = std::max(maxStopCycle,  exeInfo.subExecParamGpu[subExecIdx].stopCycle);
+      if (ev.showIterations)
+        CUs.insert(GetId(exeInfo.subExecParamGpu[subExecIdx].hwId));
     }
     int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
     double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
@@ -1319,9 +1376,6 @@ void RunTransfer(EnvVars const& ev, int const iteration,
     if (ev.showIterations)
     {
       currTransfer->perIterationTime.push_back(iterationTimeMs);
-      std::set<int> CUs;
-      for (int i = 0; i < currTransfer->numSubExecs; i++)
-        CUs.insert(GetId(currTransfer->subExecParamGpuPtr[i].hwId));
       currTransfer->perIterationCUs.push_back(CUs);
     }
   }
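Since a Transfer's threadblocks may now land anywhere in the shared parameter array, the timing pass above walks the Transfer's own subExecIdx list and takes the earliest start and latest stop cycle across those slots. A small host-side sketch of that reduction, using toy cycle values and a hypothetical wall-clock rate:

```cpp
// Per-transfer elapsed time = latest stop - earliest start across its threadblocks.
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

struct Stamp { long long startCycle, stopCycle; };

int main()
{
  // Toy per-threadblock timestamps for one Transfer
  std::vector<Stamp> stamps = {{100, 900}, {120, 950}, {90, 880}};

  long long minStart = std::numeric_limits<long long>::max();
  long long maxStop  = std::numeric_limits<long long>::min();
  for (auto const& s : stamps)
  {
    minStart = std::min(minStart, s.startCycle);
    maxStop  = std::max(maxStop,  s.stopCycle);
  }

  // Elapsed span covers the slowest threadblock (cycles 90 .. 950)
  double const cyclesPerMs = 100e3; // hypothetical 100 MHz wall clock
  printf("elapsed: %lld cycles = %.5f ms\n",
         maxStop - minStart, (maxStop - minStart) / cyclesPerMs);
}
```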
@@ -1990,7 +2044,7 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
   int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
   HIP_CALL(hipSetDevice(deviceIdx));
   if (ev.usePrepSrcKernel)
-    PrepSrcDataKernel<<<32, BLOCKSIZE>>>(srcPtr, N, srcIdx);
+    PrepSrcDataKernel<<<32, ev.blockSize>>>(srcPtr, N, srcIdx);
   else
     HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
   HIP_CALL(hipDeviceSynchronize());
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"
 
-#define TB_VERSION "1.29"
+#define TB_VERSION "1.30"
 
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -43,6 +43,13 @@ enum ConfigModeEnum
   CFG_A2A = 4
 };
 
+enum BlockOrderEnum
+{
+  ORDER_SEQUENTIAL  = 0,
+  ORDER_INTERLEAVED = 1,
+  ORDER_RANDOM      = 2
+};
 // This class manages environment variables that affect TransferBench
 class EnvVars
 {
@@ -65,7 +72,9 @@ public:
   int const DEFAULT_SWEEP_TIME_LIMIT = 0;
 
   // Environment variables
+  int blockSize;        // Size of each threadblock (must be a multiple of 64)
   int blockBytes;       // Each CU, except the last, gets a multiple of this many bytes to copy
+  int blockOrder;       // How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
   int byteOffset;       // Byte-offset for memory allocations
   int continueOnError;  // Continue tests even after mismatch detected
   int hideEnv;          // Skip printing environment variables
@@ -157,7 +166,9 @@ public:
     else if (archName == "gfx940") defaultGpuKernel = 6;
     else if (archName == "gfx941") defaultGpuKernel = 6;
 
+    blockSize       = GetEnvVar("BLOCK_SIZE"       , 256);
     blockBytes      = GetEnvVar("BLOCK_BYTES"      , 256);
+    blockOrder      = GetEnvVar("BLOCK_ORDER"      , 0);
     byteOffset      = GetEnvVar("BYTE_OFFSET"      , 0);
     continueOnError = GetEnvVar("CONTINUE_ON_ERROR", 0);
     hideEnv         = GetEnvVar("HIDE_ENV"         , 0);
@@ -324,11 +335,26 @@ public:
       printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
       exit(1);
     }
+    if (blockSize % 64)
+    {
+      printf("[ERROR] BLOCK_SIZE (%d) must be a multiple of 64\n", blockSize);
+      exit(1);
+    }
+    if (blockSize > MAX_BLOCKSIZE)
+    {
+      printf("[ERROR] BLOCK_SIZE (%d) must not exceed %d\n", blockSize, MAX_BLOCKSIZE);
+      exit(1);
+    }
     if (byteOffset % sizeof(float))
     {
       printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
       exit(1);
     }
+    if (blockOrder < 0 || blockOrder > 2)
+    {
+      printf("[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)\n");
+      exit(1);
+    }
     if (numWarmups < 0)
     {
       printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
@@ -349,7 +375,6 @@ public:
       printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
       exit(1);
     }
-
     if (numGpuSubExecs <= 0)
     {
       printf("[ERROR] NUM_GPU_SE must be greater than 0\n");
@@ -454,7 +479,9 @@ public:
   {
     printf("Environment variables:\n");
     printf("======================\n");
-    printf(" BLOCK_BYTES=B         - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
+    printf(" BLOCK_SIZE            - # of threads per threadblock (must be a multiple of 64). Defaults to 256\n");
+    printf(" BLOCK_BYTES           - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
+    printf(" BLOCK_ORDER           - Threadblock ordering in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)\n");
     printf(" BYTE_OFFSET           - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
     printf(" CONTINUE_ON_ERROR     - Continue tests even after mismatch detected\n");
     printf(" CU_MASK               - CU mask for streams specified in hex digits (0-9,a-f,A-F)\n");
@@ -495,8 +522,14 @@ public:
     printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
     if (hideEnv) return;
 
+    PRINT_EV("BLOCK_SIZE", blockSize,
+             std::string("Threadblock size of " + std::to_string(blockSize)));
     PRINT_EV("BLOCK_BYTES", blockBytes,
              std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
+    PRINT_EV("BLOCK_ORDER", blockOrder,
+             std::string("Transfer block order: " + std::string(blockOrder == 0 ? "Sequential" :
+                                                                blockOrder == 1 ? "Interleaved" :
+                                                                                  "Random")));
     PRINT_EV("BYTE_OFFSET", byteOffset,
              std::string("Using byte offset of " + std::to_string(byteOffset)));
     PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
@@ -531,12 +564,16 @@ public:
     PRINT_EV("VALIDATE_DIRECT", validateDirect,
              std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
     printf("\n");
+
+    if (blockOrder != ORDER_SEQUENTIAL && !useSingleStream)
+      printf("[WARN] BLOCK_ORDER is ignored if USE_SINGLE_STREAM is not enabled\n");
   };
 // Display env var for P2P Benchmark preset
 void DisplayP2PBenchmarkEnvVars() const
 {
   DisplayEnvVars();
   if (hideEnv) return;
   if (!outputToCsv)
...
@@ -24,7 +24,7 @@ THE SOFTWARE.
 #define PackedFloat_t float4
 #define WARP_SIZE 64
-#define BLOCKSIZE 256
+#define MAX_BLOCKSIZE 512
 #define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
 #define MEMSET_CHAR 75
 #define MEMSET_VAL 13323083.0f
@@ -34,11 +34,14 @@ THE SOFTWARE.
 #define MAX_DSTS 16
 
 struct SubExecParam
 {
+  // Inputs
   size_t    N;              // Number of floats this subExecutor works on
   int       numSrcs;        // Number of source arrays
   int       numDsts;        // Number of destination arrays
   float*    src[MAX_SRCS];  // Source array pointers
   float*    dst[MAX_DSTS];  // Destination array pointers
+
+  // Outputs
   long long startCycle;     // Start timestamp for in-kernel timing (GPU-GFX executor)
   long long stopCycle;      // Stop timestamp for in-kernel timing (GPU-GFX executor)
   uint32_t  hwId;           // Hardware ID
@@ -111,7 +114,7 @@ template <> __device__ __forceinline__ float4 MemsetVal(){ return make
 // GPU copy kernel 0: 3 loops: unrolled float4s, float4s, floats
 template <int LOOP1_UNROLL>
-__global__ void __launch_bounds__(BLOCKSIZE)
+__global__ void __launch_bounds__(MAX_BLOCKSIZE)
 GpuReduceKernel(SubExecParam* params)
 {
   int64_t startCycle = wall_clock64();
@@ -128,7 +131,7 @@ GpuReduceKernel(SubExecParam* params)
   size_t Nrem = p.N;
   size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
   size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
-  size_t const loop1Inc   = BLOCKSIZE * LOOP1_UNROLL;
+  size_t const loop1Inc   = blockDim.x * LOOP1_UNROLL;
   size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
 
   while (loop1Offset < loop1Npack)
@@ -167,7 +170,7 @@ GpuReduceKernel(SubExecParam* params)
   // NOTE: Using int32_t due to smaller size requirements
   int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
   int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
-  int32_t const loop2Inc   = BLOCKSIZE;
+  int32_t const loop2Inc   = blockDim.x;
   int32_t loop2Offset = threadIdx.x;
 
   while (loop2Offset < loop2Npack)
@@ -229,7 +232,7 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
 __device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
 {
   int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
-  size_t constexpr loopPackInc     = BLOCKSIZE * UNROLL_FACTOR;
+  size_t const     loopPackInc     = blockDim.x * UNROLL_FACTOR;
   size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
   int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
   int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
@@ -283,8 +286,8 @@ __device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset,
 {
   // Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
   int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float);
-  size_t constexpr numFloatsPerInnerLoop = BLOCKSIZE * numFloatsPerRead;
-  size_t constexpr numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
+  size_t const numFloatsPerInnerLoop = blockDim.x * numFloatsPerRead;
+  size_t const numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
   size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
   size_t const numFloatsDone = N - numFloatsLeft;
   int const numSrcs = p.numSrcs;
@@ -351,7 +354,7 @@ __device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size
 }
 
 // GPU copy kernel
-__global__ void __launch_bounds__(BLOCKSIZE)
+__global__ void __launch_bounds__(MAX_BLOCKSIZE)
 GpuReduceKernel2(SubExecParam* params)
 {
   int64_t startCycle = wall_clock64();
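A side effect of making the block size a runtime value is visible throughout these kernels: quantities derived from BLOCKSIZE used to be constexpr, but blockDim.x is only known at launch time, so those locals become plain const. A compact sketch of the distinction, using a hypothetical kernel rather than TransferBench's:

```cpp
#include <hip/hip_runtime.h>

#define WARP_SIZE 64

template <int UNROLL>
__global__ void StrideDemo(float* dst, size_t N)
{
  // Compile-time: depends only on a macro and a template parameter
  size_t constexpr packsPerWave = WARP_SIZE * UNROLL;
  // Runtime-only: blockDim.x is set by the launch, so constexpr is illegal here
  size_t const packInc = blockDim.x * UNROLL;

  for (size_t i = blockIdx.x * packInc + threadIdx.x; i < N; i += gridDim.x * packInc)
    dst[i] = (float)packsPerWave;
}

int main()
{
  size_t const N = 1 << 16;
  float* dst = nullptr;
  hipMalloc((void**)&dst, N * sizeof(float));
  hipLaunchKernelGGL(StrideDemo<4>, dim3(32), dim3(256), 0, 0, dst, N);
  hipDeviceSynchronize();
  hipFree(dst);
}
```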
...
@@ -118,6 +118,7 @@ struct Transfer
   std::vector<SubExecParam>  subExecParam;       // Defines subarrays assigned to each threadblock
   SubExecParam*              subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
+  std::vector<int>           subExecIdx;         // Indices into subExecParamGpu
   std::vector<double>        perIterationTime;   // Per-iteration timing
   std::vector<std::set<int>> perIterationCUs;    // Per-iteration CU usage
...