Unverified Commit 7279c539 authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Optimize sorting kernels and tune block sizes

* Compile kernels with max block size of 256 threads:
  The default hipcc behavior since ROCm 4.2 is to compile kernels
  with 1024 threads unless __launch_bounds__ is specified. This
  significantly increases register pressure especially in heavy kernels
  (double precision, for example), requiring register spilling;
* Optimize computeRange by using multiple blocks for reduction;
* Use blocks of 1024 threads for computeBucketPositions - it is executed
  as a single work group so larger block size is faster;
* Sort up-to lenghtNextPow2 instead of blockDim.x (faster for short
  buckets);
* Optimize sortShortList2;
* Optimize sortBuckets with bit instructions;
* Decrease bucket size for non-uniform sorting: too many buckets may
  have sizes too large to sort in shared memory;
* Add more sizes in tests.
parent aca24d5f
...@@ -342,7 +342,7 @@ public: ...@@ -342,7 +342,7 @@ public:
* Get the maximum number of threads in a thread block supported by this device. * Get the maximum number of threads in a thread block supported by this device.
*/ */
int getMaxThreadBlockSize() const { int getMaxThreadBlockSize() const {
return 1024; return 256;
} }
/** /**
* Get whether the device being used is a CPU. In some cases, different algorithms * Get whether the device being used is a CPU. In some cases, different algorithms
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. * * Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -77,9 +77,11 @@ public: ...@@ -77,9 +77,11 @@ public:
* @param trait a SortTrait defining the type of data to sort. It should have been allocated * @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it, * on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the HipSort is deleted. * and deletes it when the HipSort is deleted.
* @param length the length of the arrays this object will be used to sort * @param length the length of the arrays this object will be used to sort.
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint.
*/ */
HipSort(HipContext& context, SortTrait* trait, unsigned int length); HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
~HipSort(); ~HipSort();
/** /**
* Sort an array. * Sort an array.
...@@ -88,14 +90,15 @@ public: ...@@ -88,14 +90,15 @@ public:
private: private:
HipContext& context; HipContext& context;
SortTrait* trait; SortTrait* trait;
HipArray counters;
HipArray dataRange; HipArray dataRange;
HipArray bucketOfElement; HipArray bucketOfElement;
HipArray offsetInBucket; HipArray offsetInBucket;
HipArray bucketOffset; HipArray bucketOffset;
HipArray buckets; HipArray buckets;
hipFunction_t shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; hipFunction_t shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize; unsigned int dataLength, rangeKernelBlocks, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList; bool isShortList, uniform;
}; };
/** /**
......
...@@ -431,6 +431,9 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri ...@@ -431,6 +431,9 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
static_assert(8*sizeof(void*) == HipContext::TileSize); static_assert(8*sizeof(void*) == HipContext::TileSize);
string bits = intToString(8*sizeof(void*)); string bits = intToString(8*sizeof(void*));
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags)); string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
if (getMaxThreadBlockSize() < 1024) {
options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize());
}
stringstream src; stringstream src;
if (!options.empty()) if (!options.empty())
src << "// Compilation Options: " << options << endl << endl; src << "// Compilation Options: " << options << endl << endl;
...@@ -659,6 +662,18 @@ void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threa ...@@ -659,6 +662,18 @@ void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threa
} }
} }
void HipContext::executeKernelFlat(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int gridSize = (threads+blockSize-1)/blockSize;
hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != hipSuccess) {
stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
int HipContext::computeThreadBlockSize(double memory) const { int HipContext::computeThreadBlockSize(double memory) const {
int maxShared = this->sharedMemPerBlock; int maxShared = this->sharedMemPerBlock;
int max = (int) (maxShared/memory); int max = (int) (maxShared/memory);
...@@ -738,7 +753,7 @@ void HipContext::clearAutoclearBuffers() { ...@@ -738,7 +753,7 @@ void HipContext::clearAutoclearBuffers() {
double HipContext::reduceEnergy() { double HipContext::reduceEnergy() {
int bufferSize = energyBuffer.getSize(); int bufferSize = energyBuffer.getSize();
int workGroupSize = 512; int workGroupSize = getMaxThreadBlockSize();
void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize}; void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize()); executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
energySum.download(pinnedBuffer); energySum.download(pinnedBuffer);
......
...@@ -280,7 +280,7 @@ void HipNonbondedUtilities::initialize(const System& system) { ...@@ -280,7 +280,7 @@ void HipNonbondedUtilities::initialize(const System& system) {
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks); blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
vector<unsigned int> count(2, 0); vector<unsigned int> count(2, 0);
interactionCount.upload(count); interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]); rebuildNeighborList.upload(&count[0]);
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. * * Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. * * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis * * Authors: Peter Eastman, Nicholas Curtis *
* Contributors: * * Contributors: *
* * * *
...@@ -33,7 +33,8 @@ ...@@ -33,7 +33,8 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) { HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels. // Create kernels.
map<string, string> replacements; map<string, string> replacements;
...@@ -54,32 +55,34 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : c ...@@ -54,32 +55,34 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : c
// Work out the work group sizes for various kernels. // Work out the work group sizes for various kernels.
int maxBlockSize;
hipDeviceGetAttribute(&maxBlockSize, hipDeviceAttributeMaxBlockDimX, context.getDevice());
int maxSharedMem; int maxSharedMem;
hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice()); hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2; int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks())); int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
isShortList = (length <= maxShortList); isShortList = (length <= maxShortList);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2) sortKernelSize = 256;
; rangeKernelSize = 256;
positionsKernelSize = rangeKernelSize;
sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
if (rangeKernelSize > length) if (rangeKernelSize > length)
rangeKernelSize = length; rangeKernelSize = length;
rangeKernelBlocks = (length + rangeKernelSize - 1) / rangeKernelSize;
if (sortKernelSize > maxLocalBuffer) if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer; sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = sortKernelSize/2; unsigned int targetBucketSize = uniform ? sortKernelSize/2 : sortKernelSize/8;
unsigned int numBuckets = length/targetBucketSize; unsigned int numBuckets = length/targetBucketSize;
if (numBuckets < 1) if (numBuckets < 1)
numBuckets = 1; numBuckets = 1;
// computeBucketPositions is executed as a single work group so larger block size is faster.
positionsKernelSize = 1024;
if (positionsKernelSize > numBuckets) if (positionsKernelSize > numBuckets)
positionsKernelSize = numBuckets; positionsKernelSize = numBuckets;
// Create workspace arrays. // Create workspace arrays.
if (!isShortList) { if (!isShortList) {
dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange"); counters.initialize<unsigned int>(context, 1, "counters");
unsigned int zero = 0;
counters.upload(&zero);
dataRange.initialize(context, 2*rangeKernelBlocks, trait->getKeySize(), "sortDataRange");
bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset"); bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<uint1>(context, length, "bucketOfElement"); bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
offsetInBucket.initialize<uint1>(context, length, "offsetInBucket"); offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
...@@ -101,7 +104,7 @@ void HipSort::sort(HipArray& data) { ...@@ -101,7 +104,7 @@ void HipSort::sort(HipArray& data) {
if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) { if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength}; void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength); context.executeKernel(shortList2Kernel, sortArgs, dataLength, HipContext::ThreadBlockSize, HipContext::ThreadBlockSize*trait->getKeySize());
buckets.copyTo(data); buckets.copyTo(data);
} }
else { else {
...@@ -113,8 +116,8 @@ void HipSort::sort(HipArray& data) { ...@@ -113,8 +116,8 @@ void HipSort::sort(HipArray& data) {
// Compute the range of data values. // Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize(); unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()}; void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize()); context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelBlocks*rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets. // Assign array elements to buckets.
...@@ -124,7 +127,7 @@ void HipSort::sort(HipArray& data) { ...@@ -124,7 +127,7 @@ void HipSort::sort(HipArray& data) {
// Compute the position of each bucket. // Compute the position of each bucket.
void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer()}; void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int)); context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
// Copy the data into the buckets. // Copy the data into the buckets.
...@@ -135,7 +138,7 @@ void HipSort::sort(HipArray& data) { ...@@ -135,7 +138,7 @@ void HipSort::sort(HipArray& data) {
// Sort each bucket. // Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()}; void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize()); context.executeKernelFlat(sortBucketsKernel, sortArgs, numBuckets*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
} }
} }
...@@ -11,23 +11,23 @@ extern "C" { ...@@ -11,23 +11,23 @@ extern "C" {
__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) { __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
// Load the data into local memory. // Load the data into local memory.
HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer) extern __shared__ DATA_TYPE dataBuffer[];
for (int index = threadIdx.x; index < length; index += blockDim.x) for (int index = threadIdx.x; index < length; index += blockDim.x)
dataBuffer[index] = data[index]; dataBuffer[index] = data[index];
__syncthreads(); __syncthreads();
// Perform a bitonic sort in local memory. // Perform a bitonic sort in local memory.
for (unsigned int k = 2; k < 2*length; k *= 2) { unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) { for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) { for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j; int ixj = i^j;
if (ixj > i && ixj < length) { if (ixj > i && ixj < length) {
DATA_TYPE value1 = dataBuffer[i]; DATA_TYPE value1 = dataBuffer[i];
DATA_TYPE value2 = dataBuffer[ixj]; DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = ((i&k) == 0); bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2)); KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1)); KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) { if (lowKey > highKey) {
...@@ -52,7 +52,7 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) ...@@ -52,7 +52,7 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
* work, but also parallelizes much better. * work, but also parallelizes much better.
*/ */
__global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) { __global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
__shared__ DATA_TYPE dataBuffer[64]; extern __shared__ KEY_TYPE keyBuffer[];
int globalId = blockDim.x*blockIdx.x+threadIdx.x; int globalId = blockDim.x*blockIdx.x+threadIdx.x;
DATA_TYPE value = dataIn[globalId < length ? globalId : 0]; DATA_TYPE value = dataIn[globalId < length ? globalId : 0];
KEY_TYPE key = getValue(value); KEY_TYPE key = getValue(value);
...@@ -61,59 +61,80 @@ __global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* ...@@ -61,59 +61,80 @@ __global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE*
int numInBlock = min(static_cast<int>(blockDim.x), static_cast<int>(length-blockStart)); int numInBlock = min(static_cast<int>(blockDim.x), static_cast<int>(length-blockStart));
__syncthreads(); __syncthreads();
if (threadIdx.x < numInBlock) if (threadIdx.x < numInBlock)
dataBuffer[threadIdx.x] = dataIn[blockStart+threadIdx.x]; keyBuffer[threadIdx.x] = getValue(dataIn[blockStart+threadIdx.x]);
__syncthreads(); __syncthreads();
for (int i = 0; i < numInBlock; i++) { for (int i = 0; i < numInBlock; i++) {
KEY_TYPE otherKey = getValue(dataBuffer[i]); KEY_TYPE otherKey = keyBuffer[i];
if (otherKey < key || (otherKey == key && blockStart+i < globalId)) count += (otherKey < key) | ((otherKey == key) & (blockStart+i < globalId));
count++;
} }
} }
if (globalId < length) if (globalId < length)
dataOut[count] = value; dataOut[count] = value;
} }
inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE* minBuffer, KEY_TYPE* maxBuffer,
KEY_TYPE* minResult, KEY_TYPE* maxResult) {
minBuffer[threadIdx.x] = minimum;
maxBuffer[threadIdx.x] = maximum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if ((threadIdx.x+step < blockDim.x) & ((threadIdx.x&(2*step-1)) == 0)) {
minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
}
__syncthreads();
}
if (threadIdx.x == 0) {
*minResult = minBuffer[0];
*maxResult = maxBuffer[0];
}
}
/** /**
* Calculate the minimum and maximum value in the array to be sorted. This kernel * Calculate the minimum and maximum value in the array to be sorted.
* is executed as a single work group.
*/ */
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range, __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) { unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
HIP_DYNAMIC_SHARED( KEY_TYPE, minBuffer) extern __shared__ KEY_TYPE minBuffer[];
KEY_TYPE* maxBuffer = minBuffer+blockDim.x; KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
KEY_TYPE minimum = MAX_KEY; KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY; KEY_TYPE maximum = MIN_KEY;
__shared__ bool isLastFinishedBlock;
if (threadIdx.x == 0) {
isLastFinishedBlock = false;
}
// Each thread calculates the range of a subset of values. // Each thread calculates the range of a subset of values.
for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) { for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
KEY_TYPE value = getValue(data[index]); KEY_TYPE value = getValue(data[index]);
minimum = min(minimum, value); minimum = min(minimum, value);
maximum = max(maximum, value); maximum = max(maximum, value);
} }
// Now reduce them. // Now reduce them and save partial results
minBuffer[threadIdx.x] = minimum; reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[blockIdx.x * 2 + 0], &range[blockIdx.x * 2 + 1]);
maxBuffer[threadIdx.x] = maximum; __threadfence();
__syncthreads(); if (threadIdx.x == 0) {
for (unsigned int step = 1; step < blockDim.x; step *= 2) { isLastFinishedBlock = atomicAdd(&counters[0], 1) + 1 == gridDim.x;
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0) {
minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
} }
__syncthreads(); __syncthreads();
// The last block reduce partial results
if (isLastFinishedBlock) {
for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
minimum = min(minimum, range[index * 2 + 0]);
maximum = max(maximum, range[index * 2 + 1]);
} }
minimum = minBuffer[0]; reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
maximum = maxBuffer[0];
if (threadIdx.x == 0) {
range[0] = minimum;
range[1] = maximum;
} }
// Clear the bucket counters in preparation for the next kernel. // Clear the bucket counters in preparation for the next kernel.
for (unsigned int index = threadIdx.x; index < numBuckets; index += blockDim.x) for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < numBuckets; index += blockDim.x*gridDim.x)
bucketOffset[index] = 0; bucketOffset[index] = 0;
} }
...@@ -137,8 +158,8 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi ...@@ -137,8 +158,8 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi
* Sum the bucket sizes to compute the start position of each bucket. This kernel * Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group. * is executed as a single work group.
*/ */
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) { __global__ __launch_bounds__(1024) void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
HIP_DYNAMIC_SHARED( unsigned int, posBuffer) extern __shared__ unsigned int posBuffer[];
unsigned int globalOffset = 0; unsigned int globalOffset = 0;
for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) { for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
// Load the bucket sizes into local memory. // Load the bucket sizes into local memory.
...@@ -163,6 +184,9 @@ __global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __ ...@@ -163,6 +184,9 @@ __global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __
bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset; bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
globalOffset += posBuffer[blockDim.x-1]; globalOffset += posBuffer[blockDim.x-1];
} }
if (threadIdx.x == 0) {
counters[0] = 0;
}
} }
/** /**
...@@ -180,27 +204,28 @@ __global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* ...@@ -180,27 +204,28 @@ __global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE*
/** /**
* Sort the data in each bucket. * Sort the data in each bucket.
*/ */
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) { __global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, const unsigned int* __restrict__ bucketOffset) {
HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer) extern __shared__ DATA_TYPE dataBuffer[];
for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) { unsigned int index = blockIdx.x;
unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]); unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
unsigned int endIndex = bucketOffset[index]; unsigned int endIndex = bucketOffset[index];
unsigned int length = endIndex-startIndex; unsigned int length = endIndex-startIndex;
unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
if (length <= blockDim.x) { if (length <= blockDim.x) {
// Load the data into local memory. // Load the data into local memory.
if (threadIdx.x < length) if (threadIdx.x < length)
dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x]; dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
else else if (threadIdx.x < lenghtNextPow2)
dataBuffer[threadIdx.x] = MAX_VALUE; dataBuffer[threadIdx.x] = MAX_VALUE;
__syncthreads(); __syncthreads();
// Perform a bitonic sort in local memory. // Perform a bitonic sort in local memory.
for (unsigned int k = 2; k <= blockDim.x; k *= 2) { for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) { for (unsigned int j = k/2; j > 0; j /= 2) {
int ixj = threadIdx.x^j; int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) { if (threadIdx.x < lenghtNextPow2 && ixj > threadIdx.x) {
DATA_TYPE value1 = dataBuffer[threadIdx.x]; DATA_TYPE value1 = dataBuffer[threadIdx.x];
DATA_TYPE value2 = dataBuffer[ixj]; DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = (threadIdx.x&k) == 0; bool ascending = (threadIdx.x&k) == 0;
...@@ -225,21 +250,18 @@ __global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __res ...@@ -225,21 +250,18 @@ __global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __res
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
data[startIndex+i] = buckets[startIndex+i]; data[startIndex+i] = buckets[startIndex+i];
__threadfence_block();
__syncthreads(); __syncthreads();
// Perform a bitonic sort in global memory. // Perform a bitonic sort in global memory.
for (unsigned int k = 2; k < 2*length; k *= 2) { for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) { for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) { for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j; int ixj = i^j;
if (ixj > i && ixj < length) { if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i]; DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj]; DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0); bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2)); KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1)); KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) { if (lowKey > highKey) {
...@@ -248,12 +270,10 @@ __global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __res ...@@ -248,12 +270,10 @@ __global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __res
} }
} }
} }
__threadfence_block();
__syncthreads(); __syncthreads();
} }
} }
} }
}
} }
} }
...@@ -60,7 +60,7 @@ class SortTrait : public HipSort::SortTrait { ...@@ -60,7 +60,7 @@ class SortTrait : public HipSort::SortTrait {
const char* getSortKey() const {return "value";} const char* getSortKey() const {return "value";}
}; };
void verifySorting(vector<float> array) { void verifySorting(vector<float> array, bool uniform) {
// Sort the array. // Sort the array.
System system; System system;
...@@ -72,7 +72,7 @@ void verifySorting(vector<float> array) { ...@@ -72,7 +72,7 @@ void verifySorting(vector<float> array) {
context.initialize(); context.initialize();
HipArray data(context, array.size(), 4, "sortData"); HipArray data(context, array.size(), 4, "sortData");
data.upload(array); data.upload(array);
HipSort sort(context, new SortTrait(), array.size()); HipSort sort(context, new SortTrait(), array.size(), uniform);
sort.sort(data); sort.sort(data);
vector<float> sorted; vector<float> sorted;
data.download(sorted); data.download(sorted);
...@@ -93,30 +93,26 @@ void testUniformValues() { ...@@ -93,30 +93,26 @@ void testUniformValues() {
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
vector<float> array(10000); for (auto size : { 2, 63, 100, 1234, 10000, 60123, 876543}) {
vector<float> array(size);
for (int i = 0; i < (int) array.size(); i++) for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt); array[i] = (float) genrand_real2(sfmt);
verifySorting(array); verifySorting(array, true);
verifySorting(array, false);
}
} }
void testLogValues() { void testLogValues() {
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
vector<float> array(10000); for (auto size : { 2, 63, 100, 1234, 10000, 60123, 876543}) {
vector<float> array(size);
for (int i = 0; i < (int) array.size(); i++) for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt)); array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array); verifySorting(array, true);
} verifySorting(array, false);
}
void testShortList() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(500);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
} }
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
...@@ -125,7 +121,6 @@ int main(int argc, char* argv[]) { ...@@ -125,7 +121,6 @@ int main(int argc, char* argv[]) {
platform.setPropertyDefaultValue("HipPrecision", string(argv[1])); platform.setPropertyDefaultValue("HipPrecision", string(argv[1]));
testUniformValues(); testUniformValues();
testLogValues(); testLogValues();
testShortList();
} }
catch(const exception& e) { catch(const exception& e) {
cout << "exception: " << e.what() << endl; cout << "exception: " << e.what() << endl;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment