Unverified Commit 7279c539 authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Optimize sorting kernels and tune block sizes

* Compile kernels with max block size of 256 threads:
  The default hipcc behavior since ROCm 4.2 is to compile kernels
  with 1024 threads unless __launch_bounds__ is specified. This
  significantly increases register pressure especially in heavy kernels
  (double precision, for example), requiring register spilling;
* Optimize computeRange by using multiple blocks for reduction;
* Use blocks of 1024 threads for computeBucketPositions - it is executed
  as a single work group so larger block size is faster;
* Sort up-to lenghtNextPow2 instead of blockDim.x (faster for short
  buckets);
* Optimize sortShortList2;
* Optimize sortBuckets with bit instructions;
* Decrease bucket size for non-uniform sorting: too many buckets may
  have sizes too large to sort in shared memory;
* Add more sizes in tests.
parent aca24d5f
......@@ -342,7 +342,7 @@ public:
* Get the maximum number of threads in a thread block supported by this device.
*/
int getMaxThreadBlockSize() const {
return 1024;
return 256;
}
/**
* Get whether the device being used is a CPU. In some cases, different algorithms
......
......@@ -10,7 +10,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -77,9 +77,11 @@ public:
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the HipSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param length the length of the arrays this object will be used to sort.
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint.
*/
HipSort(HipContext& context, SortTrait* trait, unsigned int length);
HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
~HipSort();
/**
* Sort an array.
......@@ -88,14 +90,15 @@ public:
private:
HipContext& context;
SortTrait* trait;
HipArray counters;
HipArray dataRange;
HipArray bucketOfElement;
HipArray offsetInBucket;
HipArray bucketOffset;
HipArray buckets;
hipFunction_t shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
unsigned int dataLength, rangeKernelBlocks, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList, uniform;
};
/**
......
......@@ -431,6 +431,9 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
static_assert(8*sizeof(void*) == HipContext::TileSize);
string bits = intToString(8*sizeof(void*));
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
if (getMaxThreadBlockSize() < 1024) {
options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize());
}
stringstream src;
if (!options.empty())
src << "// Compilation Options: " << options << endl << endl;
......@@ -659,6 +662,18 @@ void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threa
}
}
void HipContext::executeKernelFlat(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int gridSize = (threads+blockSize-1)/blockSize;
hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != hipSuccess) {
stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
int HipContext::computeThreadBlockSize(double memory) const {
int maxShared = this->sharedMemPerBlock;
int max = (int) (maxShared/memory);
......@@ -738,7 +753,7 @@ void HipContext::clearAutoclearBuffers() {
double HipContext::reduceEnergy() {
int bufferSize = energyBuffer.getSize();
int workGroupSize = 512;
int workGroupSize = getMaxThreadBlockSize();
void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
energySum.download(pinnedBuffer);
......
......@@ -280,7 +280,7 @@ void HipNonbondedUtilities::initialize(const System& system) {
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]);
......
......@@ -7,7 +7,7 @@
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
......@@ -33,7 +33,8 @@
using namespace OpenMM;
using namespace std;
HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels.
map<string, string> replacements;
......@@ -54,32 +55,34 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : c
// Work out the work group sizes for various kernels.
int maxBlockSize;
hipDeviceGetAttribute(&maxBlockSize, hipDeviceAttributeMaxBlockDimX, context.getDevice());
int maxSharedMem;
hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
isShortList = (length <= maxShortList);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
sortKernelSize = 256;
rangeKernelSize = 256;
if (rangeKernelSize > length)
rangeKernelSize = length;
rangeKernelBlocks = (length + rangeKernelSize - 1) / rangeKernelSize;
if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = sortKernelSize/2;
unsigned int targetBucketSize = uniform ? sortKernelSize/2 : sortKernelSize/8;
unsigned int numBuckets = length/targetBucketSize;
if (numBuckets < 1)
numBuckets = 1;
// computeBucketPositions is executed as a single work group so larger block size is faster.
positionsKernelSize = 1024;
if (positionsKernelSize > numBuckets)
positionsKernelSize = numBuckets;
// Create workspace arrays.
if (!isShortList) {
dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
counters.initialize<unsigned int>(context, 1, "counters");
unsigned int zero = 0;
counters.upload(&zero);
dataRange.initialize(context, 2*rangeKernelBlocks, trait->getKeySize(), "sortDataRange");
bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
......@@ -101,7 +104,7 @@ void HipSort::sort(HipArray& data) {
if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength);
context.executeKernel(shortList2Kernel, sortArgs, dataLength, HipContext::ThreadBlockSize, HipContext::ThreadBlockSize*trait->getKeySize());
buckets.copyTo(data);
}
else {
......@@ -113,8 +116,8 @@ void HipSort::sort(HipArray& data) {
// Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelBlocks*rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
......@@ -124,7 +127,7 @@ void HipSort::sort(HipArray& data) {
// Compute the position of each bucket.
void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer()};
void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
// Copy the data into the buckets.
......@@ -135,7 +138,7 @@ void HipSort::sort(HipArray& data) {
// Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &bucketOffset.getDevicePointer()};
context.executeKernelFlat(sortBucketsKernel, sortArgs, numBuckets*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
}
......@@ -11,23 +11,23 @@ extern "C" {
__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
// Load the data into local memory.
HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer)
extern __shared__ DATA_TYPE dataBuffer[];
for (int index = threadIdx.x; index < length; index += blockDim.x)
dataBuffer[index] = data[index];
__syncthreads();
// Perform a bitonic sort in local memory.
for (unsigned int k = 2; k < 2*length; k *= 2) {
unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = dataBuffer[i];
DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = ((i&k) == 0);
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
......@@ -52,7 +52,7 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
* work, but also parallelizes much better.
*/
__global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
__shared__ DATA_TYPE dataBuffer[64];
extern __shared__ KEY_TYPE keyBuffer[];
int globalId = blockDim.x*blockIdx.x+threadIdx.x;
DATA_TYPE value = dataIn[globalId < length ? globalId : 0];
KEY_TYPE key = getValue(value);
......@@ -61,59 +61,80 @@ __global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE*
int numInBlock = min(static_cast<int>(blockDim.x), static_cast<int>(length-blockStart));
__syncthreads();
if (threadIdx.x < numInBlock)
dataBuffer[threadIdx.x] = dataIn[blockStart+threadIdx.x];
keyBuffer[threadIdx.x] = getValue(dataIn[blockStart+threadIdx.x]);
__syncthreads();
for (int i = 0; i < numInBlock; i++) {
KEY_TYPE otherKey = getValue(dataBuffer[i]);
if (otherKey < key || (otherKey == key && blockStart+i < globalId))
count++;
KEY_TYPE otherKey = keyBuffer[i];
count += (otherKey < key) | ((otherKey == key) & (blockStart+i < globalId));
}
}
if (globalId < length)
dataOut[count] = value;
}
inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE* minBuffer, KEY_TYPE* maxBuffer,
KEY_TYPE* minResult, KEY_TYPE* maxResult) {
minBuffer[threadIdx.x] = minimum;
maxBuffer[threadIdx.x] = maximum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if ((threadIdx.x+step < blockDim.x) & ((threadIdx.x&(2*step-1)) == 0)) {
minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
}
__syncthreads();
}
if (threadIdx.x == 0) {
*minResult = minBuffer[0];
*maxResult = maxBuffer[0];
}
}
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
* Calculate the minimum and maximum value in the array to be sorted.
*/
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
HIP_DYNAMIC_SHARED( KEY_TYPE, minBuffer)
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
extern __shared__ KEY_TYPE minBuffer[];
KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY;
__shared__ bool isLastFinishedBlock;
if (threadIdx.x == 0) {
isLastFinishedBlock = false;
}
// Each thread calculates the range of a subset of values.
for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
KEY_TYPE value = getValue(data[index]);
minimum = min(minimum, value);
maximum = max(maximum, value);
}
// Now reduce them.
// Now reduce them and save partial results
minBuffer[threadIdx.x] = minimum;
maxBuffer[threadIdx.x] = maximum;
reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[blockIdx.x * 2 + 0], &range[blockIdx.x * 2 + 1]);
__threadfence();
if (threadIdx.x == 0) {
isLastFinishedBlock = atomicAdd(&counters[0], 1) + 1 == gridDim.x;
}
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0) {
minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
// The last block reduce partial results
if (isLastFinishedBlock) {
for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
minimum = min(minimum, range[index * 2 + 0]);
maximum = max(maximum, range[index * 2 + 1]);
}
__syncthreads();
}
minimum = minBuffer[0];
maximum = maxBuffer[0];
if (threadIdx.x == 0) {
range[0] = minimum;
range[1] = maximum;
reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
}
// Clear the bucket counters in preparation for the next kernel.
for (unsigned int index = threadIdx.x; index < numBuckets; index += blockDim.x)
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < numBuckets; index += blockDim.x*gridDim.x)
bucketOffset[index] = 0;
}
......@@ -137,8 +158,8 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
HIP_DYNAMIC_SHARED( unsigned int, posBuffer)
__global__ __launch_bounds__(1024) void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
extern __shared__ unsigned int posBuffer[];
unsigned int globalOffset = 0;
for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
// Load the bucket sizes into local memory.
......@@ -163,6 +184,9 @@ __global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __
bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
globalOffset += posBuffer[blockDim.x-1];
}
if (threadIdx.x == 0) {
counters[0] = 0;
}
}
/**
......@@ -180,77 +204,73 @@ __global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE*
/**
* Sort the data in each bucket.
*/
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer)
for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
unsigned int endIndex = bucketOffset[index];
unsigned int length = endIndex-startIndex;
if (length <= blockDim.x) {
// Load the data into local memory.
if (threadIdx.x < length)
dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
else
dataBuffer[threadIdx.x] = MAX_VALUE;
__syncthreads();
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, const unsigned int* __restrict__ bucketOffset) {
extern __shared__ DATA_TYPE dataBuffer[];
unsigned int index = blockIdx.x;
unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
unsigned int endIndex = bucketOffset[index];
unsigned int length = endIndex-startIndex;
unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
if (length <= blockDim.x) {
// Load the data into local memory.
if (threadIdx.x < length)
dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
else if (threadIdx.x < lenghtNextPow2)
dataBuffer[threadIdx.x] = MAX_VALUE;
__syncthreads();
// Perform a bitonic sort in local memory.
// Perform a bitonic sort in local memory.
for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) {
DATA_TYPE value1 = dataBuffer[threadIdx.x];
DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
dataBuffer[threadIdx.x] = value2;
dataBuffer[ixj] = value1;
}
for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
int ixj = threadIdx.x^j;
if (threadIdx.x < lenghtNextPow2 && ixj > threadIdx.x) {
DATA_TYPE value1 = dataBuffer[threadIdx.x];
DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
dataBuffer[threadIdx.x] = value2;
dataBuffer[ixj] = value1;
}
__syncthreads();
}
__syncthreads();
}
}
// Write the data to the sorted array.
// Write the data to the sorted array.
if (threadIdx.x < length)
data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
}
else {
// Copy the bucket data over to the output array.
if (threadIdx.x < length)
data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
}
else {
// Copy the bucket data over to the output array.
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
data[startIndex+i] = buckets[startIndex+i];
__threadfence_block();
__syncthreads();
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
data[startIndex+i] = buckets[startIndex+i];
__syncthreads();
// Perform a bitonic sort in global memory.
for (unsigned int k = 2; k < 2*length; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0);
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
// Perform a bitonic sort in global memory.
for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
}
__threadfence_block();
__syncthreads();
}
__syncthreads();
}
}
}
......
......@@ -60,7 +60,7 @@ class SortTrait : public HipSort::SortTrait {
const char* getSortKey() const {return "value";}
};
void verifySorting(vector<float> array) {
void verifySorting(vector<float> array, bool uniform) {
// Sort the array.
System system;
......@@ -72,7 +72,7 @@ void verifySorting(vector<float> array) {
context.initialize();
HipArray data(context, array.size(), 4, "sortData");
data.upload(array);
HipSort sort(context, new SortTrait(), array.size());
HipSort sort(context, new SortTrait(), array.size(), uniform);
sort.sort(data);
vector<float> sorted;
data.download(sorted);
......@@ -93,30 +93,26 @@ void testUniformValues() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt);
verifySorting(array);
for (auto size : { 2, 63, 100, 1234, 10000, 60123, 876543}) {
vector<float> array(size);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt);
verifySorting(array, true);
verifySorting(array, false);
}
}
void testLogValues() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
}
void testShortList() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(500);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
for (auto size : { 2, 63, 100, 1234, 10000, 60123, 876543}) {
vector<float> array(size);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array, true);
verifySorting(array, false);
}
}
int main(int argc, char* argv[]) {
......@@ -125,7 +121,6 @@ int main(int argc, char* argv[]) {
platform.setPropertyDefaultValue("HipPrecision", string(argv[1]));
testUniformValues();
testLogValues();
testShortList();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment