Commit c35407a8 authored by Peter Eastman's avatar Peter Eastman
Browse files

Tony Tye's optimizations to sorting. Also a couple of other very minor fixes.

parent 052aea3e
...@@ -90,13 +90,13 @@ public: ...@@ -90,13 +90,13 @@ public:
/** /**
* Get the size of the array. * Get the size of the array.
*/ */
int getSize() { int getSize() const {
return size; return size;
} }
/** /**
* Get the name of the array. * Get the name of the array.
*/ */
const std::string& getName() { const std::string& getName() const {
return name; return name;
} }
/** /**
......
...@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
else else
simdWidth = 1; simdWidth = 1;
if (platforms[0].getInfo<CL_PLATFORM_VENDOR>() == "Apple" && vendor == "AMD") if (platforms[0].getInfo<CL_PLATFORM_VENDOR>() == "Apple" && vendor == "AMD")
compilationDefines["MAC_AMD_WORKAROUND"] == ""; compilationDefines["MAC_AMD_WORKAROUND"] = "";
if (supports64BitGlobalAtomics) if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = ""; compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision) if (supportsDoublePrecision)
......
...@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta"); pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu) if (deviceIsCpu)
pmeBsplineDTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta"); pmeBsplineDTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDTheta");
pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort<mm_int2>(cl, cl.getNumAtoms(), "int2", "value.y"); sort = new OpenCLSort<SortTrait>(cl, cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ); fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
......
...@@ -481,6 +481,16 @@ public: ...@@ -481,6 +481,16 @@ public:
*/ */
double execute(ContextImpl& context, bool includeForces, bool includeEnergy); double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
private: private:
struct SortTrait {
typedef mm_int2 DataType;
typedef cl_int KeyType;
static const char* clDataType() {return "int2";}
static const char* clKeyType() {return "int";}
static const char* clMinKey() {return "INT_MIN";}
static const char* clMaxKey() {return "INT_MAX";}
static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
static const char* clSortKey() {return "value.y";}
};
OpenCLContext& cl; OpenCLContext& cl;
bool hasInitializedKernel; bool hasInitializedKernel;
OpenCLArray<mm_float2>* sigmaEpsilon; OpenCLArray<mm_float2>* sigmaEpsilon;
...@@ -495,7 +505,7 @@ private: ...@@ -495,7 +505,7 @@ private:
OpenCLArray<mm_float4>* pmeBsplineDTheta; OpenCLArray<mm_float4>* pmeBsplineDTheta;
OpenCLArray<cl_int>* pmeAtomRange; OpenCLArray<cl_int>* pmeAtomRange;
OpenCLArray<mm_int2>* pmeAtomGridIndex; OpenCLArray<mm_int2>* pmeAtomGridIndex;
OpenCLSort<mm_int2>* sort; OpenCLSort<SortTrait>* sort;
OpenCLFFT3D* fft; OpenCLFFT3D* fft;
cl::Kernel ewaldSumsKernel; cl::Kernel ewaldSumsKernel;
cl::Kernel ewaldForcesKernel; cl::Kernel ewaldForcesKernel;
......
#include "OpenCLSort.h" #include "OpenCLSort.h"
template class OpenMM::OpenCLSort<float>;
...@@ -37,6 +37,28 @@ namespace OpenMM { ...@@ -37,6 +37,28 @@ namespace OpenMM {
/** /**
* This class sorts arrays of values. It supports any type of values, not just scalars, * This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them. * so long as an appropriate sorting key can be defined by which to sort them.
*
* The class is templatized by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* struct FloatTrait {
* // The name of the data and key types being sorted.
* // Both the host type and OpenCL type is required.
* // For primitive types they will be the same.
* typedef cl_float DataType;
* typedef cl_float KeyType;
* static const char* clDataType() {return "float";}
* static const char* clKeyType() {return "float";}
* // The minimum value a key can take.
* static const char* clMinKey() {return "-MAXFLOAT";}
* // The maximum value a key can take.
* static const char* clMaxKey() {return "MAXFLOAT";}
* // A value whose key is guaranteed to equal clMaxKey().
* static const char* clMaxValue() {return "MAXFLOAT";}
* // The OpenCL code to select the key from the data value.
* static const char* clSortKey() {return "value";}
* };
* *
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to * (in local memory when possible, in global memory otherwise). This is similar to
...@@ -51,8 +73,8 @@ namespace OpenMM { ...@@ -51,8 +73,8 @@ namespace OpenMM {
* good performance with the array sizes we typically work with (10,000 to 100,000 * good performance with the array sizes we typically work with (10,000 to 100,000
* elements). * elements).
*/ */
template <class TYPE> template <class TRAIT>
class OPENMM_EXPORT OpenCLSort { class OPENMM_EXPORT OpenCLSort {
public: public:
/** /**
...@@ -60,17 +82,18 @@ public: ...@@ -60,17 +82,18 @@ public:
* *
* @param context the context in which to perform calculations * @param context the context in which to perform calculations
* @param length the length of the arrays this object will be used to sort * @param length the length of the arrays this object will be used to sort
* @param typeName the name of the data type being sorting (e.g. "float")
* @param sortKey an expression that returns the value by which the variable "value" should be sorted.
* For primitive types, this will simply be "value".
*/ */
OpenCLSort(OpenCLContext& context, int length, const std::string& typeName, const std::string& sortKey) : context(context), OpenCLSort(OpenCLContext& context, unsigned int length) : context(context),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) { dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
// Create kernels. // Create kernels.
std::map<std::string, std::string> replacements; std::map<std::string, std::string> replacements;
replacements["TYPE"] = typeName; replacements["DATA_TYPE"] = TRAIT::clDataType();
replacements["SORT_KEY"] = sortKey; replacements["KEY_TYPE"] = TRAIT::clKeyType();
replacements["SORT_KEY"] = TRAIT::clSortKey();
replacements["MIN_KEY"] = TRAIT::clMinKey();
replacements["MAX_KEY"] = TRAIT::clMaxKey();
replacements["MAX_VALUE"] = TRAIT::clMaxValue();
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
computeRangeKernel = cl::Kernel(program, "computeRange"); computeRangeKernel = cl::Kernel(program, "computeRange");
assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets"); assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
...@@ -80,18 +103,18 @@ public: ...@@ -80,18 +103,18 @@ public:
// Work out the work group sizes for various kernels. // Work out the work group sizes for various kernels.
int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2) for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = rangeKernelSize; positionsKernelSize = rangeKernelSize;
sortKernelSize = rangeKernelSize/2; sortKernelSize = rangeKernelSize/2;
if (rangeKernelSize > length) if (rangeKernelSize > length)
rangeKernelSize = length; rangeKernelSize = length;
int maxLocalBuffer = (int) ((context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(TYPE))/2); unsigned int maxLocalBuffer = (unsigned int) ((context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(typename TRAIT::DataType))/2);
if (sortKernelSize > maxLocalBuffer) if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer; sortKernelSize = maxLocalBuffer;
int targetBucketSize = sortKernelSize/2; unsigned int targetBucketSize = sortKernelSize/2;
int numBuckets = length/targetBucketSize; unsigned int numBuckets = length/targetBucketSize;
if (numBuckets < 1) if (numBuckets < 1)
numBuckets = 1; numBuckets = 1;
if (positionsKernelSize > numBuckets) if (positionsKernelSize > numBuckets)
...@@ -99,11 +122,11 @@ public: ...@@ -99,11 +122,11 @@ public:
// Create workspace arrays. // Create workspace arrays.
dataRange = new OpenCLArray<mm_float2>(context, 1, "sortDataRange"); dataRange = new OpenCLArray<typename TRAIT::KeyType>(context, 2, "sortDataRange");
bucketOffset = new OpenCLArray<cl_int>(context, numBuckets, "bucketOffset"); bucketOffset = new OpenCLArray<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement = new OpenCLArray<cl_int>(context, length, "bucketOfElement"); bucketOfElement = new OpenCLArray<cl_uint>(context, length, "bucketOfElement");
offsetInBucket = new OpenCLArray<cl_int>(context, length, "offsetInBucket"); offsetInBucket = new OpenCLArray<cl_uint>(context, length, "offsetInBucket");
buckets = new OpenCLArray<TYPE>(context, length, "buckets"); buckets = new OpenCLArray<typename TRAIT::DataType>(context, length, "buckets");
} }
~OpenCLSort() { ~OpenCLSort() {
if (dataRange != NULL) if (dataRange != NULL)
...@@ -120,18 +143,24 @@ public: ...@@ -120,18 +143,24 @@ public:
/** /**
* Sort an array. * Sort an array.
*/ */
void sort(OpenCLArray<TYPE>& data) { void sort(OpenCLArray<typename TRAIT::DataType>& data) {
if (data.getSize() != bucketOfElement->getSize())
throw OpenMMException("OpenCLSort called with different data size");
if (data.getSize() == 0)
return;
// Compute the range of data values. // Compute the range of data values.
computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
computeRangeKernel.setArg<cl_int>(1, data.getSize()); computeRangeKernel.setArg<cl_uint>(1, data.getSize());
computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer()); computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
computeRangeKernel.setArg(3, rangeKernelSize*sizeof(cl_float), NULL); computeRangeKernel.setArg(3, rangeKernelSize*sizeof(typename TRAIT::KeyType), NULL);
context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize); context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
// Assign array elements to buckets. // Assign array elements to buckets.
int numBuckets = bucketOffset->getSize(); unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets); context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets);
assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
assignElementsKernel.setArg<cl_int>(1, data.getSize()); assignElementsKernel.setArg<cl_int>(1, data.getSize());
...@@ -165,18 +194,18 @@ public: ...@@ -165,18 +194,18 @@ public:
sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer()); sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
sortBucketsKernel.setArg<cl_int>(2, numBuckets); sortBucketsKernel.setArg<cl_int>(2, numBuckets);
sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer()); sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
sortBucketsKernel.setArg(4, sortKernelSize*sizeof(TYPE), NULL); sortBucketsKernel.setArg(4, sortKernelSize*sizeof(typename TRAIT::DataType), NULL);
context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize); context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
} }
private: private:
OpenCLContext& context; OpenCLContext& context;
OpenCLArray<mm_float2>* dataRange; OpenCLArray<typename TRAIT::KeyType>* dataRange;
OpenCLArray<cl_int>* bucketOfElement; OpenCLArray<cl_uint>* bucketOfElement;
OpenCLArray<cl_int>* offsetInBucket; OpenCLArray<cl_uint>* offsetInBucket;
OpenCLArray<cl_int>* bucketOffset; OpenCLArray<cl_uint>* bucketOffset;
OpenCLArray<TYPE>* buckets; OpenCLArray<typename TRAIT::DataType>* buckets;
cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
int rangeKernelSize, positionsKernelSize, sortKernelSize; unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
}; };
} // namespace OpenMM } // namespace OpenMM
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
float getValue(TYPE value) { KEY_TYPE getValue(DATA_TYPE value) {
return SORT_KEY; return SORT_KEY;
} }
...@@ -8,14 +8,14 @@ float getValue(TYPE value) { ...@@ -8,14 +8,14 @@ float getValue(TYPE value) {
* Calculate the minimum and maximum value in the array to be sorted. This kernel * Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group. * is executed as a single work group.
*/ */
__kernel void computeRange(__global const TYPE* restrict data, int length, __global float2* restrict range, __local float* restrict buffer) { __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict buffer) {
float minimum = MAXFLOAT; KEY_TYPE minimum = MAX_KEY;
float maximum = -MAXFLOAT; KEY_TYPE maximum = MIN_KEY;
// Each thread calculates the range of a subset of values. // Each thread calculates the range of a subset of values.
for (int index = get_local_id(0); index < length; index += get_local_size(0)) { for (uint index = get_local_id(0); index < length; index += get_local_size(0)) {
float value = getValue(data[index]); KEY_TYPE value = getValue(data[index]);
minimum = min(minimum, value); minimum = min(minimum, value);
maximum = max(maximum, value); maximum = max(maximum, value);
} }
...@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo ...@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
buffer[get_local_id(0)] = minimum; buffer[get_local_id(0)] = minimum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < get_local_size(0); step *= 2) { for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0) if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]); buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo ...@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
minimum = buffer[0]; minimum = buffer[0];
buffer[get_local_id(0)] = maximum; buffer[get_local_id(0)] = maximum;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < get_local_size(0); step *= 2) { for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0) if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]); buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
maximum = buffer[0]; maximum = buffer[0];
if (get_local_id(0) == 0) if (get_local_id(0) == 0) {
range[0] = (float2) (minimum, maximum); range[0] = minimum;
range[1] = maximum;
}
} }
/** /**
* Assign elements to buckets. * Assign elements to buckets.
*/ */
__kernel void assignElementsToBuckets(__global const TYPE* restrict data, int length, int numBuckets, __global const float2* restrict range, __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, uint length, uint numBuckets, __global const KEY_TYPE* restrict range,
__global int* bucketOffset, __global int* restrict bucketOfElement, __global int* restrict offsetInBucket) { __global uint* bucketOffset, __global uint* restrict bucketOfElement, __global uint* restrict offsetInBucket) {
#ifdef AMD_ATOMIC_WORK_AROUND #ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path. // Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path. // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
...@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le ...@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
if (get_global_id(0) == get_local_id(0)+1) if (get_global_id(0) == get_local_id(0)+1)
((__global char*)bucketOffset)[sizeof(int)*numBuckets+1] = 0; ((__global char*)bucketOffset)[sizeof(int)*numBuckets+1] = 0;
#endif #endif
float2 dataRange = range[0]; float minValue = (float) (range[0]);
float minValue = dataRange.x; float maxValue = (float) (range[1]);
float maxValue = dataRange.y;
float bucketWidth = (maxValue-minValue)/numBuckets; float bucketWidth = (maxValue-minValue)/numBuckets;
for (int index = get_global_id(0); index < length; index += get_global_size(0)) { for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
#ifdef MAC_AMD_WORKAROUND #ifdef MAC_AMD_WORKAROUND
__global int* d = (__global int*) data; __global int* d = (__global int*) data;
int2 element = (int2) (d[2*index], d[2*index+1]); int2 element = (int2) (d[2*index], d[2*index+1]);
float key = (float) getValue(element);
#else #else
TYPE element = data[index]; float key = (float) getValue(data[index]);
#endif #endif
float value = getValue(element); uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
int bucketIndex = min((int) ((value-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]); offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
bucketOfElement[index] = bucketIndex; bucketOfElement[index] = bucketIndex;
} }
...@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le ...@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
* Sum the bucket sizes to compute the start position of each bucket. This kernel * Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group. * is executed as a single work group.
*/ */
__kernel void computeBucketPositions(int numBuckets, __global int* restrict bucketOffset, __local int* restrict buffer) { __kernel void computeBucketPositions(uint numBuckets, __global uint* restrict bucketOffset, __local uint* restrict buffer) {
int globalOffset = 0; uint globalOffset = 0;
for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) { for (uint startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
// Load the bucket sizes into local memory. // Load the bucket sizes into local memory.
int globalIndex = startBucket+get_local_id(0); uint globalIndex = startBucket+get_local_id(0);
buffer[get_local_id(0)] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0); buffer[get_local_id(0)] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Perform a parallel prefix sum. // Perform a parallel prefix sum.
for (int step = 1; step < get_local_size(0); step *= 2) { for (uint step = 1; step < get_local_size(0); step *= 2) {
int add = (get_local_id(0) >= step ? buffer[get_local_id(0)-step] : 0); uint add = (get_local_id(0) >= step ? buffer[get_local_id(0)-step] : 0);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
buffer[get_local_id(0)] += add; buffer[get_local_id(0)] += add;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck ...@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck
/** /**
* Copy the input data into the buckets for sorting. * Copy the input data into the buckets for sorting.
*/ */
__kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYPE* restrict buckets, int length, __global const int* restrict bucketOffset, __global const int* restrict bucketOfElement, __global const int* restrict offsetInBucket) { __kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __global DATA_TYPE* restrict buckets, uint length, __global const uint* restrict bucketOffset, __global const uint* restrict bucketOfElement, __global const uint* restrict offsetInBucket) {
for (int index = get_global_id(0); index < length; index += get_global_size(0)) { for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
TYPE element = data[index]; DATA_TYPE element = data[index];
int bucketIndex = bucketOfElement[index]; uint bucketIndex = bucketOfElement[index];
int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]); uint offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
buckets[offset+offsetInBucket[index]] = element; buckets[offset+offsetInBucket[index]] = element;
} }
} }
...@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP ...@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP
/** /**
* Sort the data in each bucket. * Sort the data in each bucket.
*/ */
__kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int numBuckets, __global const int* restrict bucketOffset, __local TYPE* buffer) { __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA_TYPE* restrict buckets, uint numBuckets, __global const uint* restrict bucketOffset, __local DATA_TYPE* restrict buffer) {
for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) { for (uint index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
int startIndex = (index == 0 ? 0 : bucketOffset[index-1]); uint startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
int endIndex = bucketOffset[index]; uint endIndex = bucketOffset[index];
int length = endIndex-startIndex; uint length = endIndex-startIndex;
if (length <= get_local_size(0)) { if (length <= get_local_size(0)) {
// Load the data into local memory. // Load the data into local memory.
buffer[get_local_id(0)] = (get_local_id(0) < length ? buckets[startIndex+get_local_id(0)] : (TYPE) MAXFLOAT); if (get_local_id(0) < length)
buffer[get_local_id(0)] = buckets[startIndex+get_local_id(0)];
else
buffer[get_local_id(0)] = MAX_VALUE;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Perform a bitonic sort in local memory. // Perform a bitonic sort in local memory.
for (int k = 2; k <= get_local_size(0); k *= 2) { for (uint k = 2; k <= get_local_size(0); k *= 2) {
for (int j = k/2; j > 0; j /= 2) { for (uint j = k/2; j > 0; j /= 2) {
int ixj = get_local_id(0)^j; int ixj = get_local_id(0)^j;
if (ixj > get_local_id(0)) { if (ixj > get_local_id(0)) {
if (((get_local_id(0)&k) == 0 && getValue(buffer[get_local_id(0)]) > getValue(buffer[ixj])) || DATA_TYPE value1 = buffer[get_local_id(0)];
((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) { DATA_TYPE value2 = buffer[ixj];
TYPE temp = buffer[get_local_id(0)]; bool ascending = (get_local_id(0)&k) == 0;
buffer[get_local_id(0)] = buffer[ixj]; KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
buffer[ixj] = temp; KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
buffer[get_local_id(0)] = value2;
buffer[ixj] = value1;
} }
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int ...@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int
else { else {
// Copy the bucket data over to the output array. // Copy the bucket data over to the output array.
for (int i = get_local_id(0); i < length; i += get_local_size(0)) for (uint i = get_local_id(0); i < length; i += get_local_size(0))
data[startIndex+i] = buckets[startIndex+i]; data[startIndex+i] = buckets[startIndex+i];
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
// Perform a bitonic sort in global memory. // Perform a bitonic sort in global memory.
for (int k = 2; k < 2*length; k *= 2) { for (uint k = 2; k < 2*length; k *= 2) {
for (int j = k/2; j > 0; j /= 2) { for (uint j = k/2; j > 0; j /= 2) {
for (int i = get_local_id(0); i < length; i += get_local_size(0)) { for (uint i = get_local_id(0); i < length; i += get_local_size(0)) {
int ixj = i^j; int ixj = i^j;
if (ixj > i && ixj < length) { if (ixj > i && ixj < length) {
TYPE value1 = data[startIndex+i]; DATA_TYPE value1 = data[startIndex+i];
TYPE value2 = data[startIndex+ixj]; DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0); bool ascending = ((i&k) == 0);
for (int mask = k*2; mask < 2*length; mask *= 2) for (uint mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending); ascending = ((i&mask) == 0 ? !ascending : ascending);
if ((ascending && getValue(value1) > getValue(value2)) || KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
(!ascending && getValue(value1) < getValue(value2))) { KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2; data[startIndex+i] = value2;
data[startIndex+ixj] = value1; data[startIndex+ixj] = value1;
} }
......
...@@ -46,6 +46,17 @@ ...@@ -46,6 +46,17 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
struct SortTrait {
typedef cl_float DataType;
typedef cl_float KeyType;
static const char* clDataType() {return "float";}
static const char* clKeyType() {return "float";}
static const char* clMinKey() {return "-MAXFLOAT";}
static const char* clMaxKey() {return "MAXFLOAT";}
static const char* clMaxValue() {return "MAXFLOAT";}
static const char* clSortKey() {return "value";}
};
void verifySorting(vector<float> array) { void verifySorting(vector<float> array) {
// Sort the array. // Sort the array.
...@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) { ...@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) {
context.initialize(system); context.initialize(system);
OpenCLArray<float> data(context, array.size(), "sortData"); OpenCLArray<float> data(context, array.size(), "sortData");
data.upload(array); data.upload(array);
OpenCLSort<float> sort(context, array.size(), "float", "value"); OpenCLSort<SortTrait> sort(context, array.size());
sort.sort(data); sort.sort(data);
vector<float> sorted; vector<float> sorted;
data.download(sorted); data.download(sorted);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment