Commit c35407a8 authored by Peter Eastman's avatar Peter Eastman
Browse files

Tony Tye's optimizations to sorting. Also a couple of other very minor fixes.

parent 052aea3e
......@@ -90,13 +90,13 @@ public:
/**
* Get the size of the array.
*/
int getSize() {
int getSize() const {
return size;
}
/**
* Get the name of the array.
*/
const std::string& getName() {
const std::string& getName() const {
return name;
}
/**
......
......@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
else
simdWidth = 1;
if (platforms[0].getInfo<CL_PLATFORM_VENDOR>() == "Apple" && vendor == "AMD")
compilationDefines["MAC_AMD_WORKAROUND"] == "";
compilationDefines["MAC_AMD_WORKAROUND"] = "";
if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision)
......
......@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu)
pmeBsplineDTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
pmeBsplineDTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDTheta");
pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort<mm_int2>(cl, cl.getNumAtoms(), "int2", "value.y");
sort = new OpenCLSort<SortTrait>(cl, cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
// Initialize the b-spline moduli.
......
......@@ -481,6 +481,16 @@ public:
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
private:
struct SortTrait {
typedef mm_int2 DataType;
typedef cl_int KeyType;
static const char* clDataType() {return "int2";}
static const char* clKeyType() {return "int";}
static const char* clMinKey() {return "INT_MIN";}
static const char* clMaxKey() {return "INT_MAX";}
static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
static const char* clSortKey() {return "value.y";}
};
OpenCLContext& cl;
bool hasInitializedKernel;
OpenCLArray<mm_float2>* sigmaEpsilon;
......@@ -495,7 +505,7 @@ private:
OpenCLArray<mm_float4>* pmeBsplineDTheta;
OpenCLArray<cl_int>* pmeAtomRange;
OpenCLArray<mm_int2>* pmeAtomGridIndex;
OpenCLSort<mm_int2>* sort;
OpenCLSort<SortTrait>* sort;
OpenCLFFT3D* fft;
cl::Kernel ewaldSumsKernel;
cl::Kernel ewaldForcesKernel;
......
#include "OpenCLSort.h"
template class OpenMM::OpenCLSort<float>;
......@@ -38,6 +38,28 @@ namespace OpenMM {
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The class is templatized by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* struct FloatTrait {
* // The name of the data and key types being sorted.
* // Both the host type and OpenCL type is required.
* // For primitive types they will be the same.
* typedef cl_float DataType;
* typedef cl_float KeyType;
* static const char* clDataType() {return "float";}
* static const char* clKeyType() {return "float";}
* // The minimum value a key can take.
* static const char* clMinKey() {return "-MAXFLOAT";}
* // The maximum value a key can take.
* static const char* clMaxKey() {return "MAXFLOAT";}
* // A value whose key is guaranteed to equal clMaxKey().
* static const char* clMaxValue() {return "MAXFLOAT";}
* // The OpenCL code to select the key from the data value.
* static const char* clSortKey() {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
......@@ -52,7 +74,7 @@ namespace OpenMM {
* elements).
*/
template <class TYPE>
template <class TRAIT>
class OPENMM_EXPORT OpenCLSort {
public:
/**
......@@ -60,17 +82,18 @@ public:
*
* @param context the context in which to perform calculations
* @param length the length of the arrays this object will be used to sort
* @param typeName the name of the data type being sorting (e.g. "float")
* @param sortKey an expression that returns the value by which the variable "value" should be sorted.
* For primitive types, this will simply be "value".
*/
OpenCLSort(OpenCLContext& context, int length, const std::string& typeName, const std::string& sortKey) : context(context),
OpenCLSort(OpenCLContext& context, unsigned int length) : context(context),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
// Create kernels.
std::map<std::string, std::string> replacements;
replacements["TYPE"] = typeName;
replacements["SORT_KEY"] = sortKey;
replacements["DATA_TYPE"] = TRAIT::clDataType();
replacements["KEY_TYPE"] = TRAIT::clKeyType();
replacements["SORT_KEY"] = TRAIT::clSortKey();
replacements["MIN_KEY"] = TRAIT::clMinKey();
replacements["MAX_KEY"] = TRAIT::clMaxKey();
replacements["MAX_VALUE"] = TRAIT::clMaxValue();
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
computeRangeKernel = cl::Kernel(program, "computeRange");
assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
......@@ -80,18 +103,18 @@ public:
// Work out the work group sizes for various kernels.
int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
sortKernelSize = rangeKernelSize/2;
if (rangeKernelSize > length)
rangeKernelSize = length;
int maxLocalBuffer = (int) ((context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(TYPE))/2);
unsigned int maxLocalBuffer = (unsigned int) ((context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(typename TRAIT::DataType))/2);
if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer;
int targetBucketSize = sortKernelSize/2;
int numBuckets = length/targetBucketSize;
unsigned int targetBucketSize = sortKernelSize/2;
unsigned int numBuckets = length/targetBucketSize;
if (numBuckets < 1)
numBuckets = 1;
if (positionsKernelSize > numBuckets)
......@@ -99,11 +122,11 @@ public:
// Create workspace arrays.
dataRange = new OpenCLArray<mm_float2>(context, 1, "sortDataRange");
bucketOffset = new OpenCLArray<cl_int>(context, numBuckets, "bucketOffset");
bucketOfElement = new OpenCLArray<cl_int>(context, length, "bucketOfElement");
offsetInBucket = new OpenCLArray<cl_int>(context, length, "offsetInBucket");
buckets = new OpenCLArray<TYPE>(context, length, "buckets");
dataRange = new OpenCLArray<typename TRAIT::KeyType>(context, 2, "sortDataRange");
bucketOffset = new OpenCLArray<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement = new OpenCLArray<cl_uint>(context, length, "bucketOfElement");
offsetInBucket = new OpenCLArray<cl_uint>(context, length, "offsetInBucket");
buckets = new OpenCLArray<typename TRAIT::DataType>(context, length, "buckets");
}
~OpenCLSort() {
if (dataRange != NULL)
......@@ -120,18 +143,24 @@ public:
/**
* Sort an array.
*/
void sort(OpenCLArray<TYPE>& data) {
void sort(OpenCLArray<typename TRAIT::DataType>& data) {
if (data.getSize() != bucketOfElement->getSize())
throw OpenMMException("OpenCLSort called with different data size");
if (data.getSize() == 0)
return;
// Compute the range of data values.
computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
computeRangeKernel.setArg<cl_int>(1, data.getSize());
computeRangeKernel.setArg<cl_uint>(1, data.getSize());
computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
computeRangeKernel.setArg(3, rangeKernelSize*sizeof(cl_float), NULL);
computeRangeKernel.setArg(3, rangeKernelSize*sizeof(typename TRAIT::KeyType), NULL);
context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
// Assign array elements to buckets.
int numBuckets = bucketOffset->getSize();
unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets);
assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
assignElementsKernel.setArg<cl_int>(1, data.getSize());
......@@ -165,18 +194,18 @@ public:
sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
sortBucketsKernel.setArg<cl_int>(2, numBuckets);
sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
sortBucketsKernel.setArg(4, sortKernelSize*sizeof(TYPE), NULL);
sortBucketsKernel.setArg(4, sortKernelSize*sizeof(typename TRAIT::DataType), NULL);
context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
}
private:
OpenCLContext& context;
OpenCLArray<mm_float2>* dataRange;
OpenCLArray<cl_int>* bucketOfElement;
OpenCLArray<cl_int>* offsetInBucket;
OpenCLArray<cl_int>* bucketOffset;
OpenCLArray<TYPE>* buckets;
OpenCLArray<typename TRAIT::KeyType>* dataRange;
OpenCLArray<cl_uint>* bucketOfElement;
OpenCLArray<cl_uint>* offsetInBucket;
OpenCLArray<cl_uint>* bucketOffset;
OpenCLArray<typename TRAIT::DataType>* buckets;
cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
int rangeKernelSize, positionsKernelSize, sortKernelSize;
unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
};
} // namespace OpenMM
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
float getValue(TYPE value) {
KEY_TYPE getValue(DATA_TYPE value) {
return SORT_KEY;
}
......@@ -8,14 +8,14 @@ float getValue(TYPE value) {
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
*/
__kernel void computeRange(__global const TYPE* restrict data, int length, __global float2* restrict range, __local float* restrict buffer) {
float minimum = MAXFLOAT;
float maximum = -MAXFLOAT;
__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict buffer) {
KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY;
// Each thread calculates the range of a subset of values.
for (int index = get_local_id(0); index < length; index += get_local_size(0)) {
float value = getValue(data[index]);
for (uint index = get_local_id(0); index < length; index += get_local_size(0)) {
KEY_TYPE value = getValue(data[index]);
minimum = min(minimum, value);
maximum = max(maximum, value);
}
......@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
buffer[get_local_id(0)] = minimum;
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < get_local_size(0); step *= 2) {
for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
minimum = buffer[0];
buffer[get_local_id(0)] = maximum;
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < get_local_size(0); step *= 2) {
for (uint step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maximum = buffer[0];
if (get_local_id(0) == 0)
range[0] = (float2) (minimum, maximum);
if (get_local_id(0) == 0) {
range[0] = minimum;
range[1] = maximum;
}
}
/**
* Assign elements to buckets.
*/
__kernel void assignElementsToBuckets(__global const TYPE* restrict data, int length, int numBuckets, __global const float2* restrict range,
__global int* bucketOffset, __global int* restrict bucketOfElement, __global int* restrict offsetInBucket) {
__kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, uint length, uint numBuckets, __global const KEY_TYPE* restrict range,
__global uint* bucketOffset, __global uint* restrict bucketOfElement, __global uint* restrict offsetInBucket) {
#ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
......@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
if (get_global_id(0) == get_local_id(0)+1)
((__global char*)bucketOffset)[sizeof(int)*numBuckets+1] = 0;
#endif
float2 dataRange = range[0];
float minValue = dataRange.x;
float maxValue = dataRange.y;
float minValue = (float) (range[0]);
float maxValue = (float) (range[1]);
float bucketWidth = (maxValue-minValue)/numBuckets;
for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
#ifdef MAC_AMD_WORKAROUND
__global int* d = (__global int*) data;
int2 element = (int2) (d[2*index], d[2*index+1]);
float key = (float) getValue(element);
#else
TYPE element = data[index];
float key = (float) getValue(data[index]);
#endif
float value = getValue(element);
int bucketIndex = min((int) ((value-minValue)/bucketWidth), numBuckets-1);
uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
bucketOfElement[index] = bucketIndex;
}
......@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__kernel void computeBucketPositions(int numBuckets, __global int* restrict bucketOffset, __local int* restrict buffer) {
int globalOffset = 0;
for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
__kernel void computeBucketPositions(uint numBuckets, __global uint* restrict bucketOffset, __local uint* restrict buffer) {
uint globalOffset = 0;
for (uint startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
// Load the bucket sizes into local memory.
int globalIndex = startBucket+get_local_id(0);
uint globalIndex = startBucket+get_local_id(0);
buffer[get_local_id(0)] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
// Perform a parallel prefix sum.
for (int step = 1; step < get_local_size(0); step *= 2) {
int add = (get_local_id(0) >= step ? buffer[get_local_id(0)-step] : 0);
for (uint step = 1; step < get_local_size(0); step *= 2) {
uint add = (get_local_id(0) >= step ? buffer[get_local_id(0)-step] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
buffer[get_local_id(0)] += add;
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck
/**
* Copy the input data into the buckets for sorting.
*/
__kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYPE* restrict buckets, int length, __global const int* restrict bucketOffset, __global const int* restrict bucketOfElement, __global const int* restrict offsetInBucket) {
for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
TYPE element = data[index];
int bucketIndex = bucketOfElement[index];
int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
__kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __global DATA_TYPE* restrict buckets, uint length, __global const uint* restrict bucketOffset, __global const uint* restrict bucketOfElement, __global const uint* restrict offsetInBucket) {
for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
DATA_TYPE element = data[index];
uint bucketIndex = bucketOfElement[index];
uint offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
buckets[offset+offsetInBucket[index]] = element;
}
}
......@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP
/**
* Sort the data in each bucket.
*/
__kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int numBuckets, __global const int* restrict bucketOffset, __local TYPE* buffer) {
for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
int endIndex = bucketOffset[index];
int length = endIndex-startIndex;
__kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA_TYPE* restrict buckets, uint numBuckets, __global const uint* restrict bucketOffset, __local DATA_TYPE* restrict buffer) {
for (uint index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
uint startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
uint endIndex = bucketOffset[index];
uint length = endIndex-startIndex;
if (length <= get_local_size(0)) {
// Load the data into local memory.
buffer[get_local_id(0)] = (get_local_id(0) < length ? buckets[startIndex+get_local_id(0)] : (TYPE) MAXFLOAT);
if (get_local_id(0) < length)
buffer[get_local_id(0)] = buckets[startIndex+get_local_id(0)];
else
buffer[get_local_id(0)] = MAX_VALUE;
barrier(CLK_LOCAL_MEM_FENCE);
// Perform a bitonic sort in local memory.
for (int k = 2; k <= get_local_size(0); k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
for (uint k = 2; k <= get_local_size(0); k *= 2) {
for (uint j = k/2; j > 0; j /= 2) {
int ixj = get_local_id(0)^j;
if (ixj > get_local_id(0)) {
if (((get_local_id(0)&k) == 0 && getValue(buffer[get_local_id(0)]) > getValue(buffer[ixj])) ||
((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) {
TYPE temp = buffer[get_local_id(0)];
buffer[get_local_id(0)] = buffer[ixj];
buffer[ixj] = temp;
DATA_TYPE value1 = buffer[get_local_id(0)];
DATA_TYPE value2 = buffer[ixj];
bool ascending = (get_local_id(0)&k) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
buffer[get_local_id(0)] = value2;
buffer[ixj] = value1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int
else {
// Copy the bucket data over to the output array.
for (int i = get_local_id(0); i < length; i += get_local_size(0))
for (uint i = get_local_id(0); i < length; i += get_local_size(0))
data[startIndex+i] = buckets[startIndex+i];
barrier(CLK_GLOBAL_MEM_FENCE);
// Perform a bitonic sort in global memory.
for (int k = 2; k < 2*length; k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
for (uint k = 2; k < 2*length; k *= 2) {
for (uint j = k/2; j > 0; j /= 2) {
for (uint i = get_local_id(0); i < length; i += get_local_size(0)) {
int ixj = i^j;
if (ixj > i && ixj < length) {
TYPE value1 = data[startIndex+i];
TYPE value2 = data[startIndex+ixj];
DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0);
for (int mask = k*2; mask < 2*length; mask *= 2)
for (uint mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
if ((ascending && getValue(value1) > getValue(value2)) ||
(!ascending && getValue(value1) < getValue(value2))) {
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
......
......@@ -46,6 +46,17 @@
using namespace OpenMM;
using namespace std;
struct SortTrait {
typedef cl_float DataType;
typedef cl_float KeyType;
static const char* clDataType() {return "float";}
static const char* clKeyType() {return "float";}
static const char* clMinKey() {return "-MAXFLOAT";}
static const char* clMaxKey() {return "MAXFLOAT";}
static const char* clMaxValue() {return "MAXFLOAT";}
static const char* clSortKey() {return "value";}
};
void verifySorting(vector<float> array) {
// Sort the array.
......@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) {
context.initialize(system);
OpenCLArray<float> data(context, array.size(), "sortData");
data.upload(array);
OpenCLSort<float> sort(context, array.size(), "float", "value");
OpenCLSort<SortTrait> sort(context, array.size());
sort.sort(data);
vector<float> sorted;
data.download(sorted);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment