Unverified Commit 71d9bb13 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Optimize sorting of non-uniformly distributed data (#3383)

* Optimized CudaSort for non-uniformly distributed data

* Optimized OpenCLSort for non-uniformly distributed data

* Further tuned distributing elements between buckets

* Copied optimizations over to OpenCL
parent 130124a3
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -77,8 +77,12 @@ public:
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the CudaSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
~CudaSort();
/**
* Sort an array.
......@@ -94,7 +98,7 @@ private:
CudaArray buckets;
CUfunction shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
bool isShortList, uniform;
};
/**
......
......@@ -279,7 +279,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]);
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -32,7 +32,8 @@
using namespace OpenMM;
using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels.
map<string, string> replacements;
......@@ -42,11 +43,12 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["UNIFORM"] = (uniform ? "1" : "0");
CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
shortList2Kernel = context.getKernel(module, "sortShortList2");
computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
sortBucketsKernel = context.getKernel(module, "sortBuckets");
......
......@@ -79,6 +79,7 @@ __global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE*
*/
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
#if UNIFORM
extern __shared__ KEY_TYPE minBuffer[];
KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
KEY_TYPE minimum = MAX_KEY;
......@@ -110,6 +111,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
range[0] = minimum;
range[1] = maximum;
}
#endif
// Clear the bucket counters in preparation for the next kernel.
......@@ -118,7 +120,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
}
/**
* Assign elements to buckets.
* Assign elements to buckets. This version is optimized for uniformly distributed data.
*/
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
......@@ -133,6 +135,85 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi
}
}
/**
* Assign elements to buckets. This version is optimized for non-uniformly distributed data.
*/
__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
// Load 64 datapoints and sort them to get an estimate of the data distribution.
__shared__ KEY_TYPE elements[64];
if (threadIdx.x < 64) {
int index = (int) (threadIdx.x*length/64.0);
elements[threadIdx.x] = getValue(data[index]);
}
__syncthreads();
for (unsigned int k = 2; k <= 64; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
if (threadIdx.x < 64) {
int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) {
KEY_TYPE value1 = elements[threadIdx.x];
KEY_TYPE value2 = elements[ixj];
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? value1 : value2);
KEY_TYPE highKey = (ascending ? value2 : value1);
if (lowKey > highKey) {
elements[threadIdx.x] = value2;
elements[ixj] = value1;
}
}
}
__syncthreads();
}
}
// Create a function composed of linear segments mapping data values to bucket indices.
__shared__ float segmentLowerBound[9];
__shared__ float segmentBaseIndex[9];
__shared__ float segmentIndexScale[9];
if (threadIdx.x == 0) {
segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]);
segmentLowerBound[1] = elements[5];
segmentLowerBound[2] = elements[10];
segmentLowerBound[3] = elements[20];
segmentLowerBound[4] = elements[30];
segmentLowerBound[5] = elements[40];
segmentLowerBound[6] = elements[50];
segmentLowerBound[7] = elements[60];
segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]);
segmentBaseIndex[0] = numBuckets/16;
segmentBaseIndex[1] = 3*numBuckets/16;
segmentBaseIndex[2] = 5*numBuckets/16;
segmentBaseIndex[3] = 7*numBuckets/16;
segmentBaseIndex[4] = 9*numBuckets/16;
segmentBaseIndex[5] = 11*numBuckets/16;
segmentBaseIndex[6] = 13*numBuckets/16;
segmentBaseIndex[7] = 15*numBuckets/16;
segmentBaseIndex[8] = numBuckets;
for (int i = 0; i < 8; i++)
if (segmentLowerBound[i+1] == segmentLowerBound[i])
segmentIndexScale[i] = 0;
else
segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]);
}
__syncthreads();
// Assign elements to buckets.
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
float key = (float) getValue(data[index]);
int segment;
for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++)
;
unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment];
bucketIndex = min(max(0, bucketIndex), numBuckets-1);
offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
bucketOfElement[index] = bucketIndex;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Portions copyright (c) 2008-2021 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -59,7 +59,7 @@ class SortTrait : public CudaSort::SortTrait {
const char* getSortKey() const {return "value";}
};
void verifySorting(vector<float> array) {
void verifySorting(vector<float> array, bool uniform) {
// Sort the array.
System system;
......@@ -72,7 +72,7 @@ void verifySorting(vector<float> array) {
context.setAsCurrent();
CudaArray data(context, array.size(), 4, "sortData");
data.upload(array);
CudaSort sort(context, new SortTrait(), array.size());
CudaSort sort(context, new SortTrait(), array.size(), uniform);
sort.sort(data);
vector<float> sorted;
data.download(sorted);
......@@ -96,7 +96,8 @@ void testUniformValues() {
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt);
verifySorting(array);
verifySorting(array, true);
verifySorting(array, false);
}
void testLogValues() {
......@@ -106,7 +107,8 @@ void testLogValues() {
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
verifySorting(array, true);
verifySorting(array, false);
}
void testShortList() {
......@@ -116,7 +118,8 @@ void testShortList() {
vector<float> array(500);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
verifySorting(array, true);
verifySorting(array, false);
}
int main(int argc, char* argv[]) {
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -77,8 +77,12 @@ public:
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the OpenCLSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param uniform whether the input data is expected to follow a uniform or nonuniform
* distribution. This argument is used only as a hint. It allows parts
* of the algorithm to be tuned for faster performance on the expected
* distribution.
*/
OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length);
OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
~OpenCLSort();
/**
* Sort an array.
......@@ -94,7 +98,7 @@ private:
OpenCLArray buckets;
cl::Kernel shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList, useShortList2;
bool isShortList, useShortList2, uniform;
};
/**
......
......@@ -305,7 +305,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
vector<cl_uint> count(1, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(count);
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2020 Stanford University and the Authors. *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -37,7 +37,8 @@
using namespace OpenMM;
using namespace std;
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length, bool uniform) :
context(context), trait(trait), dataLength(length), uniform(uniform) {
// Create kernels.
std::map<std::string, std::string> replacements;
......@@ -47,11 +48,13 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["UNIFORM"] = (uniform ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
shortListKernel = cl::Kernel(program, "sortShortList");
shortList2Kernel = cl::Kernel(program, "sortShortList2");
computeRangeKernel = cl::Kernel(program, "computeRange");
assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
assignElementsKernel = cl::Kernel(program, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
sortBucketsKernel = cl::Kernel(program, "sortBuckets");
......
......@@ -77,6 +77,7 @@ __kernel void sortShortList2(__global const DATA_TYPE* restrict dataIn, __global
*/
__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict minBuffer,
__local KEY_TYPE* restrict maxBuffer, uint numBuckets, __global uint* restrict bucketOffset) {
#if UNIFORM
KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY;
......@@ -106,6 +107,7 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length,
range[0] = minimum;
range[1] = maximum;
}
#endif
// Clear the bucket counters in preparation for the next kernel.
......@@ -114,7 +116,7 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length,
}
/**
* Assign elements to buckets.
* Assign elements to buckets. This version is optimized for uniformly distributed data.
*/
__kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, uint length, uint numBuckets, __global const KEY_TYPE* restrict range,
__global uint* restrict bucketOffset, __global uint* restrict bucketOfElement, __global uint* restrict offsetInBucket) {
......@@ -137,6 +139,85 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
}
}
/**
* Assign elements to buckets. This version is optimized for non-uniformly distributed data.
*/
__kernel void assignElementsToBuckets2(__global const DATA_TYPE* restrict data, uint length, uint numBuckets, __global const KEY_TYPE* restrict range,
__global uint* restrict bucketOffset, __global uint* restrict bucketOfElement, __global uint* restrict offsetInBucket) {
// Load 64 datapoints and sort them to get an estimate of the data distribution.
__local KEY_TYPE elements[64];
if (get_local_id(0) < 64) {
int index = (int) (get_local_id(0)*length/64.0);
elements[get_local_id(0)] = getValue(data[index]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int k = 2; k <= 64; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
if (get_local_id(0) < 64) {
int ixj = get_local_id(0)^j;
if (ixj > get_local_id(0)) {
KEY_TYPE value1 = elements[get_local_id(0)];
KEY_TYPE value2 = elements[ixj];
bool ascending = (get_local_id(0)&k) == 0;
KEY_TYPE lowKey = (ascending ? value1 : value2);
KEY_TYPE highKey = (ascending ? value2 : value1);
if (lowKey > highKey) {
elements[get_local_id(0)] = value2;
elements[ixj] = value1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Create a function composed of linear segments mapping data values to bucket indices.
__local float segmentLowerBound[9];
__local float segmentBaseIndex[9];
__local float segmentIndexScale[9];
if (get_local_id(0) == 0) {
segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]);
segmentLowerBound[1] = elements[5];
segmentLowerBound[2] = elements[10];
segmentLowerBound[3] = elements[20];
segmentLowerBound[4] = elements[30];
segmentLowerBound[5] = elements[40];
segmentLowerBound[6] = elements[50];
segmentLowerBound[7] = elements[60];
segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]);
segmentBaseIndex[0] = numBuckets/16;
segmentBaseIndex[1] = 3*numBuckets/16;
segmentBaseIndex[2] = 5*numBuckets/16;
segmentBaseIndex[3] = 7*numBuckets/16;
segmentBaseIndex[4] = 9*numBuckets/16;
segmentBaseIndex[5] = 11*numBuckets/16;
segmentBaseIndex[6] = 13*numBuckets/16;
segmentBaseIndex[7] = 15*numBuckets/16;
segmentBaseIndex[8] = numBuckets;
for (int i = 0; i < 8; i++)
if (segmentLowerBound[i+1] == segmentLowerBound[i])
segmentIndexScale[i] = 0;
else
segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]);
}
barrier(CLK_LOCAL_MEM_FENCE);
// Assign elements to buckets.
for (unsigned int index = get_global_id(0); index < length; index += get_global_size(0)) {
float key = (float) getValue(data[index]);
int segment;
for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++)
;
unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment];
bucketIndex = min(max((uint) 0, bucketIndex), numBuckets-1);
offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
bucketOfElement[index] = bucketIndex;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Portions copyright (c) 2008-2021 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -59,7 +59,7 @@ class SortTrait : public OpenCLSort::SortTrait {
const char* getSortKey() const {return "value";}
};
void verifySorting(vector<float> array) {
void verifySorting(vector<float> array, bool uniform) {
// Sort the array.
System system;
......@@ -69,7 +69,7 @@ void verifySorting(vector<float> array) {
context.initialize();
OpenCLArray data(context, array.size(), sizeof(float), "sortData");
data.upload(array);
OpenCLSort sort(context, new SortTrait(), array.size());
OpenCLSort sort(context, new SortTrait(), array.size(), uniform);
sort.sort(data);
vector<float> sorted;
data.download(sorted);
......@@ -93,7 +93,8 @@ void testUniformValues() {
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt);
verifySorting(array);
verifySorting(array, true);
verifySorting(array, false);
}
void testLogValues() {
......@@ -103,7 +104,8 @@ void testLogValues() {
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
verifySorting(array, true);
verifySorting(array, false);
}
void testShortList() {
......@@ -113,7 +115,8 @@ void testShortList() {
vector<float> array(500);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
verifySorting(array, true);
verifySorting(array, false);
}
int main(int argc, char* argv[]) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment