Commit 6c542539 authored by Peter Eastman's avatar Peter Eastman
Browse files

Created OpenCL sorting algorithm

parent 07073906
#ifndef __OPENMM_OPENCLSORT_H__
#define __OPENMM_OPENCLSORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "OpenCLArray.h"
#include "OpenCLKernelSources.h"
#include "openmm/internal/windowsExport.h"
#include <map>
namespace OpenMM {
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
template <class TYPE>
class OPENMM_EXPORT OpenCLSort {
public:
/**
* Create an OpenCLSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param length the length of the arrays this object will be used to sort
* @param typeName the name of the data type being sorting (e.g. "float")
* @param sortKey an expression that returns the value by which the variable "value" should be sorted.
* For primitive types, this will simply be "value".
*/
OpenCLSort(OpenCLContext& context, int length, const std::string& typeName, const std::string& sortKey) : context(context),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
// Create kernels.
std::map<std::string, std::string> replacements;
replacements["TYPE"] = typeName;
replacements["SORT_KEY"] = sortKey;
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
computeRangeKernel = cl::Kernel(program, "computeRange");
assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
sortBucketsKernel = cl::Kernel(program, "sortBuckets");
// Work out the work group sizes for various kernels.
int maxGroupSize = context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
sortKernelSize = rangeKernelSize/2;
if (rangeKernelSize > length)
rangeKernelSize = length;
int maxLocalBuffer = (context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(TYPE))/2;
if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer;
int targetBucketSize = sortKernelSize/2;
int numBuckets = length/targetBucketSize;
if (positionsKernelSize > numBuckets)
positionsKernelSize = numBuckets;
// Create workspace arrays.
dataRange = new OpenCLArray<mm_float2>(context, 1, "sortDataRange");
bucketOffset = new OpenCLArray<cl_int>(context, numBuckets, "bucketOffset");
bucketOfElement = new OpenCLArray<cl_int>(context, length, "bucketOfElement");
offsetInBucket = new OpenCLArray<cl_int>(context, length, "offsetInBucket");
buckets = new OpenCLArray<TYPE>(context, length, "buckets");
}
~OpenCLSort() {
if (dataRange != NULL)
delete dataRange;
if (bucketOfElement != NULL)
delete bucketOfElement;
if (offsetInBucket != NULL)
delete offsetInBucket;
if (bucketOffset != NULL)
delete bucketOffset;
if (buckets != NULL)
delete buckets;
}
/**
* Sort an array.
*/
void sort(OpenCLArray<TYPE>& data) {
// Compute the range of data values.
computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
computeRangeKernel.setArg<cl_int>(1, data.getSize());
computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
computeRangeKernel.setArg(3, rangeKernelSize*sizeof(cl_float), NULL);
context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
// Assign array elements to buckets.
int numBuckets = bucketOffset->getSize();
context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets);
assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
assignElementsKernel.setArg<cl_int>(1, data.getSize());
assignElementsKernel.setArg<cl_int>(2, numBuckets);
assignElementsKernel.setArg<cl::Buffer>(3, dataRange->getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset->getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement->getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket->getDeviceBuffer());
context.executeKernel(assignElementsKernel, data.getSize());
// Compute the position of each bucket.
computeBucketPositionsKernel.setArg<cl_int>(0, numBuckets);
computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset->getDeviceBuffer());
computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);
// Copy the data into the buckets.
copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement->getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket->getDeviceBuffer());
context.executeKernel(copyToBucketsKernel, data.getSize());
// Sort each bucket.
sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
sortBucketsKernel.setArg<cl_int>(2, numBuckets);
sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
sortBucketsKernel.setArg(4, sortKernelSize*sizeof(mm_float2), NULL);
context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
}
private:
OpenCLContext& context;
OpenCLArray<mm_float2>* dataRange;
OpenCLArray<cl_int>* bucketOfElement;
OpenCLArray<cl_int>* offsetInBucket;
OpenCLArray<cl_int>* bucketOffset;
OpenCLArray<TYPE>* buckets;
cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
int rangeKernelSize, positionsKernelSize, sortKernelSize;
};
} // namespace OpenMM
#endif // __OPENMM_OPENCLSORT_H__
\ No newline at end of file
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
float getValue(TYPE value) {
return SORT_KEY;
}
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
*/
__kernel void computeRange(__global TYPE* data, int length, __global float2* range, __local float* buffer) {
float minimum = MAXFLOAT;
float maximum = -MAXFLOAT;
// Each thread calculates the range of a subset of values.
for (int index = get_local_id(0); index < length; index += get_local_size(0)) {
float value = getValue(data[index]);
minimum = min(minimum, value);
maximum = max(maximum, value);
}
// Now reduce them.
buffer[get_local_id(0)] = minimum;
for (int step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE);
}
minimum = buffer[0];
buffer[get_local_id(0)] = maximum;
for (int step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
barrier(CLK_LOCAL_MEM_FENCE);
}
maximum = buffer[0];
if (get_local_id(0) == 0)
range[0] = (float2) (minimum, maximum);
}
/**
* Assign elements to buckets.
*/
__kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBuckets, __global float2* range,
__global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
float2 dataRange = range[0];
float minValue = dataRange.x;
float maxValue = dataRange.y;
float bucketWidth = (maxValue-minValue)/numBuckets;
for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
TYPE element = data[index];
float value = getValue(element);
int bucketIndex = min((int) ((value-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
bucketOfElement[index] = bucketIndex;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset, __local int* buffer) {
int globalOffset = 0;
for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
// Load the bucket sizes into local memory.
int globalIndex = startBucket+get_local_id(0);
buffer[get_local_id(0)] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
barrier(CLK_LOCAL_MEM_FENCE);
// Perform a parallel prefix sum.
for (int step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0) >= step)
buffer[get_local_id(0)] += buffer[get_local_id(0)-step];
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write the results back to global memory.
if (globalIndex < numBuckets)
bucketOffset[globalIndex] = buffer[get_local_id(0)]+globalOffset;
globalOffset += buffer[get_local_size(0)-1];
}
}
/**
* Copy the input data into the buckets for sorting.
*/
__kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int length, __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
TYPE element = data[index];
int bucketIndex = bucketOfElement[index];
int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
buckets[offset+offsetInBucket[index]] = element;
}
}
/**
* Sort the data in each bucket.
*/
__kernel void sortBuckets(__global TYPE* data, __global TYPE* buckets, int numBuckets, __global int* bucketOffset, __local TYPE* buffer) {
for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
int endIndex = bucketOffset[index];
int length = endIndex-startIndex;
if (length <= get_local_size(0)) {
// Load the data into local memory.
buffer[get_local_id(0)] = (get_local_id(0) < length ? buckets[startIndex+get_local_id(0)] : MAXFLOAT);
barrier(CLK_LOCAL_MEM_FENCE);
// Perform a bitonic sort in local memory.
for (int k = 2; k <= get_local_size(0); k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
int ixj = get_local_id(0)^j;
if (ixj > get_local_id(0)) {
if (((get_local_id(0)&k) == 0 && getValue(buffer[get_local_id(0)]) > getValue(buffer[ixj])) ||
((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) {
TYPE temp = buffer[get_local_id(0)];
buffer[get_local_id(0)] = buffer[ixj];
buffer[ixj] = temp;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Write the data to the sorted array.
if (get_local_id(0) < length)
data[startIndex+get_local_id(0)] = buffer[get_local_id(0)];
}
else {
// Copy the bucket data over to the output array.
for (int i = get_local_id(0); i < length; i += get_local_size(0))
data[startIndex+i] = buckets[startIndex+i];
barrier(CLK_GLOBAL_MEM_FENCE);
// Perform a bitonic sort in global memory.
for (int k = 2; k < 2*length; k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
int ixj = i^j;
if (ixj > i && ixj < length) {
TYPE value1 = data[startIndex+i];
TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0);
for (int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
if ((ascending && getValue(value1) > getValue(value2)) ||
(!ascending && getValue(value1) < getValue(value2))) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
}
}
}
\ No newline at end of file
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the OpenCL implementation of sorting.
*/
#include "../../../tests/AssertionUtilities.h"
#include "../src/OpenCLArray.h"
#include "../src/OpenCLContext.h"
#include "../src/OpenCLSort.h"
#include "../src/sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using namespace OpenMM;
using namespace std;
void verifySorting(vector<float> array) {
// Sort the array.
System system;
system.addParticle(0.0);
OpenCLContext context(1, -1);
context.initialize(system);
OpenCLArray<float> data(context, array.size(), "sortData");
data.upload(array);
OpenCLSort<float> sort(context, array.size(), "float", "value");
sort.sort(data);
vector<float> sorted;
data.download(sorted);
// Verify that it is in sorted order.
for (int i = 1; i < (int) sorted.size(); i++)
ASSERT(sorted[i-1] <= sorted[i]);
// Make sure the sorted array contains the same values as the original one.
multiset<float> elements1(array.begin(), array.end());
multiset<float> elements2(sorted.begin(), sorted.end());
ASSERT(elements1 == elements2);
}
void testUniformValues()
{
init_gen_rand(0);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2();
verifySorting(array);
}
void testLogValues()
{
init_gen_rand(0);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2());
verifySorting(array);
}
int main() {
try {
testUniformValues();
testLogValues();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment