Created OpenCL sorting algorithm

6c542539 · Peter Eastman · 07073906 · 6c542539 · 6c542539 · 6c542539
Commit 6c542539 authored Jan 20, 2010 by Peter Eastman
3 changed files
--- a/platforms/opencl/src/OpenCLSort.h
+++ b/platforms/opencl/src/OpenCLSort.h
+#ifndef __OPENMM_OPENCLSORT_H__
+#define __OPENMM_OPENCLSORT_H__
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "OpenCLArray.h"
+#include "OpenCLKernelSources.h"
+#include "openmm/internal/windowsExport.h"
+#include <map>
+namespace OpenMM {
+/**
+ * This class sorts arrays of values.  It supports any type of values, not just scalars,
+ * so long as an appropriate sorting key can be defined by which to sort them.
+ *
+ * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
+ * (in local memory when possible, in global memory otherwise).  This is similar to
+ * the algorithm described in
+ *
+ * Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng.  "An Efficient
+ * Sorting Algorithm with CUDA"  Journal of the Chinese Institute of Engineers, 32(7),
+ * pp. 915-921 (2009)
+ *
+ * but with many modifications and simplifications.  In particular, this algorithm
+ * involves much less communication between host and device, which is critical to get
+ * good performance with the array sizes we typically work with (10,000 to 100,000
+ * elements).
+ */
+template <class TYPE>
+class OPENMM_EXPORT OpenCLSort {
+public:
+    /**
+     * Create an OpenCLSort object for sorting data of a particular type.
+     *
+     * @param context    the context in which to perform calculations
+     * @param length     the length of the arrays this object will be used to sort
+     * @param typeName   the name of the data type being sorting (e.g. "float")
+     * @param sortKey    an expression that returns the value by which the variable "value" should be sorted.
+     *                   For primitive types, this will simply be "value".
+     */
+    OpenCLSort(OpenCLContext& context, int length, const std::string& typeName, const std::string& sortKey) : context(context),
+            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
+        // Create kernels.
+        std::map<std::string, std::string> replacements;
+        replacements["TYPE"] = typeName;
+        replacements["SORT_KEY"] = sortKey;
+        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
+        computeRangeKernel = cl::Kernel(program, "computeRange");
+        assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
+        computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
+        copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
+        sortBucketsKernel = cl::Kernel(program, "sortBuckets");
+        // Work out the work group sizes for various kernels.
+        int maxGroupSize = context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+        for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
+            ;
+        positionsKernelSize = rangeKernelSize;
+        sortKernelSize = rangeKernelSize/2;
+        if (rangeKernelSize > length)
+            rangeKernelSize = length;
+        int maxLocalBuffer = (context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(TYPE))/2;
+        if (sortKernelSize > maxLocalBuffer)
+            sortKernelSize = maxLocalBuffer;
+        int targetBucketSize = sortKernelSize/2;
+        int numBuckets = length/targetBucketSize;
+        if (positionsKernelSize > numBuckets)
+            positionsKernelSize = numBuckets;
+        // Create workspace arrays.
+        dataRange = new OpenCLArray<mm_float2>(context, 1, "sortDataRange");
+        bucketOffset = new OpenCLArray<cl_int>(context, numBuckets, "bucketOffset");
+        bucketOfElement = new OpenCLArray<cl_int>(context, length, "bucketOfElement");
+        offsetInBucket = new OpenCLArray<cl_int>(context, length, "offsetInBucket");
+        buckets = new OpenCLArray<TYPE>(context, length, "buckets");
+    }
+    ~OpenCLSort() {
+        if (dataRange != NULL)
+            delete dataRange;
+        if (bucketOfElement != NULL)
+            delete bucketOfElement;
+        if (offsetInBucket != NULL)
+            delete offsetInBucket;
+        if (bucketOffset != NULL)
+            delete bucketOffset;
+        if (buckets != NULL)
+            delete buckets;
+    }
+    /**
+     * Sort an array.
+     */
+    void sort(OpenCLArray<TYPE>& data) {
+        // Compute the range of data values.
+        computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+        computeRangeKernel.setArg<cl_int>(1, data.getSize());
+        computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
+        computeRangeKernel.setArg(3, rangeKernelSize*sizeof(cl_float), NULL);
+        context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
+        // Assign array elements to buckets.
+        int numBuckets = bucketOffset->getSize();
+        context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets);
+        assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+        assignElementsKernel.setArg<cl_int>(1, data.getSize());
+        assignElementsKernel.setArg<cl_int>(2, numBuckets);
+        assignElementsKernel.setArg<cl::Buffer>(3, dataRange->getDeviceBuffer());
+        assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset->getDeviceBuffer());
+        assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement->getDeviceBuffer());
+        assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket->getDeviceBuffer());
+        context.executeKernel(assignElementsKernel, data.getSize());
+        // Compute the position of each bucket.
+        computeBucketPositionsKernel.setArg<cl_int>(0, numBuckets);
+        computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset->getDeviceBuffer());
+        computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
+        context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);
+        // Copy the data into the buckets.
+        copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+        copyToBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
+        copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
+        copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
+        copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement->getDeviceBuffer());
+        copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket->getDeviceBuffer());
+        context.executeKernel(copyToBucketsKernel, data.getSize());
+        // Sort each bucket.
+        sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
+        sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
+        sortBucketsKernel.setArg<cl_int>(2, numBuckets);
+        sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
+        sortBucketsKernel.setArg(4, sortKernelSize*sizeof(mm_float2), NULL);
+        context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
+    }
+private:
+    OpenCLContext& context;
+    OpenCLArray<mm_float2>* dataRange;
+    OpenCLArray<cl_int>* bucketOfElement;
+    OpenCLArray<cl_int>* offsetInBucket;
+    OpenCLArray<cl_int>* bucketOffset;
+    OpenCLArray<TYPE>* buckets;
+    cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    int rangeKernelSize, positionsKernelSize, sortKernelSize;
+};
+} // namespace OpenMM
+#endif // __OPENMM_OPENCLSORT_H__
\ No newline at end of file
--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+float getValue(TYPE value) {
+    return SORT_KEY;
+}
+/**
+ * Calculate the minimum and maximum value in the array to be sorted.  This kernel
+ * is executed as a single work group.
+ */
+__kernel void computeRange(__global TYPE* data, int length, __global float2* range, __local float* buffer) {
+    float minimum = MAXFLOAT;
+    float maximum = -MAXFLOAT;
+    // Each thread calculates the range of a subset of values.
+    for (int index = get_local_id(0); index < length; index += get_local_size(0)) {
+        float value = getValue(data[index]);
+        minimum = min(minimum, value);
+        maximum = max(maximum, value);
+    }
+    // Now reduce them.
+    buffer[get_local_id(0)] = minimum;
+    for (int step = 1; step < get_local_size(0); step *= 2) {
+        if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
+            buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
+            barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    minimum = buffer[0];
+    buffer[get_local_id(0)] = maximum;
+    for (int step = 1; step < get_local_size(0); step *= 2) {
+        if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
+            buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
+            barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    maximum = buffer[0];
+    if (get_local_id(0) == 0)
+        range[0] = (float2) (minimum, maximum);
+}
+/**
+ * Assign elements to buckets.
+ */
+__kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBuckets, __global float2* range,
+        __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
+    float2 dataRange = range[0];
+    float minValue = dataRange.x;
+    float maxValue = dataRange.y;
+    float bucketWidth = (maxValue-minValue)/numBuckets;
+    for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
+        TYPE element = data[index];
+        float value = getValue(element);
+        int bucketIndex = min((int) ((value-minValue)/bucketWidth), numBuckets-1);
+        offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
+        bucketOfElement[index] = bucketIndex;
+    }
+}
+/**
+ * Sum the bucket sizes to compute the start position of each bucket.  This kernel
+ * is executed as a single work group.
+ */
+__kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset, __local int* buffer) {
+    int globalOffset = 0;
+    for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
+        // Load the bucket sizes into local memory.
+        int globalIndex = startBucket+get_local_id(0);
+        buffer[get_local_id(0)] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Perform a parallel prefix sum.
+        for (int step = 1; step < get_local_size(0); step *= 2) {
+            if (get_local_id(0) >= step)
+                buffer[get_local_id(0)] += buffer[get_local_id(0)-step];
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        // Write the results back to global memory.
+        if (globalIndex < numBuckets)
+            bucketOffset[globalIndex] = buffer[get_local_id(0)]+globalOffset;
+        globalOffset += buffer[get_local_size(0)-1];
+    }
+}
+/**
+ * Copy the input data into the buckets for sorting.
+ */
+__kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int length, __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
+    for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
+        TYPE element = data[index];
+        int bucketIndex = bucketOfElement[index];
+        int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
+        buckets[offset+offsetInBucket[index]] = element;
+    }
+}
+/**
+ * Sort the data in each bucket.
+ */
+__kernel void sortBuckets(__global TYPE* data, __global TYPE* buckets, int numBuckets, __global int* bucketOffset, __local TYPE* buffer) {
+    for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
+        int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
+        int endIndex = bucketOffset[index];
+        int length = endIndex-startIndex;
+        if (length <= get_local_size(0)) {
+            // Load the data into local memory.
+            buffer[get_local_id(0)] = (get_local_id(0) < length ? buckets[startIndex+get_local_id(0)] : MAXFLOAT);
+            barrier(CLK_LOCAL_MEM_FENCE);
+            // Perform a bitonic sort in local memory.
+            for (int k = 2; k <= get_local_size(0); k *= 2) {
+                for (int j = k/2; j > 0; j /= 2) {
+                    int ixj = get_local_id(0)^j;
+                    if (ixj > get_local_id(0)) {
+                        if (((get_local_id(0)&k) == 0 && getValue(buffer[get_local_id(0)]) > getValue(buffer[ixj])) ||
+                            ((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) {
+                            TYPE temp = buffer[get_local_id(0)];
+                            buffer[get_local_id(0)] = buffer[ixj];
+                            buffer[ixj] = temp;
+                        }
+                    }
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                }
+            }
+            // Write the data to the sorted array.
+            if (get_local_id(0) < length)
+                data[startIndex+get_local_id(0)] = buffer[get_local_id(0)];
+        }
+        else {
+            // Copy the bucket data over to the output array.
+            for (int i = get_local_id(0); i < length; i += get_local_size(0))
+                data[startIndex+i] = buckets[startIndex+i];
+            barrier(CLK_GLOBAL_MEM_FENCE);
+            // Perform a bitonic sort in global memory.
+            for (int k = 2; k < 2*length; k *= 2) {
+                for (int j = k/2; j > 0; j /= 2) {
+                    for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
+                        int ixj = i^j;
+                        if (ixj > i && ixj < length) {
+                            TYPE value1 = data[startIndex+i];
+                            TYPE value2 = data[startIndex+ixj];
+                            bool ascending = ((i&k) == 0);
+                            for (int mask = k*2; mask < 2*length; mask *= 2)
+                                ascending = ((i&mask) == 0 ? !ascending : ascending);
+                            if ((ascending && getValue(value1) > getValue(value2)) ||
+                                (!ascending && getValue(value1) < getValue(value2))) {
+                                data[startIndex+i] = value2;
+                                data[startIndex+ixj] = value1;
+                            }
+                        }
+                    }
+                    barrier(CLK_GLOBAL_MEM_FENCE);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
--- a/platforms/opencl/tests/TestOpenCLSort.cpp
+++ b/platforms/opencl/tests/TestOpenCLSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the OpenCL implementation of sorting.
+ */
+#include "../../../tests/AssertionUtilities.h"
+#include "../src/OpenCLArray.h"
+#include "../src/OpenCLContext.h"
+#include "../src/OpenCLSort.h"
+#include "../src/sfmt/SFMT.h"
+#include "openmm/System.h"
+#include <iostream>
+#include <cmath>
+#include <set>
+using namespace OpenMM;
+using namespace std;
+void verifySorting(vector<float> array) {
+    // Sort the array.
+    System system;
+    system.addParticle(0.0);
+    OpenCLContext context(1, -1);
+    context.initialize(system);
+    OpenCLArray<float> data(context, array.size(), "sortData");
+    data.upload(array);
+    OpenCLSort<float> sort(context, array.size(), "float", "value");
+    sort.sort(data);
+    vector<float> sorted;
+    data.download(sorted);
+    // Verify that it is in sorted order.
+    for (int i = 1; i < (int) sorted.size(); i++)
+        ASSERT(sorted[i-1] <= sorted[i]);
+    // Make sure the sorted array contains the same values as the original one.
+    multiset<float> elements1(array.begin(), array.end());
+    multiset<float> elements2(sorted.begin(), sorted.end());
+    ASSERT(elements1 == elements2);
+}
+void testUniformValues()
+{
+    init_gen_rand(0);
+    vector<float> array(10000);
+    for (int i = 0; i < (int) array.size(); i++)
+        array[i] = (float) genrand_real2();
+    verifySorting(array);
+}
+void testLogValues()
+{
+    init_gen_rand(0);
+    vector<float> array(10000);
+    for (int i = 0; i < (int) array.size(); i++)
+        array[i] = (float) log(genrand_real2());
+    verifySorting(array);
+}
+int main() {
+    try {
+        testUniformValues();
+        testLogValues();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}