Tony Tye's optimizations to sorting. Also a couple of other very minor fixes.

c35407a8 · Peter Eastman · 052aea3e · c35407a8 · c35407a8 · c35407a8
Commit c35407a8 authored Jan 13, 2012 by Peter Eastman
8 changed files
--- a/platforms/opencl/src/OpenCLArray.h
+++ b/platforms/opencl/src/OpenCLArray.h
@@ -90,13 +90,13 @@ public:
    /**
     * Get the size of the array.
     */
-    int getSize() {
+    int getSize() const {
        return size;
    }
    /**
     * Get the name of the array.
     */
-    const std::string& getName() {
+    const std::string& getName() const {
        return name;
    }
    /**

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
        else
            simdWidth = 1;
        if (platforms[0].getInfo<CL_PLATFORM_VENDOR>() == "Apple" && vendor == "AMD")
-            compilationDefines["MAC_AMD_WORKAROUND"] == "";
+            compilationDefines["MAC_AMD_WORKAROUND"] = "";
        if (supports64BitGlobalAtomics)
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
        if (supportsDoublePrecision)

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
        if (deviceIsCpu)
-            pmeBsplineDTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
+            pmeBsplineDTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDTheta");
        pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
        pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
-        sort = new OpenCLSort<mm_int2>(cl, cl.getNumAtoms(), "int2", "value.y");
+        sort = new OpenCLSort<SortTrait>(cl, cl.getNumAtoms());
        fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);

        // Initialize the b-spline moduli.

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -481,6 +481,16 @@ public:
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
 private:
+    struct SortTrait {
+        typedef mm_int2 DataType;
+        typedef cl_int KeyType;
+        static const char* clDataType() {return "int2";}
+        static const char* clKeyType() {return "int";}
+        static const char* clMinKey() {return "INT_MIN";}
+        static const char* clMaxKey() {return "INT_MAX";}
+        static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
+        static const char* clSortKey() {return "value.y";}
+    };
    OpenCLContext& cl;
    bool hasInitializedKernel;
    OpenCLArray<mm_float2>* sigmaEpsilon;
@@ -495,7 +505,7 @@ private:
    OpenCLArray<mm_float4>* pmeBsplineDTheta;
    OpenCLArray<cl_int>* pmeAtomRange;
    OpenCLArray<mm_int2>* pmeAtomGridIndex;
-    OpenCLSort<mm_int2>* sort;
+    OpenCLSort<SortTrait>* sort;
    OpenCLFFT3D* fft;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;

--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
 #include "OpenCLSort.h"
-
-template class OpenMM::OpenCLSort<float>;
--- a/platforms/opencl/src/OpenCLSort.h
+++ b/platforms/opencl/src/OpenCLSort.h
@@ -38,6 +38,28 @@ namespace OpenMM {
 * This class sorts arrays of values.  It supports any type of values, not just scalars,
 * so long as an appropriate sorting key can be defined by which to sort them.
 * 
+ * The class is templatized by a "trait" class that defines the type of data to
+ * sort and the key for sorting it.  Here is an example of a trait class for
+ * sorting floats:
+ * 
+ * struct FloatTrait {
+ *     // The name of the data and key types being sorted.
+ *     // Both the host type and OpenCL type is required.
+ *     // For primitive types they will be the same.
+ *     typedef cl_float DataType;
+ *     typedef cl_float KeyType;
+ *     static const char* clDataType() {return "float";}
+ *     static const char* clKeyType() {return "float";}
+ *     // The minimum value a key can take.
+ *     static const char* clMinKey() {return "-MAXFLOAT";}
+ *     // The maximum value a key can take.
+ *     static const char* clMaxKey() {return "MAXFLOAT";}
+ *     // A value whose key is guaranteed to equal clMaxKey().
+ *     static const char* clMaxValue() {return "MAXFLOAT";}
+ *     // The OpenCL code to select the key from the data value.
+ *     static const char* clSortKey() {return "value";}
+ * };
+ *
 * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
 * (in local memory when possible, in global memory otherwise).  This is similar to
 * the algorithm described in
@@ -52,7 +74,7 @@ namespace OpenMM {
 * elements).
 */
    
-template <class TYPE>
+template <class TRAIT>
 class OPENMM_EXPORT OpenCLSort {
 public:
    /**
@@ -60,17 +82,18 @@ public:
     *
     * @param context    the context in which to perform calculations
     * @param length     the length of the arrays this object will be used to sort
-     * @param typeName   the name of the data type being sorting (e.g. "float")
-     * @param sortKey    an expression that returns the value by which the variable "value" should be sorted.
-     *                   For primitive types, this will simply be "value".
     */
-    OpenCLSort(OpenCLContext& context, int length, const std::string& typeName, const std::string& sortKey) : context(context),
+    OpenCLSort(OpenCLContext& context, unsigned int length) : context(context),
            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
        // Create kernels.

        std::map<std::string, std::string> replacements;
-        replacements["TYPE"] = typeName;
-        replacements["SORT_KEY"] = sortKey;
+        replacements["DATA_TYPE"] = TRAIT::clDataType();
+        replacements["KEY_TYPE"] =  TRAIT::clKeyType();
+        replacements["SORT_KEY"] = TRAIT::clSortKey();
+        replacements["MIN_KEY"] = TRAIT::clMinKey();
+        replacements["MAX_KEY"] = TRAIT::clMaxKey();
+        replacements["MAX_VALUE"] = TRAIT::clMaxValue();
        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
        computeRangeKernel = cl::Kernel(program, "computeRange");
        assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
@@ -80,18 +103,18 @@ public:

        // Work out the work group sizes for various kernels.

-        int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+        unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
        for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
            ;
        positionsKernelSize = rangeKernelSize;
        sortKernelSize = rangeKernelSize/2;
        if (rangeKernelSize > length)
            rangeKernelSize = length;
-        int maxLocalBuffer = (int) ((context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(TYPE))/2);
+        unsigned int maxLocalBuffer = (unsigned int) ((context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/sizeof(typename TRAIT::DataType))/2);
        if (sortKernelSize > maxLocalBuffer)
            sortKernelSize = maxLocalBuffer;
-        int targetBucketSize = sortKernelSize/2;
-        int numBuckets = length/targetBucketSize;
+        unsigned int targetBucketSize = sortKernelSize/2;
+        unsigned int numBuckets = length/targetBucketSize;
        if (numBuckets < 1)
            numBuckets = 1;
        if (positionsKernelSize > numBuckets)
@@ -99,11 +122,11 @@ public:

        // Create workspace arrays.

-        dataRange = new OpenCLArray<mm_float2>(context, 1, "sortDataRange");
-        bucketOffset = new OpenCLArray<cl_int>(context, numBuckets, "bucketOffset");
-        bucketOfElement = new OpenCLArray<cl_int>(context, length, "bucketOfElement");
-        offsetInBucket = new OpenCLArray<cl_int>(context, length, "offsetInBucket");
-        buckets = new OpenCLArray<TYPE>(context, length, "buckets");
+        dataRange = new OpenCLArray<typename TRAIT::KeyType>(context, 2, "sortDataRange");
+        bucketOffset = new OpenCLArray<cl_uint>(context, numBuckets, "bucketOffset");
+        bucketOfElement = new OpenCLArray<cl_uint>(context, length, "bucketOfElement");
+        offsetInBucket = new OpenCLArray<cl_uint>(context, length, "offsetInBucket");
+        buckets = new OpenCLArray<typename TRAIT::DataType>(context, length, "buckets");
    }
    ~OpenCLSort() {
        if (dataRange != NULL)
@@ -120,18 +143,24 @@ public:
    /**
     * Sort an array.
     */
-    void sort(OpenCLArray<TYPE>& data) {
+    void sort(OpenCLArray<typename TRAIT::DataType>& data) {
+
+        if (data.getSize() != bucketOfElement->getSize())
+            throw OpenMMException("OpenCLSort called with different data size");
+        if (data.getSize() == 0)
+            return;
+
        // Compute the range of data values.

        computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
-        computeRangeKernel.setArg<cl_int>(1, data.getSize());
+        computeRangeKernel.setArg<cl_uint>(1, data.getSize());
        computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
-        computeRangeKernel.setArg(3, rangeKernelSize*sizeof(cl_float), NULL);
+        computeRangeKernel.setArg(3, rangeKernelSize*sizeof(typename TRAIT::KeyType), NULL);
        context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);

        // Assign array elements to buckets.

-        int numBuckets = bucketOffset->getSize();
+        unsigned int numBuckets = bucketOffset->getSize();
        context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets);
        assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
        assignElementsKernel.setArg<cl_int>(1, data.getSize());
@@ -165,18 +194,18 @@ public:
        sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
        sortBucketsKernel.setArg<cl_int>(2, numBuckets);
        sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
-        sortBucketsKernel.setArg(4, sortKernelSize*sizeof(TYPE), NULL);
+        sortBucketsKernel.setArg(4, sortKernelSize*sizeof(typename TRAIT::DataType), NULL);
        context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
    }
 private:
    OpenCLContext& context;
-    OpenCLArray<mm_float2>* dataRange;
-    OpenCLArray<cl_int>* bucketOfElement;
-    OpenCLArray<cl_int>* offsetInBucket;
-    OpenCLArray<cl_int>* bucketOffset;
-    OpenCLArray<TYPE>* buckets;
+    OpenCLArray<typename TRAIT::KeyType>* dataRange;
+    OpenCLArray<cl_uint>* bucketOfElement;
+    OpenCLArray<cl_uint>* offsetInBucket;
+    OpenCLArray<cl_uint>* bucketOffset;
+    OpenCLArray<typename TRAIT::DataType>* buckets;
    cl::Kernel computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
-    int rangeKernelSize, positionsKernelSize, sortKernelSize;
+    unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
 };

 } // namespace OpenMM

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

-float getValue(TYPE value) {
+KEY_TYPE getValue(DATA_TYPE value) {
    return SORT_KEY;
 }

@@ -8,14 +8,14 @@ float getValue(TYPE value) {
 * Calculate the minimum and maximum value in the array to be sorted.  This kernel
 * is executed as a single work group.
 */
-__kernel void computeRange(__global const TYPE* restrict data, int length, __global float2* restrict range, __local float* restrict buffer) {
-    float minimum = MAXFLOAT;
-    float maximum = -MAXFLOAT;
+__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict buffer) {
+    KEY_TYPE minimum = MAX_KEY;
+    KEY_TYPE maximum = MIN_KEY;

    // Each thread calculates the range of a subset of values.

-    for (int index = get_local_id(0); index < length; index += get_local_size(0)) {
-        float value = getValue(data[index]);
+    for (uint index = get_local_id(0); index < length; index += get_local_size(0)) {
+        KEY_TYPE value = getValue(data[index]);
        minimum = min(minimum, value);
        maximum = max(maximum, value);
    }
@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo

    buffer[get_local_id(0)] = minimum;
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int step = 1; step < get_local_size(0); step *= 2) {
+    for (uint step = 1; step < get_local_size(0); step *= 2) {
        if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
            buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
    minimum = buffer[0];
    buffer[get_local_id(0)] = maximum;
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int step = 1; step < get_local_size(0); step *= 2) {
+    for (uint step = 1; step < get_local_size(0); step *= 2) {
        if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
            buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    maximum = buffer[0];
-    if (get_local_id(0) == 0)
-        range[0] = (float2) (minimum, maximum);
+    if (get_local_id(0) == 0) {
+        range[0] = minimum;
+        range[1] = maximum;
+    }
 }

 /**
 * Assign elements to buckets.
 */
-__kernel void assignElementsToBuckets(__global const TYPE* restrict data, int length, int numBuckets, __global const float2* restrict range,
-        __global int* bucketOffset, __global int* restrict bucketOfElement, __global int* restrict offsetInBucket) {
+__kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, uint length, uint numBuckets, __global const KEY_TYPE* restrict range,
+        __global uint* bucketOffset, __global uint* restrict bucketOfElement, __global uint* restrict offsetInBucket) {
 #ifdef AMD_ATOMIC_WORK_AROUND
    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
    if (get_global_id(0) == get_local_id(0)+1)
        ((__global char*)bucketOffset)[sizeof(int)*numBuckets+1] = 0;
 #endif
-    float2 dataRange = range[0];
-    float minValue = dataRange.x;
-    float maxValue = dataRange.y;
+    float minValue = (float) (range[0]);
+    float maxValue = (float) (range[1]);
    float bucketWidth = (maxValue-minValue)/numBuckets;
-    for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
+    for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
 #ifdef MAC_AMD_WORKAROUND
        __global int* d = (__global int*) data;
        int2 element = (int2) (d[2*index], d[2*index+1]);
+        float key = (float) getValue(element);
 #else
-        TYPE element = data[index];
+        float key = (float) getValue(data[index]);
 #endif
-        float value = getValue(element);
-        int bucketIndex = min((int) ((value-minValue)/bucketWidth), numBuckets-1);
+        uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
        offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
        bucketOfElement[index] = bucketIndex;
    }
@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
 * Sum the bucket sizes to compute the start position of each bucket.  This kernel
 * is executed as a single work group.
 */
-__kernel void computeBucketPositions(int numBuckets, __global int* restrict bucketOffset, __local int* restrict buffer) {
-    int globalOffset = 0;
-    for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
+__kernel void computeBucketPositions(uint numBuckets, __global uint* restrict bucketOffset, __local uint* restrict buffer) {
+    uint globalOffset = 0;
+    for (uint startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
        // Load the bucket sizes into local memory.

-        int globalIndex = startBucket+get_local_id(0);
+        uint globalIndex = startBucket+get_local_id(0);
        buffer[get_local_id(0)] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
        barrier(CLK_LOCAL_MEM_FENCE);

        // Perform a parallel prefix sum.

-        for (int step = 1; step < get_local_size(0); step *= 2) {
-            int add = (get_local_id(0) >= step ? buffer[get_local_id(0)-step] : 0);
+        for (uint step = 1; step < get_local_size(0); step *= 2) {
+            uint add = (get_local_id(0) >= step ? buffer[get_local_id(0)-step] : 0);
            barrier(CLK_LOCAL_MEM_FENCE);
            buffer[get_local_id(0)] += add;
            barrier(CLK_LOCAL_MEM_FENCE);
@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck
 /**
 * Copy the input data into the buckets for sorting.
 */
-__kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYPE* restrict buckets, int length, __global const int* restrict bucketOffset, __global const int* restrict bucketOfElement, __global const int* restrict offsetInBucket) {
-    for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
-        TYPE element = data[index];
-        int bucketIndex = bucketOfElement[index];
-        int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
+__kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __global DATA_TYPE* restrict buckets, uint length, __global const uint* restrict bucketOffset, __global const uint* restrict bucketOfElement, __global const uint* restrict offsetInBucket) {
+    for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
+        DATA_TYPE element = data[index];
+        uint bucketIndex = bucketOfElement[index];
+        uint offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
        buckets[offset+offsetInBucket[index]] = element;
    }
 }
@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP
 /**
 * Sort the data in each bucket.
 */
-__kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int numBuckets, __global const int* restrict bucketOffset, __local TYPE* buffer) {
-    for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
-        int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
-        int endIndex = bucketOffset[index];
-        int length = endIndex-startIndex;
+__kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA_TYPE* restrict buckets, uint numBuckets, __global const uint* restrict bucketOffset, __local DATA_TYPE* restrict buffer) {
+    for (uint index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
+        uint startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
+        uint endIndex = bucketOffset[index];
+        uint length = endIndex-startIndex;
        if (length <= get_local_size(0)) {
            // Load the data into local memory.

-            buffer[get_local_id(0)] = (get_local_id(0) < length ? buckets[startIndex+get_local_id(0)] : (TYPE) MAXFLOAT);
+            if (get_local_id(0) < length)
+                buffer[get_local_id(0)] = buckets[startIndex+get_local_id(0)];
+            else
+                buffer[get_local_id(0)] = MAX_VALUE;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Perform a bitonic sort in local memory.

-            for (int k = 2; k <= get_local_size(0); k *= 2) {
-                for (int j = k/2; j > 0; j /= 2) {
+            for (uint k = 2; k <= get_local_size(0); k *= 2) {
+                for (uint j = k/2; j > 0; j /= 2) {
                    int ixj = get_local_id(0)^j;
                    if (ixj > get_local_id(0)) {
-                        if (((get_local_id(0)&k) == 0 && getValue(buffer[get_local_id(0)]) > getValue(buffer[ixj])) ||
-                            ((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) {
-                            TYPE temp = buffer[get_local_id(0)];
-                            buffer[get_local_id(0)] = buffer[ixj];
-                            buffer[ixj] = temp;
+                        DATA_TYPE value1 = buffer[get_local_id(0)];
+                        DATA_TYPE value2 = buffer[ixj];
+                        bool ascending = (get_local_id(0)&k) == 0;
+                        KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
+                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                        if (lowKey > highKey) {
+                            buffer[get_local_id(0)] = value2;
+                            buffer[ixj] = value1;
                        }
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int
        else {
            // Copy the bucket data over to the output array.

-            for (int i = get_local_id(0); i < length; i += get_local_size(0))
+            for (uint i = get_local_id(0); i < length; i += get_local_size(0))
                data[startIndex+i] = buckets[startIndex+i];
            barrier(CLK_GLOBAL_MEM_FENCE);

            // Perform a bitonic sort in global memory.

-            for (int k = 2; k < 2*length; k *= 2) {
-                for (int j = k/2; j > 0; j /= 2) {
-                    for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
+            for (uint k = 2; k < 2*length; k *= 2) {
+                for (uint j = k/2; j > 0; j /= 2) {
+                    for (uint i = get_local_id(0); i < length; i += get_local_size(0)) {
                        int ixj = i^j;
                        if (ixj > i && ixj < length) {
-                            TYPE value1 = data[startIndex+i];
-                            TYPE value2 = data[startIndex+ixj];
+                            DATA_TYPE value1 = data[startIndex+i];
+                            DATA_TYPE value2 = data[startIndex+ixj];
                            bool ascending = ((i&k) == 0);
-                            for (int mask = k*2; mask < 2*length; mask *= 2)
+                            for (uint mask = k*2; mask < 2*length; mask *= 2)
                                ascending = ((i&mask) == 0 ? !ascending : ascending);
-                            if ((ascending && getValue(value1) > getValue(value2)) ||
-                                (!ascending && getValue(value1) < getValue(value2))) {
+                            KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                            KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                            if (lowKey > highKey) {
                                data[startIndex+i] = value2;
                                data[startIndex+ixj] = value1;
                            }

--- a/platforms/opencl/tests/TestOpenCLSort.cpp
+++ b/platforms/opencl/tests/TestOpenCLSort.cpp
@@ -46,6 +46,17 @@
 using namespace OpenMM;
 using namespace std;

+struct SortTrait {
+    typedef cl_float DataType;
+    typedef cl_float KeyType;
+    static const char* clDataType() {return "float";}
+    static const char* clKeyType() {return "float";}
+    static const char* clMinKey() {return "-MAXFLOAT";}
+    static const char* clMaxKey() {return "MAXFLOAT";}
+    static const char* clMaxValue() {return "MAXFLOAT";}
+    static const char* clSortKey() {return "value";}
+};
+
 void verifySorting(vector<float> array) {
    // Sort the array.

@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) {
    context.initialize(system);
    OpenCLArray<float> data(context, array.size(), "sortData");
    data.upload(array);
-    OpenCLSort<float> sort(context, array.size(), "float", "value");
+    OpenCLSort<SortTrait> sort(context, array.size());
    sort.sort(data);
    vector<float> sorted;
    data.download(sorted);