Optimize sorting kernels and tune block sizes

* Compile kernels with max block size of 256 threads: The default hipcc behavior since ROCm 4.2 is to compile kernels with 1024 threads unless __launch_bounds__ is specified. This significantly increases register pressure especially in heavy kernels (double precision, for example), requiring register spilling;
* Optimize computeRange by using multiple blocks for reduction;
* Use blocks of 1024 threads for computeBucketPositions - it is executed as a single work group so larger block size is faster;
* Sort up to lenghtNextPow2 instead of blockDim.x (faster for short buckets);
* Optimize sortShortList2;
* Optimize sortBuckets with bit instructions;
* Decrease bucket size for non-uniform sorting: too many buckets may have sizes too large to sort in shared memory;
* Add more sizes in tests.

Optimize sorting kernels and tune block sizes
* Compile kernels with max block size of 256 threads: The default hipcc behavior since ROCm 4.2 is to compile kernels with 1024 threads unless __launch_bounds__ is specified. This significantly increases register pressure especially in heavy kernels (double precision, for example), requiring register spilling;
* Optimize computeRange by using multiple blocks for reduction;
* Use blocks of 1024 threads for computeBucketPositions - it is executed as a single work group so larger block size is faster;
* Sort up to lenghtNextPow2 instead of blockDim.x (faster for short buckets);
* Optimize sortShortList2;
* Optimize sortBuckets with bit instructions;
* Decrease bucket size for non-uniform sorting: too many buckets may have sizes too large to sort in shared memory;
* Add more sizes in tests.
7279c539 · Anton Gorenko · aca24d5f · 7279c539 · 7279c539 · 7279c539
Unverified Commit 7279c539 authored Aug 25, 2024 by Anton Gorenko
7 changed files
--- a/platforms/hip/include/HipContext.h
+++ b/platforms/hip/include/HipContext.h
@@ -342,7 +342,7 @@ public:
     * Get the maximum number of threads in a thread block supported by this device.
     */
    int getMaxThreadBlockSize() const {
-        return 1024;
+        return 256;
    }
    /**
     * Get whether the device being used is a CPU.  In some cases, different algorithms

--- a/platforms/hip/include/HipSort.h
+++ b/platforms/hip/include/HipSort.h
@@ -10,7 +10,7 @@
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -77,9 +77,11 @@ public:
     * @param trait      a SortTrait defining the type of data to sort.  It should have been allocated
     *                   on the heap with the "new" operator.  This object takes over ownership of it,
     *                   and deletes it when the HipSort is deleted.
-     * @param length     the length of the arrays this object will be used to sort
+     * @param length     the length of the arrays this object will be used to sort.
+     * @param uniform    whether the input data is expected to follow a uniform or nonuniform
+     *                   distribution.  This argument is used only as a hint.
     */
-    HipSort(HipContext& context, SortTrait* trait, unsigned int length);
+    HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
    ~HipSort();
    /**
     * Sort an array.
@@ -88,14 +90,15 @@ public:
 private:
    HipContext& context;
    SortTrait* trait;
+    HipArray counters;
    HipArray dataRange;
    HipArray bucketOfElement;
    HipArray offsetInBucket;
    HipArray bucketOffset;
    HipArray buckets;
    hipFunction_t shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
-    unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
-    bool isShortList;
+    unsigned int dataLength, rangeKernelBlocks, rangeKernelSize, positionsKernelSize, sortKernelSize;
+    bool isShortList, uniform;
 };

 /**

--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
@@ -431,6 +431,9 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
    static_assert(8*sizeof(void*) == HipContext::TileSize);
    string bits = intToString(8*sizeof(void*));
    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
+    if (getMaxThreadBlockSize() < 1024) {
+        options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize());
+    }
    stringstream src;
    if (!options.empty())
        src << "// Compilation Options: " << options << endl << endl;
@@ -659,6 +662,18 @@ void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threa
    }
 }

+void HipContext::executeKernelFlat(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
+    if (blockSize == -1)
+        blockSize = ThreadBlockSize;
+    int gridSize = (threads+blockSize-1)/blockSize;
+    hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
+    if (result != hipSuccess) {
+        stringstream str;
+        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
+
 int HipContext::computeThreadBlockSize(double memory) const {
    int maxShared = this->sharedMemPerBlock;
    int max = (int) (maxShared/memory);
@@ -738,7 +753,7 @@ void HipContext::clearAutoclearBuffers() {

 double HipContext::reduceEnergy() {
    int bufferSize = energyBuffer.getSize();
-    int workGroupSize  = 512;
+    int workGroupSize = getMaxThreadBlockSize();
    void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
    executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
    energySum.download(pinnedBuffer);

--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
@@ -280,7 +280,7 @@ void HipNonbondedUtilities::initialize(const System& system) {
        sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
        oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
        rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
-        blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
+        blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
        vector<unsigned int> count(2, 0);
        interactionCount.upload(count);
        rebuildNeighborList.upload(&count[0]);

--- a/platforms/hip/src/HipSort.cpp
+++ b/platforms/hip/src/HipSort.cpp
@@ -7,7 +7,7 @@
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,7 +33,8 @@
 using namespace OpenMM;
 using namespace std;

-HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
+HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform) :
+        context(context), trait(trait), dataLength(length), uniform(uniform) {
    // Create kernels.

    map<string, string> replacements;
@@ -54,32 +55,34 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : c

    // Work out the work group sizes for various kernels.

-    int maxBlockSize;
-    hipDeviceGetAttribute(&maxBlockSize, hipDeviceAttributeMaxBlockDimX, context.getDevice());
    int maxSharedMem;
    hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
    int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
    isShortList = (length <= maxShortList);
-    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
-        ;
-    positionsKernelSize = rangeKernelSize;
-    sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
+    sortKernelSize = 256;
+    rangeKernelSize = 256;
    if (rangeKernelSize > length)
        rangeKernelSize = length;
+    rangeKernelBlocks = (length + rangeKernelSize - 1) / rangeKernelSize;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
-    unsigned int targetBucketSize = sortKernelSize/2;
+    unsigned int targetBucketSize = uniform ? sortKernelSize/2 : sortKernelSize/8;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
+    // computeBucketPositions is executed as a single work group so larger block size is faster.
+    positionsKernelSize = 1024;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.

    if (!isShortList) {
-        dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
+        counters.initialize<unsigned int>(context, 1, "counters");
+        unsigned int zero = 0;
+        counters.upload(&zero);
+        dataRange.initialize(context, 2*rangeKernelBlocks, trait->getKeySize(), "sortDataRange");
        bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
        bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
        offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
@@ -101,7 +104,7 @@ void HipSort::sort(HipArray& data) {

        if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) {
            void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
-            context.executeKernel(shortList2Kernel, sortArgs, dataLength);
+            context.executeKernel(shortList2Kernel, sortArgs, dataLength, HipContext::ThreadBlockSize, HipContext::ThreadBlockSize*trait->getKeySize());
            buckets.copyTo(data);
        }
        else {
@@ -113,8 +116,8 @@ void HipSort::sort(HipArray& data) {
        // Compute the range of data values.

        unsigned int numBuckets = bucketOffset.getSize();
-        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
-        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
+        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
+        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelBlocks*rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());

        // Assign array elements to buckets.

@@ -124,7 +127,7 @@ void HipSort::sort(HipArray& data) {

        // Compute the position of each bucket.

-        void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer()};
+        void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer(), &counters.getDevicePointer()};
        context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

        // Copy the data into the buckets.
@@ -135,7 +138,7 @@ void HipSort::sort(HipArray& data) {

        // Sort each bucket.

-        void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
-        context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+        void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &bucketOffset.getDevicePointer()};
+        context.executeKernelFlat(sortBucketsKernel, sortArgs, numBuckets*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
    }
 }
--- a/platforms/hip/src/kernels/sort.hip
+++ b/platforms/hip/src/kernels/sort.hip
@@ -11,23 +11,23 @@ extern "C" {
 __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
    // Load the data into local memory.

-    HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer)
+    extern __shared__ DATA_TYPE dataBuffer[];
    for (int index = threadIdx.x; index < length; index += blockDim.x)
        dataBuffer[index] = data[index];
    __syncthreads();

    // Perform a bitonic sort in local memory.

-    for (unsigned int k = 2; k < 2*length; k *= 2) {
+    unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
+
+    for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
        for (unsigned int j = k/2; j > 0; j /= 2) {
            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
                int ixj = i^j;
                if (ixj > i && ixj < length) {
                    DATA_TYPE value1 = dataBuffer[i];
                    DATA_TYPE value2 = dataBuffer[ixj];
-                    bool ascending = ((i&k) == 0);
-                    for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
-                        ascending = ((i&mask) == 0 ? !ascending : ascending);
+                    bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
                    KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
                    if (lowKey > highKey) {
@@ -52,7 +52,7 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
 * work, but also parallelizes much better.
 */
 __global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
-    __shared__ DATA_TYPE dataBuffer[64];
+    extern __shared__ KEY_TYPE keyBuffer[];
    int globalId = blockDim.x*blockIdx.x+threadIdx.x;
    DATA_TYPE value = dataIn[globalId < length ? globalId : 0];
    KEY_TYPE key = getValue(value);
@@ -61,59 +61,80 @@ __global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE*
        int numInBlock = min(static_cast<int>(blockDim.x), static_cast<int>(length-blockStart));
        __syncthreads();
        if (threadIdx.x < numInBlock)
-            dataBuffer[threadIdx.x] = dataIn[blockStart+threadIdx.x];
+            keyBuffer[threadIdx.x] = getValue(dataIn[blockStart+threadIdx.x]);
        __syncthreads();
        for (int i = 0; i < numInBlock; i++) {
-            KEY_TYPE otherKey = getValue(dataBuffer[i]);
-            if (otherKey < key || (otherKey == key && blockStart+i < globalId))
-                count++;
+            KEY_TYPE otherKey = keyBuffer[i];
+            count += (otherKey < key) | ((otherKey == key) & (blockStart+i < globalId));
        }
    }
    if (globalId < length)
        dataOut[count] = value;
 }

+inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE* minBuffer, KEY_TYPE* maxBuffer,
+        KEY_TYPE* minResult, KEY_TYPE* maxResult) {
+    minBuffer[threadIdx.x] = minimum;
+    maxBuffer[threadIdx.x] = maximum;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        if ((threadIdx.x+step < blockDim.x) & ((threadIdx.x&(2*step-1)) == 0)) {
+            minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
+            maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
+        }
+        __syncthreads();
+    }
+    if (threadIdx.x == 0) {
+        *minResult = minBuffer[0];
+        *maxResult = maxBuffer[0];
+    }
+}
+
 /**
- * Calculate the minimum and maximum value in the array to be sorted.  This kernel
- * is executed as a single work group.
+ * Calculate the minimum and maximum value in the array to be sorted.
 */
 __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
-        unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
-    HIP_DYNAMIC_SHARED( KEY_TYPE, minBuffer)
+        unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
+    extern __shared__ KEY_TYPE minBuffer[];
    KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
    KEY_TYPE minimum = MAX_KEY;
    KEY_TYPE maximum = MIN_KEY;

+    __shared__ bool isLastFinishedBlock;
+    if (threadIdx.x == 0) {
+        isLastFinishedBlock = false;
+    }
+
    // Each thread calculates the range of a subset of values.

-    for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
        KEY_TYPE value = getValue(data[index]);
        minimum = min(minimum, value);
        maximum = max(maximum, value);
    }

-    // Now reduce them.
+    // Now reduce them and save partial results

-    minBuffer[threadIdx.x] = minimum;
-    maxBuffer[threadIdx.x] = maximum;
+    reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[blockIdx.x * 2 + 0], &range[blockIdx.x * 2 + 1]);
+    __threadfence();
+    if (threadIdx.x == 0) {
+        isLastFinishedBlock = atomicAdd(&counters[0], 1) + 1 == gridDim.x;
+    }
    __syncthreads();
-    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
-        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0) {
-            minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
-            maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
+
+    // The last block reduce partial results
+
+    if (isLastFinishedBlock) {
+        for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
+            minimum = min(minimum, range[index * 2 + 0]);
+            maximum = max(maximum, range[index * 2 + 1]);
        }
-        __syncthreads();
-    }
-    minimum = minBuffer[0];
-    maximum = maxBuffer[0];
-    if (threadIdx.x == 0) {
-        range[0] = minimum;
-        range[1] = maximum;
+        reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
    }

    // Clear the bucket counters in preparation for the next kernel.

-    for (unsigned int index = threadIdx.x; index < numBuckets; index += blockDim.x)
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < numBuckets; index += blockDim.x*gridDim.x)
        bucketOffset[index] = 0;
 }

@@ -137,8 +158,8 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi
 * Sum the bucket sizes to compute the start position of each bucket.  This kernel
 * is executed as a single work group.
 */
-__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
-    HIP_DYNAMIC_SHARED( unsigned int, posBuffer)
+__global__ __launch_bounds__(1024) void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
+    extern __shared__ unsigned int posBuffer[];
    unsigned int globalOffset = 0;
    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
        // Load the bucket sizes into local memory.
@@ -163,6 +184,9 @@ __global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __
            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
        globalOffset += posBuffer[blockDim.x-1];
    }
+    if (threadIdx.x == 0) {
+        counters[0] = 0;
+    }
 }

 /**
@@ -180,77 +204,73 @@ __global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE*
 /**
 * Sort the data in each bucket.
 */
-__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
-    HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer)
-    for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
-        unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
-        unsigned int endIndex = bucketOffset[index];
-        unsigned int length = endIndex-startIndex;
-        if (length <= blockDim.x) {
-            // Load the data into local memory.
-
-            if (threadIdx.x < length)
-                dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
-            else
-                dataBuffer[threadIdx.x] = MAX_VALUE;
-            __syncthreads();
+__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, const unsigned int* __restrict__ bucketOffset) {
+    extern __shared__ DATA_TYPE dataBuffer[];
+    unsigned int index = blockIdx.x;
+    unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
+    unsigned int endIndex = bucketOffset[index];
+    unsigned int length = endIndex-startIndex;
+    unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
+    if (length <= blockDim.x) {
+        // Load the data into local memory.
+
+        if (threadIdx.x < length)
+            dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
+        else if (threadIdx.x < lenghtNextPow2)
+            dataBuffer[threadIdx.x] = MAX_VALUE;
+        __syncthreads();

-            // Perform a bitonic sort in local memory.
+        // Perform a bitonic sort in local memory.

-            for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
-                for (unsigned int j = k/2; j > 0; j /= 2) {
-                    int ixj = threadIdx.x^j;
-                    if (ixj > threadIdx.x) {
-                        DATA_TYPE value1 = dataBuffer[threadIdx.x];
-                        DATA_TYPE value2 = dataBuffer[ixj];
-                        bool ascending = (threadIdx.x&k) == 0;
-                        KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
-                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
-                        if (lowKey > highKey) {
-                            dataBuffer[threadIdx.x] = value2;
-                            dataBuffer[ixj] = value1;
-                        }
+        for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
+            for (unsigned int j = k/2; j > 0; j /= 2) {
+                int ixj = threadIdx.x^j;
+                if (threadIdx.x < lenghtNextPow2 && ixj > threadIdx.x) {
+                    DATA_TYPE value1 = dataBuffer[threadIdx.x];
+                    DATA_TYPE value2 = dataBuffer[ixj];
+                    bool ascending = (threadIdx.x&k) == 0;
+                    KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
+                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                    if (lowKey > highKey) {
+                        dataBuffer[threadIdx.x] = value2;
+                        dataBuffer[ixj] = value1;
                    }
-                    __syncthreads();
                }
+                __syncthreads();
            }
+        }

-            // Write the data to the sorted array.
+        // Write the data to the sorted array.

-            if (threadIdx.x < length)
-                data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
-        }
-        else {
-            // Copy the bucket data over to the output array.
+        if (threadIdx.x < length)
+            data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
+    }
+    else {
+        // Copy the bucket data over to the output array.

-            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
-                data[startIndex+i] = buckets[startIndex+i];
-            __threadfence_block();
-            __syncthreads();
+        for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
+            data[startIndex+i] = buckets[startIndex+i];
+        __syncthreads();

-            // Perform a bitonic sort in global memory.
-
-            for (unsigned int k = 2; k < 2*length; k *= 2) {
-                for (unsigned int j = k/2; j > 0; j /= 2) {
-                    for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
-                        int ixj = i^j;
-                        if (ixj > i && ixj < length) {
-                            DATA_TYPE value1 = data[startIndex+i];
-                            DATA_TYPE value2 = data[startIndex+ixj];
-                            bool ascending = ((i&k) == 0);
-                            for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
-                                ascending = ((i&mask) == 0 ? !ascending : ascending);
-                            KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
-                            KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
-                            if (lowKey > highKey) {
-                                data[startIndex+i] = value2;
-                                data[startIndex+ixj] = value1;
-                            }
+        // Perform a bitonic sort in global memory.
+
+        for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
+            for (unsigned int j = k/2; j > 0; j /= 2) {
+                for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                    int ixj = i^j;
+                    if (ixj > i && ixj < length) {
+                        DATA_TYPE value1 = data[startIndex+i];
+                        DATA_TYPE value2 = data[startIndex+ixj];
+                        bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
+                        KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                        if (lowKey > highKey) {
+                            data[startIndex+i] = value2;
+                            data[startIndex+ixj] = value1;
                        }
                    }
-                    __threadfence_block();
-                    __syncthreads();
                }
+                __syncthreads();
            }
        }
    }

--- a/platforms/hip/tests/TestHipSort.cpp
+++ b/platforms/hip/tests/TestHipSort.cpp
@@ -60,7 +60,7 @@ class SortTrait : public HipSort::SortTrait {
    const char* getSortKey() const {return "value";}
 };

-void verifySorting(vector<float> array) {
+void verifySorting(vector<float> array, bool uniform) {
    // Sort the array.

    System system;
@@ -72,7 +72,7 @@ void verifySorting(vector<float> array) {
    context.initialize();
    HipArray data(context, array.size(), 4, "sortData");
    data.upload(array);
-    HipSort sort(context, new SortTrait(), array.size());
+    HipSort sort(context, new SortTrait(), array.size(), uniform);
    sort.sort(data);
    vector<float> sorted;
    data.download(sorted);
@@ -93,30 +93,26 @@ void testUniformValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

-    vector<float> array(10000);
-    for (int i = 0; i < (int) array.size(); i++)
-        array[i] = (float) genrand_real2(sfmt);
-    verifySorting(array);
+    for (auto size : { 2, 63, 100, 1234, 10000, 60123, 876543}) {
+        vector<float> array(size);
+        for (int i = 0; i < (int) array.size(); i++)
+            array[i] = (float) genrand_real2(sfmt);
+        verifySorting(array, true);
+        verifySorting(array, false);
+    }
 }

 void testLogValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

-    vector<float> array(10000);
-    for (int i = 0; i < (int) array.size(); i++)
-        array[i] = (float) log(genrand_real2(sfmt));
-    verifySorting(array);
-}
-
-void testShortList() {
-    OpenMM_SFMT::SFMT sfmt;
-    init_gen_rand(0, sfmt);
-
-    vector<float> array(500);
-    for (int i = 0; i < (int) array.size(); i++)
-        array[i] = (float) log(genrand_real2(sfmt));
-    verifySorting(array);
+    for (auto size : { 2, 63, 100, 1234, 10000, 60123, 876543}) {
+        vector<float> array(size);
+        for (int i = 0; i < (int) array.size(); i++)
+            array[i] = (float) log(genrand_real2(sfmt));
+        verifySorting(array, true);
+        verifySorting(array, false);
+    }
 }

 int main(int argc, char* argv[]) {
@@ -125,7 +121,6 @@ int main(int argc, char* argv[]) {
            platform.setPropertyDefaultValue("HipPrecision", string(argv[1]));
        testUniformValues();
        testLogValues();
-        testShortList();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;