Merge pull request #2631 from peastman/sort

Fixed errors in sorting on some AMD GPUs

Merge pull request #2631 from peastman/sort
Fixed errors in sorting on some AMD GPUs
089384df · peastman · GitHub · 7c7dc751 · 0297d2f0 · 089384df
Unverified Commit 089384df authored Apr 09, 2020 by peastman Committed by GitHub Apr 09, 2020
Show whitespace changes
Inline Side-by-side

Showing with 79 additions and 72 deletions

platforms/opencl/src/OpenCLSort.cpp platforms/opencl/src/OpenCLSort.cpp +79 -72

No files found.
--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -64,10 +64,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
    unsigned int maxShortList = min(8192, max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()));
-    // On Qualcomm's OpenCL, it's essential to check against CL_KERNEL_WORK_GROUP_SIZE.  Otherwise you get a crash.
+    // The following line checks CL_KERNEL_WORK_GROUP_SIZE to make sure we don't create too large a workgroup.
-    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
+    // Unfortunately, AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
-    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
+    // maximum, so including the check hurts performance.  For the moment I'm just leaving it commented out.
-    // If we officially support Qualcomm in the future, we'll need to do something better.
+    // If the workgroup size turns out to be too large, we catch the exception and switch back to the standard
+    // sorting kernels.
    //maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    isShortList = (length <= maxShortList);
    string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
@@ -92,12 +93,10 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    // Create workspace arrays.
-    if (!isShortList) {
    dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
    bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
    bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
    offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
-    }
    buckets.initialize(context, length, trait->getDataSize(), "buckets");
 }
@@ -113,6 +112,7 @@ void OpenCLSort::sort(OpenCLArray& data) {
    if (isShortList) {
        // We can use a simpler sort kernel that does the entire operation in one kernel.
+        try {
            if (useShortList2) {
                shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
                shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
@@ -126,8 +126,16 @@ void OpenCLSort::sort(OpenCLArray& data) {
                shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
                context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
            }
+            return;
        }
-    else {
+        catch (exception& ex) {
+            // This can happen if we chose too large a size for the kernel.  Switch
+            // over to the standard sorting method.
+            isShortList = false;
+        }
+    }
    // Compute the range of data values.
    unsigned int numBuckets = bucketOffset.getSize();
@@ -176,5 +184,4 @@ void OpenCLSort::sort(OpenCLArray& data) {
    sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
    sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
    context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
-    }
 }