Merge pull request #868 from peastman/amdopt

Fixed a performance regression on AMD GPUs

Merge pull request #868 from peastman/amdopt
Fixed a performance regression on AMD GPUs
f2a9c210 · peastman · 0f76088d · 4cf4328e · f2a9c210
Commit f2a9c210 authored Apr 06, 2015 by peastman
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 2 deletions

platforms/opencl/src/OpenCLSort.cpp +6 -2

No files found.
--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -59,7 +59,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
-    isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
+    // On Qualcomm's OpenCL, it's essential to check against maxShortListSize.  Otherwise you get a crash.
+    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
+    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
+    // If we officially support Qualcomm in the future, we'll need to do something better.
+    isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);