Improved performance on AMD GPUs

138ae180 · peastman · 79ba3504 · 138ae180 · 138ae180
Commit 138ae180 authored Sep 28, 2018 by peastman
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 2 deletions

platforms/opencl/include/OpenCLSort.h +1 -1

platforms/opencl/src/OpenCLSort.cpp +7 -1

No files found.
--- a/platforms/opencl/include/OpenCLSort.h
+++ b/platforms/opencl/include/OpenCLSort.h
@@ -94,7 +94,7 @@ private:
    OpenCLArray buckets;
    cl::Kernel shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
    unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
-    bool isShortList;
+    bool isShortList, useShortList2;
 };

 /**

--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -32,6 +32,7 @@
 #include "OpenCLKernelSources.h"
 #include <algorithm>
 #include <map>
+#include <string>

 using namespace OpenMM;
 using namespace std;
@@ -91,6 +92,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
        bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
        bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
        offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
+        string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
+        if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA")
+            useShortList2 = (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
+        else
+            useShortList2 = false;
    }
    buckets.initialize(context, length, trait->getDataSize(), "buckets");
 }
@@ -107,7 +113,7 @@ void OpenCLSort::sort(OpenCLArray& data) {
    if (isShortList) {
        // We can use a simpler sort kernel that does the entire operation in one kernel.
        
-        if (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()) {
+        if (useShortList2) {
            shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
            shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
            shortList2Kernel.setArg<cl_int>(2, dataLength);