Commit 138ae180 authored by peastman
Browse files

Improved performance on AMD GPUs

parent 79ba3504
......@@ -94,7 +94,7 @@ private:
OpenCLArray buckets;
cl::Kernel shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
-bool isShortList;
+bool isShortList, useShortList2;
};
/**
......
......@@ -32,6 +32,7 @@
#include "OpenCLKernelSources.h"
#include <algorithm>
#include <map>
#include <string>
using namespace OpenMM;
using namespace std;
......@@ -91,6 +92,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA")
useShortList2 = (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
else
useShortList2 = false;
}
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
......@@ -107,7 +113,7 @@ void OpenCLSort::sort(OpenCLArray& data) {
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel.
-if (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()) {
+if (useShortList2) {
shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
shortList2Kernel.setArg<cl_int>(2, dataLength);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment