Commit 0297d2f0 authored by peastman's avatar peastman
Browse files

Fixed errors in sorting on some AMD GPUs

parent 7c7dc751
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2010-2020 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -64,10 +64,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
unsigned int maxShortList = min(8192, max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()));
// On Qualcomm's OpenCL, it's essential to check against CL_KERNEL_WORK_GROUP_SIZE. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
// The following line checks CL_KERNEL_WORK_GROUP_SIZE to make sure we don't create too large a workgroup.
// Unfortunately, AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm just leaving it commented out.
// If the workgroup size turns out to be too large, we catch the exception and switch back to the standard
// sorting kernels.
//maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
isShortList = (length <= maxShortList);
string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
......@@ -92,12 +93,10 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
// Create workspace arrays.
if (!isShortList) {
dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
}
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
......@@ -113,6 +112,7 @@ void OpenCLSort::sort(OpenCLArray& data) {
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel.
try {
if (useShortList2) {
shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
......@@ -126,8 +126,16 @@ void OpenCLSort::sort(OpenCLArray& data) {
shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
}
return;
}
else {
catch (exception& ex) {
// This can happen if we chose too large a size for the kernel. Switch
// over to the standard sorting method.
isShortList = false;
}
}
// Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize();
......@@ -176,5 +184,4 @@ void OpenCLSort::sort(OpenCLArray& data) {
sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment