Unverified Commit 089384df authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #2631 from peastman/sort

Fixed errors in sorting on some AMD GPUs
parents 7c7dc751 0297d2f0
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. * * Portions copyright (c) 2010-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -64,10 +64,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -64,10 +64,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())); unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2; int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
unsigned int maxShortList = min(8192, max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks())); unsigned int maxShortList = min(8192, max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()));
// On Qualcomm's OpenCL, it's essential to check against CL_KERNEL_WORK_GROUP_SIZE. Otherwise you get a crash. // The following line checks CL_KERNEL_WORK_GROUP_SIZE to make sure we don't create too large a workgroup.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual // Unfortunately, AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out. // maximum, so including the check hurts performance. For the moment I'm just leaving it commented out.
// If we officially support Qualcomm in the future, we'll need to do something better. // If the workgroup size turns out to be too large, we catch the exception and switch back to the standard
// sorting kernels.
//maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice())); //maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
isShortList = (length <= maxShortList); isShortList = (length <= maxShortList);
string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>(); string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
...@@ -92,12 +93,10 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -92,12 +93,10 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
// Create workspace arrays. // Create workspace arrays.
if (!isShortList) {
dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange"); dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset"); bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement"); bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket"); offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
}
buckets.initialize(context, length, trait->getDataSize(), "buckets"); buckets.initialize(context, length, trait->getDataSize(), "buckets");
} }
...@@ -113,6 +112,7 @@ void OpenCLSort::sort(OpenCLArray& data) { ...@@ -113,6 +112,7 @@ void OpenCLSort::sort(OpenCLArray& data) {
if (isShortList) { if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel. // We can use a simpler sort kernel that does the entire operation in one kernel.
try {
if (useShortList2) { if (useShortList2) {
shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); shortList2Kernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer()); shortList2Kernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
...@@ -126,8 +126,16 @@ void OpenCLSort::sort(OpenCLArray& data) { ...@@ -126,8 +126,16 @@ void OpenCLSort::sort(OpenCLArray& data) {
shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL); shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize); context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
} }
return;
} }
else { catch (exception& ex) {
// This can happen if we chose too large a size for the kernel. Switch
// over to the standard sorting method.
isShortList = false;
}
}
// Compute the range of data values. // Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize(); unsigned int numBuckets = bucketOffset.getSize();
...@@ -176,5 +184,4 @@ void OpenCLSort::sort(OpenCLArray& data) { ...@@ -176,5 +184,4 @@ void OpenCLSort::sort(OpenCLArray& data) {
sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer()); sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL); sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize); context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
}
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment