Commit be863b08 authored by Peter Eastman's avatar Peter Eastman
Browse files

Better workaround for bug on GTX 980

parent 8e2fc4ea
...@@ -1611,12 +1611,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1611,12 +1611,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ); fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>(); string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA"); usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (cl.getDevice().getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) {
cl_uint computeCapabilityMajor;
clGetDeviceInfo(cl.getDevice()(), 0x4000, sizeof(cl_uint), &computeCapabilityMajor, NULL); // CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
if (computeCapabilityMajor == 5)
usePmeQueue = false; // Workaround for driver bug that affects GTX 980.
}
if (usePmeQueue) { if (usePmeQueue) {
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice()); pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup(); int recipForceGroup = force.getReciprocalSpaceForceGroup();
......
...@@ -162,10 +162,10 @@ __kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __globa ...@@ -162,10 +162,10 @@ __kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __globa
* Sort the data in each bucket. * Sort the data in each bucket.
*/ */
__kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA_TYPE* restrict buckets, uint numBuckets, __global const uint* restrict bucketOffset, __local DATA_TYPE* restrict buffer) { __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA_TYPE* restrict buckets, uint numBuckets, __global const uint* restrict bucketOffset, __local DATA_TYPE* restrict buffer) {
for (uint index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) { for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
uint startIndex = (index == 0 ? 0 : bucketOffset[index-1]); int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
uint endIndex = bucketOffset[index]; int endIndex = bucketOffset[index];
uint length = endIndex-startIndex; int length = endIndex-startIndex;
if (length <= get_local_size(0)) { if (length <= get_local_size(0)) {
// Load the data into local memory. // Load the data into local memory.
...@@ -177,8 +177,8 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA ...@@ -177,8 +177,8 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
// Perform a bitonic sort in local memory. // Perform a bitonic sort in local memory.
for (uint k = 2; k <= get_local_size(0); k *= 2) { for (int k = 2; k <= get_local_size(0); k *= 2) {
for (uint j = k/2; j > 0; j /= 2) { for (int j = k/2; j > 0; j /= 2) {
int ixj = get_local_id(0)^j; int ixj = get_local_id(0)^j;
if (ixj > get_local_id(0)) { if (ixj > get_local_id(0)) {
DATA_TYPE value1 = buffer[get_local_id(0)]; DATA_TYPE value1 = buffer[get_local_id(0)];
...@@ -203,21 +203,21 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA ...@@ -203,21 +203,21 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
else { else {
// Copy the bucket data over to the output array. // Copy the bucket data over to the output array.
for (uint i = get_local_id(0); i < length; i += get_local_size(0)) for (int i = get_local_id(0); i < length; i += get_local_size(0))
data[startIndex+i] = buckets[startIndex+i]; data[startIndex+i] = buckets[startIndex+i];
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
// Perform a bitonic sort in global memory. // Perform a bitonic sort in global memory.
for (uint k = 2; k < 2*length; k *= 2) { for (int k = 2; k < 2*length; k *= 2) {
for (uint j = k/2; j > 0; j /= 2) { for (int j = k/2; j > 0; j /= 2) {
for (uint i = get_local_id(0); i < length; i += get_local_size(0)) { for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
int ixj = i^j; int ixj = i^j;
if (ixj > i && ixj < length) { if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i]; DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj]; DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0); bool ascending = ((i&k) == 0);
for (uint mask = k*2; mask < 2*length; mask *= 2) for (int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending); ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2)); KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1)); KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment