Commit e19cefde authored by peastman's avatar peastman
Browse files

Merge pull request #665 from peastman/980

Workaround for driver bugs on GTX 980
parents ba66e90e be863b08
...@@ -632,7 +632,7 @@ private: ...@@ -632,7 +632,7 @@ private:
std::vector<std::pair<int, int> > exceptionAtoms; std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha; double ewaldSelfEnergy, dispersionCoefficient, alpha;
int interpolateForceThreads; int interpolateForceThreads;
bool hasCoulomb, hasLJ; bool hasCoulomb, hasLJ, usePmeStream;
static const int PmeOrder = 5; static const int PmeOrder = 5;
}; };
......
...@@ -1457,8 +1457,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1457,8 +1457,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (hasInitializedFFT) { if (hasInitializedFFT) {
cufftDestroy(fftForward); cufftDestroy(fftForward);
cufftDestroy(fftBackward); cufftDestroy(fftBackward);
cuStreamDestroy(pmeStream); if (usePmeStream) {
cuEventDestroy(pmeSyncEvent); cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
}
} }
} }
...@@ -1670,15 +1672,18 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1670,15 +1672,18 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Prepare for doing PME on its own stream. // Prepare for doing PME on its own stream.
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING); usePmeStream = (cu.getComputeCapability() < 5.0); // A driver bug causes this to be very slow on GTX 980.
cufftSetStream(fftForward, pmeStream); if (usePmeStream) {
cufftSetStream(fftBackward, pmeStream); cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce"); cufftSetStream(fftForward, pmeStream);
int recipForceGroup = force.getReciprocalSpaceForceGroup(); cufftSetStream(fftBackward, pmeStream);
if (recipForceGroup < 0) CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
recipForceGroup = force.getForceGroup(); int recipForceGroup = force.getReciprocalSpaceForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(pmeStream, pmeSyncEvent, recipForceGroup)); if (recipForceGroup < 0)
cu.addPostComputation(new SyncStreamPostComputation(pmeSyncEvent, recipForceGroup)); recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(pmeSyncEvent, recipForceGroup));
}
hasInitializedFFT = true; hasInitializedFFT = true;
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
...@@ -1795,7 +1800,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1795,7 +1800,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (directPmeGrid != NULL && includeReciprocal) { if (directPmeGrid != NULL && includeReciprocal) {
cu.setCurrentStream(pmeStream); if (usePmeStream)
cu.setCurrentStream(pmeStream);
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms()); cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
...@@ -1832,8 +1838,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1832,8 +1838,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
cuEventRecord(pmeSyncEvent, pmeStream); if (usePmeStream) {
cu.restoreDefaultStream(); cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
} }
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) { if (dispersionCoefficient != 0.0 && includeDirect) {
......
...@@ -162,10 +162,10 @@ __kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __globa ...@@ -162,10 +162,10 @@ __kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __globa
* Sort the data in each bucket. * Sort the data in each bucket.
*/ */
__kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA_TYPE* restrict buckets, uint numBuckets, __global const uint* restrict bucketOffset, __local DATA_TYPE* restrict buffer) { __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA_TYPE* restrict buckets, uint numBuckets, __global const uint* restrict bucketOffset, __local DATA_TYPE* restrict buffer) {
for (uint index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) { for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
uint startIndex = (index == 0 ? 0 : bucketOffset[index-1]); int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
uint endIndex = bucketOffset[index]; int endIndex = bucketOffset[index];
uint length = endIndex-startIndex; int length = endIndex-startIndex;
if (length <= get_local_size(0)) { if (length <= get_local_size(0)) {
// Load the data into local memory. // Load the data into local memory.
...@@ -177,8 +177,8 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA ...@@ -177,8 +177,8 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
// Perform a bitonic sort in local memory. // Perform a bitonic sort in local memory.
for (uint k = 2; k <= get_local_size(0); k *= 2) { for (int k = 2; k <= get_local_size(0); k *= 2) {
for (uint j = k/2; j > 0; j /= 2) { for (int j = k/2; j > 0; j /= 2) {
int ixj = get_local_id(0)^j; int ixj = get_local_id(0)^j;
if (ixj > get_local_id(0)) { if (ixj > get_local_id(0)) {
DATA_TYPE value1 = buffer[get_local_id(0)]; DATA_TYPE value1 = buffer[get_local_id(0)];
...@@ -203,21 +203,21 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA ...@@ -203,21 +203,21 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
else { else {
// Copy the bucket data over to the output array. // Copy the bucket data over to the output array.
for (uint i = get_local_id(0); i < length; i += get_local_size(0)) for (int i = get_local_id(0); i < length; i += get_local_size(0))
data[startIndex+i] = buckets[startIndex+i]; data[startIndex+i] = buckets[startIndex+i];
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
// Perform a bitonic sort in global memory. // Perform a bitonic sort in global memory.
for (uint k = 2; k < 2*length; k *= 2) { for (int k = 2; k < 2*length; k *= 2) {
for (uint j = k/2; j > 0; j /= 2) { for (int j = k/2; j > 0; j /= 2) {
for (uint i = get_local_id(0); i < length; i += get_local_size(0)) { for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
int ixj = i^j; int ixj = i^j;
if (ixj > i && ixj < length) { if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i]; DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj]; DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0); bool ascending = ((i&k) == 0);
for (uint mask = k*2; mask < 2*length; mask *= 2) for (int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending); ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2)); KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1)); KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment