Commit ae338b59 authored by peastman
Browse files

Merge pull request #738 from peastman/bug

Workaround for CUDA bug
parents 0b0b213c 7bbf9331
...@@ -1400,15 +1400,16 @@ private: ...@@ -1400,15 +1400,16 @@ private:
class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation { class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation {
public: public:
SyncStreamPreComputation(CUstream stream, CUevent event, int forceGroup) : stream(stream), event(event), forceGroup(forceGroup) { SyncStreamPreComputation(CudaContext& cu, CUstream stream, CUevent event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
} }
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) { void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) { if ((groups&(1<<forceGroup)) != 0) {
cuEventRecord(event, 0); cuEventRecord(event, cu.getCurrentStream());
cuStreamWaitEvent(stream, event, 0); cuStreamWaitEvent(stream, event, 0);
} }
} }
private: private:
CudaContext& cu;
CUstream stream; CUstream stream;
CUevent event; CUevent event;
int forceGroup; int forceGroup;
...@@ -1416,14 +1417,15 @@ private: ...@@ -1416,14 +1417,15 @@ private:
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation { class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public: public:
SyncStreamPostComputation(CUevent event, int forceGroup) : event(event), forceGroup(forceGroup) { SyncStreamPostComputation(CudaContext& cu, CUevent event, int forceGroup) : cu(cu), event(event), forceGroup(forceGroup) {
} }
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) { double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) if ((groups&(1<<forceGroup)) != 0)
cuStreamWaitEvent(0, event, 0); cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
return 0.0; return 0.0;
} }
private: private:
CudaContext& cu;
CUevent event; CUevent event;
int forceGroup; int forceGroup;
}; };
...@@ -1672,7 +1674,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1672,7 +1674,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Prepare for doing PME on its own stream. // Prepare for doing PME on its own stream.
usePmeStream = (cu.getComputeCapability() < 5.0); // A driver bug causes this to be very slow on GTX 980. usePmeStream = (cu.getComputeCapability() < 5.0 && numParticles < 130000); // Workarounds for various CUDA bugs
if (usePmeStream) { if (usePmeStream) {
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING); cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
cufftSetStream(fftForward, pmeStream); cufftSetStream(fftForward, pmeStream);
...@@ -1681,8 +1683,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1681,8 +1683,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
int recipForceGroup = force.getReciprocalSpaceForceGroup(); int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0) if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup(); recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(pmeStream, pmeSyncEvent, recipForceGroup)); cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(pmeSyncEvent, recipForceGroup)); cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, recipForceGroup));
} }
hasInitializedFFT = true; hasInitializedFFT = true;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment