Commit 5cb6ad03 authored by Peter Eastman's avatar Peter Eastman
Browse files

Fixed a race condition in energy computation

parent 59b0b53f
...@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { ...@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) { pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
} }
~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
/** /**
...@@ -657,6 +657,7 @@ private: ...@@ -657,6 +657,7 @@ private:
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaArray* pmeEnergyBuffer;
CudaSort* sort; CudaSort* sort;
Kernel cpuPme; Kernel cpuPme;
PmeIO* pmeio; PmeIO* pmeio;
......
...@@ -1458,16 +1458,24 @@ private: ...@@ -1458,16 +1458,24 @@ private:
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation { class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public: public:
SyncStreamPostComputation(CudaContext& cu, CUevent event, int forceGroup) : cu(cu), event(event), forceGroup(forceGroup) { SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
} }
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) { double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) if ((groups&(1<<forceGroup)) != 0)
cuStreamWaitEvent(cu.getCurrentStream(), event, 0); cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (includeEnergy) {
int bufferSize = pmeEnergyBuffer.getSize();
void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
cu.executeKernel(addEnergyKernel, args, bufferSize);
}
return 0.0; return 0.0;
} }
private: private:
CudaContext& cu; CudaContext& cu;
CUevent event; CUevent event;
CUfunction addEnergyKernel;
CudaArray& pmeEnergyBuffer;
int forceGroup; int forceGroup;
}; };
...@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomRange; delete pmeAtomRange;
if (pmeAtomGridIndex != NULL) if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (pmeEnergyBuffer != NULL)
delete pmeEnergyBuffer;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (fft != NULL) if (fft != NULL)
...@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(*pmeEnergyBuffer);
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
int cufftVersion; int cufftVersion;
cufftGetVersion(&cufftVersion); cufftGetVersion(&cufftVersion);
...@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if (recipForceGroup < 0) if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup(); recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup)); cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, recipForceGroup)); cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
} }
hasInitializedFFT = true; hasInitializedFFT = true;
...@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
} }
if (includeEnergy) { if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms()); cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
......
...@@ -188,7 +188,7 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ ...@@ -188,7 +188,7 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
energy += eterm*(grid.x*grid.x + grid.y*grid.y); energy += eterm*(grid.x*grid.x + grid.y*grid.y);
} }
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
} }
extern "C" __global__ extern "C" __global__
...@@ -286,3 +286,9 @@ void addForces(const real4* __restrict__ forces, unsigned long long* __restrict_ ...@@ -286,3 +286,9 @@ void addForces(const real4* __restrict__ forces, unsigned long long* __restrict_
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000)); forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000));
} }
} }
extern "C" __global__
void addEnergy(const mixed* __restrict__ pmeEnergyBuffer, mixed* __restrict__ energyBuffer, int bufferSize) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x)
energyBuffer[i] += pmeEnergyBuffer[i];
}
...@@ -570,7 +570,7 @@ public: ...@@ -570,7 +570,7 @@ public:
OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform), OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL), hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) { pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
} }
~OpenCLCalcNonbondedForceKernel(); ~OpenCLCalcNonbondedForceKernel();
/** /**
...@@ -636,12 +636,14 @@ private: ...@@ -636,12 +636,14 @@ private:
OpenCLArray* pmeBsplineTheta; OpenCLArray* pmeBsplineTheta;
OpenCLArray* pmeAtomRange; OpenCLArray* pmeAtomRange;
OpenCLArray* pmeAtomGridIndex; OpenCLArray* pmeAtomGridIndex;
OpenCLArray* pmeEnergyBuffer;
OpenCLSort* sort; OpenCLSort* sort;
cl::CommandQueue pmeQueue; cl::CommandQueue pmeQueue;
cl::Event pmeSyncEvent; cl::Event pmeSyncEvent;
OpenCLFFT3D* fft; OpenCLFFT3D* fft;
Kernel cpuPme; Kernel cpuPme;
PmeIO* pmeio; PmeIO* pmeio;
SyncQueuePostComputation* syncQueue;
cl::Kernel ewaldSumsKernel; cl::Kernel ewaldSumsKernel;
cl::Kernel ewaldForcesKernel; cl::Kernel ewaldForcesKernel;
cl::Kernel pmeGridIndexKernel; cl::Kernel pmeGridIndexKernel;
......
...@@ -1454,7 +1454,14 @@ private: ...@@ -1454,7 +1454,14 @@ private:
class OpenCLCalcNonbondedForceKernel::SyncQueuePostComputation : public OpenCLContext::ForcePostComputation { class OpenCLCalcNonbondedForceKernel::SyncQueuePostComputation : public OpenCLContext::ForcePostComputation {
public: public:
SyncQueuePostComputation(OpenCLContext& cl, cl::Event& event, int forceGroup) : cl(cl), event(event), forceGroup(forceGroup) { SyncQueuePostComputation(OpenCLContext& cl, cl::Event& event, OpenCLArray& pmeEnergyBuffer, int forceGroup) : cl(cl), event(event),
pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
}
void setKernel(cl::Kernel kernel) {
addEnergyKernel = kernel;
addEnergyKernel.setArg<cl::Buffer>(0, pmeEnergyBuffer.getDeviceBuffer());
addEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
addEnergyKernel.setArg<cl_int>(2, pmeEnergyBuffer.getSize());
} }
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) { double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) { if ((groups&(1<<forceGroup)) != 0) {
...@@ -1463,11 +1470,15 @@ public: ...@@ -1463,11 +1470,15 @@ public:
event = cl::Event(); event = cl::Event();
cl.getQueue().enqueueWaitForEvents(events); cl.getQueue().enqueueWaitForEvents(events);
} }
if (includeEnergy)
cl.executeKernel(addEnergyKernel, pmeEnergyBuffer.getSize());
return 0.0; return 0.0;
} }
private: private:
OpenCLContext& cl; OpenCLContext& cl;
cl::Event& event; cl::Event& event;
cl::Kernel addEnergyKernel;
OpenCLArray& pmeEnergyBuffer;
int forceGroup; int forceGroup;
}; };
...@@ -1494,6 +1505,8 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() { ...@@ -1494,6 +1505,8 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
delete pmeAtomRange; delete pmeAtomRange;
if (pmeAtomGridIndex != NULL) if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (pmeEnergyBuffer != NULL)
delete pmeEnergyBuffer;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (fft != NULL) if (fft != NULL)
...@@ -1663,6 +1676,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1663,6 +1676,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta"); pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer = new OpenCLArray(cl, cl.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cl.clearBuffer(*pmeEnergyBuffer);
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms()); sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true); fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>(); string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
...@@ -1676,7 +1692,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1676,7 +1692,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
if (recipForceGroup < 0) if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup(); recipForceGroup = force.getForceGroup();
cl.addPreComputation(new SyncQueuePreComputation(cl, pmeQueue, recipForceGroup)); cl.addPreComputation(new SyncQueuePreComputation(cl, pmeQueue, recipForceGroup));
cl.addPostComputation(new SyncQueuePostComputation(cl, pmeSyncEvent, recipForceGroup)); cl.addPostComputation(syncQueue = new SyncQueuePostComputation(cl, pmeSyncEvent, *pmeEnergyBuffer, recipForceGroup));
} }
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
...@@ -1831,7 +1847,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1831,7 +1847,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(1, usePmeQueue ? pmeEnergyBuffer->getDeviceBuffer() : cl.getEnergyBuffer().getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer()); pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
...@@ -1844,6 +1860,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1844,6 +1860,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer());
} }
if (usePmeQueue)
syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
} }
} }
if (cosSinSums != NULL && includeReciprocal) { if (cosSinSums != NULL && includeReciprocal) {
......
...@@ -362,7 +362,7 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe ...@@ -362,7 +362,7 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
energy += eterm*(grid.x*grid.x + grid.y*grid.y); energy += eterm*(grid.x*grid.x + grid.y*grid.y);
} }
} }
energyBuffer[get_global_id(0)] += 0.5f*energy; energyBuffer[get_global_id(0)] = 0.5f*energy;
} }
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid, __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
...@@ -445,3 +445,8 @@ __kernel void addForces(__global const real4* restrict forces, __global real4* r ...@@ -445,3 +445,8 @@ __kernel void addForces(__global const real4* restrict forces, __global real4* r
for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0))
forceBuffers[atom] += forces[atom]; forceBuffers[atom] += forces[atom];
} }
__kernel void addEnergy(__global const mixed* restrict pmeEnergyBuffer, __global mixed* restrict energyBuffer, int bufferSize) {
for (int i = get_global_id(0); i < bufferSize; i += get_global_size(0))
energyBuffer[i] += pmeEnergyBuffer[i];
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment