Commit 61c50862 authored by Peter Eastman's avatar Peter Eastman
Browse files

Fixed error on GPUs with only 16K local memory

parent 91f3379b
...@@ -1423,10 +1423,11 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1423,10 +1423,11 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
interpolateForceThreads = (cl.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() > 2*128*PmeOrder*sizeof(mm_float4) ? 128 : 64);
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
pmeInterpolateForceKernel.setArg(5, 2*128*PmeOrder*sizeof(mm_float4), NULL); pmeInterpolateForceKernel.setArg(5, 2*interpolateForceThreads*PmeOrder*sizeof(mm_float4), NULL);
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
...@@ -1479,7 +1480,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1479,7 +1480,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
fft->execFFT(*pmeGrid2, *pmeGrid, false); fft->execFFT(*pmeGrid2, *pmeGrid, false);
pmeInterpolateForceKernel.setArg<mm_float4>(3, boxSize); pmeInterpolateForceKernel.setArg<mm_float4>(3, boxSize);
pmeInterpolateForceKernel.setArg<mm_float4>(4, invBoxSize); pmeInterpolateForceKernel.setArg<mm_float4>(4, invBoxSize);
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms(), 128); cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms(), interpolateForceThreads);
} }
double energy = ewaldSelfEnergy; double energy = ewaldSelfEnergy;
if (dispersionCoefficient != 0.0) { if (dispersionCoefficient != 0.0) {
......
...@@ -525,6 +525,7 @@ private: ...@@ -525,6 +525,7 @@ private:
cl::Kernel pmeInterpolateForceKernel; cl::Kernel pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines; std::map<std::string, std::string> pmeDefines;
double ewaldSelfEnergy, dispersionCoefficient; double ewaldSelfEnergy, dispersionCoefficient;
int interpolateForceThreads;
static const int PmeOrder = 5; static const int PmeOrder = 5;
}; };
......
...@@ -220,7 +220,6 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en ...@@ -220,7 +220,6 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
energyBuffer[get_global_id(0)] += 0.5f*energy; energyBuffer[get_global_id(0)] += 0.5f*energy;
} }
__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* bsplinesCache) { __kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* bsplinesCache) {
const float4 scale = 1.0f/(PME_ORDER-1); const float4 scale = 1.0f/(PME_ORDER-1);
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment