Commit 86b2af14 authored by peastman's avatar peastman
Browse files

Fixed error in energy computation

parent d92e0937
...@@ -1717,6 +1717,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1717,6 +1717,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cuDeviceGetName(deviceName, 100, cu.getDevice()); cuDeviceGetName(deviceName, 100, cu.getDevice());
usePmeStream = (string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980 usePmeStream = (string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
if (usePmeStream) { if (usePmeStream) {
pmeDefines["USE_PME_STREAM"] = "1";
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING); cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
if (useCudaFFT) { if (useCudaFFT) {
cufftSetStream(fftForward, pmeStream); cufftSetStream(fftForward, pmeStream);
......
...@@ -188,7 +188,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ ...@@ -188,7 +188,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
energy += eterm*(grid.x*grid.x + grid.y*grid.y); energy += eterm*(grid.x*grid.x + grid.y*grid.y);
} }
} }
#ifdef USE_PME_STREAM
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
#else
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
#endif
} }
extern "C" __global__ extern "C" __global__
......
...@@ -1687,6 +1687,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1687,6 +1687,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1"; pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = isNvidia; usePmeQueue = isNvidia;
if (usePmeQueue) { if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1";
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice()); pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup(); int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0) if (recipForceGroup < 0)
......
...@@ -84,7 +84,7 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co ...@@ -84,7 +84,7 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) { __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
const real4 scale = 1/(real) (PME_ORDER-1); const real scale = 1/(real) (PME_ORDER-1);
real4 data[PME_ORDER]; real4 data[PME_ORDER];
// Process the atoms in spatially sorted order. This improves efficiency when writing // Process the atoms in spatially sorted order. This improves efficiency when writing
...@@ -118,7 +118,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con ...@@ -118,7 +118,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
data[j-1] = div*dr*data[j-2]; data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++) for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]); data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0]; data[0] = div*(-dr+1.0f)*data[0];
} }
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
...@@ -362,12 +362,16 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe ...@@ -362,12 +362,16 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
energy += eterm*(grid.x*grid.x + grid.y*grid.y); energy += eterm*(grid.x*grid.x + grid.y*grid.y);
} }
} }
#ifdef USE_PME_STREAM
energyBuffer[get_global_id(0)] = 0.5f*energy; energyBuffer[get_global_id(0)] = 0.5f*energy;
#else
energyBuffer[get_global_id(0)] += 0.5f*energy;
#endif
} }
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid, __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) { real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
const real4 scale = 1/(real) (PME_ORDER-1); const real scale = 1/(real) (PME_ORDER-1);
real4 data[PME_ORDER]; real4 data[PME_ORDER];
real4 ddata[PME_ORDER]; real4 ddata[PME_ORDER];
...@@ -403,7 +407,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global ...@@ -403,7 +407,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
data[j-1] = div*dr*data[j-2]; data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++) for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]); data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0]; data[0] = div*(-dr+1.0f)*data[0];
} }
ddata[0] = -data[0]; ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++) for (int j = 1; j < PME_ORDER; j++)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment