Major optimizations to PME performance

98c30a3f · Peter Eastman · b880b5d9 · 98c30a3f · 98c30a3f · 98c30a3f
Commit 98c30a3f authored Jun 10, 2010 by Peter Eastman
9 changed files
--- a/platforms/cuda/src/kernels/bbsort.cu
+++ b/platforms/cuda/src/kernels/bbsort.cu
@@ -17,7 +17,7 @@
 #include "bbsort_kernel.cu"
-float getValue(float2 v){
+int getValue(int2 v){
 	return v.y;
 }
@@ -115,7 +115,7 @@ void reduceMinMax(T* dData,int size,float& result,bool isMax)
 	CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
-	result=(float)getValue(originalResult);
+	result=(int)getValue(originalResult);
 }
 template <typename T>
@@ -305,7 +305,7 @@ Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile com
 *************************************************************************************/
 template<>
-void bbSort(float2* dData,int size,int listOrder)
+void bbSort(int2* dData,int size,int listOrder)
 {
 	bbSortBody(dData,size,listOrder);

--- a/platforms/cuda/src/kernels/bbsort_kernel.cu
+++ b/platforms/cuda/src/kernels/bbsort_kernel.cu
@@ -19,7 +19,7 @@ texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
 texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
 texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
-__device__ float dGetValue(float2 v){
+__device__ int dGetValue(int2 v){
 	return v.y;
 }
@@ -29,7 +29,7 @@ __device__ T dGetValue(T v){
 }
-__device__ void dPad(float2& v){
+__device__ void dPad(int2& v){
 	v.x=0x3fffffff;
 	v.y=0x4fffffff;
 }

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -420,7 +420,7 @@ struct cudaGmxSimulation {
    float4*         pPmeBsplineTheta;
    float4*         pPmeBsplineDtheta;
    int*            pPmeAtomRange;                  // The range of sorted atoms at each grid point
-    float2*         pPmeAtomGridIndex;              // The grid point each atom is at
+    int2*           pPmeAtomGridIndex;              // The grid point each atom is at
    unsigned int    bonds;                          // Number of bonds
    int4*           pBondID;                        // Bond atom and output buffer IDs
    float2*         pBondParameter;                 // Bond parameters

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -994,7 +994,7 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSiz
    gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData;
    gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
    gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
-    gpu->psPmeAtomGridIndex = new CUDAStream<float2>(gpu->natoms, 1, "PmeAtomGridIndex");
+    gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
    tabulateErfc(gpu);

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -122,7 +122,7 @@ struct _gpuContext {
    CUDAStream<float4>* psPmeBsplineTheta;
    CUDAStream<float4>* psPmeBsplineDtheta;
    CUDAStream<int>* psPmeAtomRange;           // The range of sorted atoms at each grid point
-    CUDAStream<float2>* psPmeAtomGridIndex;    // The grid point each atom is at
+    CUDAStream<int2>* psPmeAtomGridIndex;      // The grid point each atom is at
    CUDAStream<float2>* psObcData;
    CUDAStream<float4>* psGBVIData;
    CUDAStream<float>* psObcChain;

--- a/platforms/cuda/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda/src/kernels/kCalculatePME.cu
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2010 Stanford University and the Authors.      *
 * Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman       *
 * Contributors:                                                              *
 *                                                                            *
@@ -117,7 +117,7 @@ void kUpdateGridIndexAndFraction_kernel()
        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
                              ((int) t.y) % cSim.pmeGridSize.y,
                              ((int) t.z) % cSim.pmeGridSize.z);
-        cSim.pPmeAtomGridIndex[i] = make_float2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
+        cSim.pPmeAtomGridIndex[i] = make_int2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
    }
 }
@@ -141,7 +141,7 @@ void kFindAtomRangeForGrid_kernel()
    int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
    for (int i = start; i < end; ++i)
    {
-        float2 atomData = cSim.pPmeAtomGridIndex[i];
+        int2 atomData = cSim.pPmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last)
        {
@@ -150,10 +150,13 @@ void kFindAtomRangeForGrid_kernel()
            last = gridIndex;
        }
-        // The grid index won't be needed again.  Reuse that component to hold the atom charge, thus saving
+        // The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
-        // an extra load operation in the charge spreading kernel.
+        // some work in the charge spreading kernel.
-        cSim.pPmeAtomGridIndex[i].y = cSim.pPosq[(int) atomData.x].w;
+        float posz = cSim.pPosq[atomData.x].z;
+        posz -= floor(posz*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
+        int z = ((int) ((posz*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z)) % cSim.pmeGridSize.z;
+        cSim.pPmeAtomGridIndex[i].y = z;
    }
    // Fill in values beyond the last atom.
@@ -266,28 +269,47 @@ void kGridSpreadCharge_kernel()
        int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
        gridPoint.y = remainder/cSim.pmeGridSize.z;
        gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
-        gridPoint.x += cSim.pmeGridSize.x;
-        gridPoint.y += cSim.pmeGridSize.y;
-        gridPoint.z += cSim.pmeGridSize.z;
        float result = 0.0f;
        for (int ix = 0; ix < PME_ORDER; ++ix)
+        {
+            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : cSim.pmeGridSize.x);
            for (int iy = 0; iy < PME_ORDER; ++iy)
-                for (int iz = 0; iz < PME_ORDER; ++iz)
+            {
+                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : cSim.pmeGridSize.y);
+                int z1 = gridPoint.z-PME_ORDER+1;
+                z1 += (z1 >= 0 ? 0 : cSim.pmeGridSize.z);
+                int z2 = (z1 < gridPoint.z ? gridPoint.z : cSim.pmeGridSize.z-1);
+                int gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z1;
+                int gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z2;
+                int firstAtom = cSim.pPmeAtomRange[gridIndex1];
+                int lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
+                for (int i = firstAtom; i < lastAtom; ++i)
+                {
+                    int2 atomData = cSim.pPmeAtomGridIndex[i];
+                    int atomIndex = atomData.x;
+                    int z = atomData.y;
+                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                    float atomCharge = cSim.pPosq[atomIndex].w;
+                    result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
+                }
+                if (z1 > gridPoint.z)
                {
-                    int x = (gridPoint.x-ix)%cSim.pmeGridSize.x;
+                    gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z;
-                    int y = (gridPoint.y-iy)%cSim.pmeGridSize.y;
+                    gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+gridPoint.z;
-                    int z = (gridPoint.z-iz)%cSim.pmeGridSize.z;
+                    firstAtom = cSim.pPmeAtomRange[gridIndex1];
-                    int gridIndex = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z;
+                    lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
-                    int firstAtom = cSim.pPmeAtomRange[gridIndex];
-                    int lastAtom = cSim.pPmeAtomRange[gridIndex+1];
                    for (int i = firstAtom; i < lastAtom; ++i)
                    {
-                        float2 atomData = cSim.pPmeAtomGridIndex[i];
+                        int2 atomData = cSim.pPmeAtomGridIndex[i];
                        int atomIndex = atomData.x;
-                        float atomCharge = atomData.y;
+                        int z = atomData.y;
+                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
+                        float atomCharge = cSim.pPosq[atomIndex].w;
                        result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
                    }
                }
+            }
+        }
        cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
    }
 }
@@ -358,17 +380,20 @@ void kGridInterpolateForce_kernel()
                              ((int) t.z) % cSim.pmeGridSize.z);
        for (int ix = 0; ix < PME_ORDER; ix++)
        {
-            int xindex = (gridIndex.x + ix) % cSim.pmeGridSize.x;
+            int xindex = gridIndex.x+ix;
+            xindex -= (xindex >= cSim.pmeGridSize.x ? cSim.pmeGridSize.x : 0);
            float tx = cSim.pPmeBsplineTheta[atom+ix*cSim.atoms].x;
            float dtx = cSim.pPmeBsplineDtheta[atom+ix*cSim.atoms].x;
            for (int iy = 0; iy < PME_ORDER; iy++)
            {
-                int yindex = (gridIndex.y + iy) % cSim.pmeGridSize.y;
+                int yindex = gridIndex.y+iy;
+                yindex -= (yindex >= cSim.pmeGridSize.y ? cSim.pmeGridSize.y : 0);
                float ty = cSim.pPmeBsplineTheta[atom+iy*cSim.atoms].y;
                float dty = cSim.pPmeBsplineDtheta[atom+iy*cSim.atoms].y;
                for (int iz = 0; iz < PME_ORDER; iz++)
                {
-                    int zindex               = (gridIndex.z + iz) % cSim.pmeGridSize.z;
+                    int zindex               = gridIndex.z+iz;
+                    zindex -= (zindex >= cSim.pmeGridSize.z ? cSim.pmeGridSize.z : 0);
                    float tz = cSim.pPmeBsplineTheta[atom+iz*cSim.atoms].z;
                    float dtz = cSim.pPmeBsplineDtheta[atom+iz*cSim.atoms].z;
                    int index                = xindex*cSim.pmeGridSize.y*cSim.pmeGridSize.z + yindex*cSim.pmeGridSize.z + zindex;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1151,8 +1151,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
        pmeBsplineDtheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDtheta");
        pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-        pmeAtomGridIndex = new OpenCLArray<mm_float2>(cl, numParticles, "pmeAtomGridIndex");
+        pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
-        sort = new OpenCLSort<mm_float2>(cl, cl.getNumAtoms(), "float2", "value.y");
+        sort = new OpenCLSort<mm_int2>(cl, cl.getNumAtoms(), "int2", "value.y");
        fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
        // Initialize the b-spline moduli.
@@ -1303,10 +1303,11 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(2, pmeBsplineDtheta->getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg(3, 2*OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(4, pmeAtomGridIndex->getDeviceBuffer());
-            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
+            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
-            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
+            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
-            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
+            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
-            pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeBsplineTheta->getDeviceBuffer());
+            pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
+            pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -491,9 +491,9 @@ private:
    OpenCLArray<mm_float4>* pmeBsplineTheta;
    OpenCLArray<mm_float4>* pmeBsplineDtheta;
    OpenCLArray<cl_int>* pmeAtomRange;
-    OpenCLArray<mm_float2>* pmeAtomGridIndex;
+    OpenCLArray<mm_int2>* pmeAtomGridIndex;
    OpenCLArray<cl_float>* erfcTable;
-    OpenCLSort<mm_float2>* sort;
+    OpenCLSort<mm_int2>* sort;
    OpenCLFFT3D* fft;
    cl::Kernel exceptionsKernel;
    cl::Kernel ewaldSumsKernel;

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
-__kernel void updateGridIndexAndFraction(__global float4* posq, __global float2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void updateGridIndexAndFraction(__global float4* posq, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        float4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -10,7 +10,7 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
-        pmeAtomGridIndex[i] = (float2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
+        pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
    }
 }
@@ -18,12 +18,12 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
 * For each grid point, find the range of sorted atoms associated with that point.
 */
-__kernel void findAtomRangeForGrid(__global float2* pmeAtomGridIndex, __global int* pmeAtomRange) {
+__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange) {
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
    for (int i = start; i < end; ++i) {
-        float2 atomData = pmeAtomGridIndex[i];
+        int2 atomData = pmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last) {
            for (int j = last+1; j <= gridIndex; ++j)
@@ -41,7 +41,7 @@ __kernel void findAtomRangeForGrid(__global float2* pmeAtomGridIndex, __global i
    }
 }
-__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global float2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    const float4 scale = 1.0f/(PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
@@ -81,45 +81,71 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
        }
    }
-    // The grid index won't be needed again.  Reuse that component to hold the atom charge, thus saving
+    // The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
-    // an extra load operation in the charge spreading kernel.
+    // some work in the charge spreading kernel.
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
    for (int i = start; i < end; ++i) {
-        float2 atomData = pmeAtomGridIndex[i];
+        float posz = posq[pmeAtomGridIndex[i].x].z;
-        pmeAtomGridIndex[i].y = posq[(int) atomData.x].w;
+        posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
+        pmeAtomGridIndex[i].y = z;
    }
 }
-__kernel void gridSpreadCharge(__global float2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
+__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
+        // Compute the charge on a grid point.
        int4 gridPoint;
        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
        gridPoint.y = remainder/GRID_SIZE_Z;
        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
-        gridPoint.x += GRID_SIZE_X;
-        gridPoint.y += GRID_SIZE_Y;
-        gridPoint.z += GRID_SIZE_Z;
        float result = 0.0f;
-        for (int ix = 0; ix < PME_ORDER; ++ix)
-            for (int iy = 0; iy < PME_ORDER; ++iy)
+        // Loop over all atoms that affect this grid point.
-                for (int iz = 0; iz < PME_ORDER; ++iz) {
-                    int x = (gridPoint.x-ix)%GRID_SIZE_X;
+        for (int ix = 0; ix < PME_ORDER; ++ix) {
-                    int y = (gridPoint.y-iy)%GRID_SIZE_Y;
+            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
-                    int z = (gridPoint.z-iz)%GRID_SIZE_Z;
+            for (int iy = 0; iy < PME_ORDER; ++iy) {
-                    int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
+                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
-                    int firstAtom = pmeAtomRange[gridIndex];
+                int z1 = gridPoint.z-PME_ORDER+1;
-                    int lastAtom = pmeAtomRange[gridIndex+1];
+                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
-                    for (int i = firstAtom; i < lastAtom; ++i) {
+                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
-                        float2 atomData = pmeAtomGridIndex[i];
+                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
+                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
+                int firstAtom = pmeAtomRange[gridIndex1];
+                int lastAtom = pmeAtomRange[gridIndex2+1];
+                for (int i = firstAtom; i < lastAtom; ++i)
+                {
+                    int2 atomData = pmeAtomGridIndex[i];
+                    int atomIndex = atomData.x;
+                    int z = atomData.y;
+                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
+                    float atomCharge = posq[atomIndex].w;
+                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
+                }
+                if (z1 > gridPoint.z)
+                {
+                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
+                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
+                    firstAtom = pmeAtomRange[gridIndex1];
+                    lastAtom = pmeAtomRange[gridIndex2+1];
+                    for (int i = firstAtom; i < lastAtom; ++i)
+                    {
+                        int2 atomData = pmeAtomGridIndex[i];
                        int atomIndex = atomData.x;
-                        float atomCharge = atomData.y;
+                        int z = atomData.y;
+                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
+                        float atomCharge = posq[atomIndex].w;
                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                    }
                }
+            }
+        }
        pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f);
    }
 }
@@ -168,15 +194,18 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xindex = (gridIndex.x + ix) % GRID_SIZE_X;
+            int xindex = gridIndex.x+ix;
+            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            float tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
            float dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int yindex = (gridIndex.y + iy) % GRID_SIZE_Y;
+                int yindex = gridIndex.y+iy;
+                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                float ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
                float dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = (gridIndex.z + iz) % GRID_SIZE_Z;
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    float tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
                    float dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;