Reduced duplication in the code for Coulomb and LJ PME

f7a102fb · Peter Eastman · dbcc494f · f7a102fb · f7a102fb · dbcc494f
Commit f7a102fb authored Jan 25, 2017 by Peter Eastman
4 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -701,7 +701,6 @@ private:
    CUfunction pmeDispersionConvolutionKernel;
    CUfunction pmeInterpolateForceKernel;
    CUfunction pmeInterpolateDispersionForceKernel;
-    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int interpolateForceThreads;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1763,19 +1763,11 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        }

        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
-        if (doLJPME) {
-            defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
-            double invRCut6 = pow(force.getCutoffDistance(), -6);
-            double dalphaR = dispersionAlpha * force.getCutoffDistance();
-            double dar2 = dalphaR*dalphaR;
-            double dar4 = dar2*dar2;
-            double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
-            defines["INVCUT6"] = cu.doubleToString(invRCut6);
-            defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
-        }
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
        defines["DO_LJPME"] = doLJPME ? "1" : "0";
+        if (doLJPME)
+            defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
        if (cu.getContextIndex() == 0) {
            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
            if (doLJPME)
@@ -1784,6 +1776,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            char deviceName[100];
            cuDeviceGetName(deviceName, 100, cu.getDevice());
            usePmeStream = (!cu.getPlatformData().disablePmeStream && string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
+            map<string, string> pmeDefines;
            pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
            pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
            pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
@@ -1799,18 +1792,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeDefines["USE_PME_STREAM"] = "1";
            if (cu.getPlatformData().deterministicForces)
                pmeDefines["USE_DETERMINISTIC_FORCES"] = "1";
-            if (doLJPME) {
-                pmeDefines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
-                pmeDefines["DISPERSION_GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
-                pmeDefines["DISPERSION_GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
-                pmeDefines["DISPERSION_GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
-                pmeDefines["RECIP_DISPERSION_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
-            }
-            CUmodule module;
-            if (doLJPME)
-                module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme+CudaKernelSources::ljpme, pmeDefines);
-            else
-                module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
            if (cu.getPlatformData().useCpuPme) {
                // Create the CPU PME kernel.

@@ -1843,12 +1825,27 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
                cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
                if (doLJPME) {
-                    pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadC6");
-                    pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomDispersionGridIndex");
-                    pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadC6");
-                    pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalDispersionConvolution");
-                    pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateDispersionEnergy");
-                    pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateDispersionForce");
+                    pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
+                    pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
+                    pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
+                    pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
+                    pmeDefines["EPSILON_FACTOR"] = "1";
+                    pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
+                    pmeDefines["USE_LJPME"] = "1";
+                    double invRCut6 = pow(force.getCutoffDistance(), -6);
+                    double dalphaR = dispersionAlpha * force.getCutoffDistance();
+                    double dar2 = dalphaR*dalphaR;
+                    double dar4 = dar2*dar2;
+                    double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
+                    defines["INVCUT6"] = cu.doubleToString(invRCut6);
+                    defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
+                    module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+                    pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+                    pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+                    pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+                    pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+                    pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+                    pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
                    cuFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
                }


--- a/platforms/cuda/src/kernels/ljpme.cu
+++ b/platforms/cuda/src/kernels/ljpme.cu
-extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
-            real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-            real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
-    // Compute the index of the grid point each atom is associated with.
-    
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        real4 pos = posq[i];
-        APPLY_PERIODIC_TO_POS(pos)
-        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
-                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
-                             pos.z*recipBoxVecZ.z);
-        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
-        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
-        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
-        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
-                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
-                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
-        pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z+gridIndex.y*DISPERSION_GRID_SIZE_Z+gridIndex.z);
-    }
-}
-
-extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
-        const real2* __restrict__ sigmaEpsilon) {
-    real3 data[PME_ORDER];
-    const real scale = RECIP(PME_ORDER-1);
-    
-    // Process the atoms in spatially sorted order.  This improves efficiency when writing
-    // the grid values.
-    
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        int atom = pmeAtomGridIndex[i].x;
-        real4 pos = posq[atom];
-        APPLY_PERIODIC_TO_POS(pos)
-        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
-                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
-                             pos.z*recipBoxVecZ.z);
-        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
-        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
-        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
-        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
-                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
-                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
-
-        // Since we need the full set of thetas, it's faster to compute them here than load them
-        // from global memory.
-        
-        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
-        data[PME_ORDER-1] = make_real3(0);
-        data[1] = dr;
-        data[0] = make_real3(1)-dr;
-        for (int j = 3; j < PME_ORDER; j++) {
-            real div = RECIP(j-1);
-            data[j-1] = div*dr*data[j-2];
-            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
-            data[0] = div*(make_real3(1)-dr)*data[0];
-        }
-        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
-        data[0] = scale*(make_real3(1)-dr)*data[0];
-        
-        // Spread the charge from this atom onto each grid point.
-        
-        const real2 sigEps = sigmaEpsilon[atom];
-        const real c6 = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xbase = gridIndex.x+ix;
-            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
-            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
-            real dx = data[ix].x;
-            
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int ybase = gridIndex.y+iy;
-                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
-                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
-                real dy = data[iy].y;
-                
-                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = gridIndex.z+iz;
-                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
-                    int index = ybase + zindex;
-
-                    real add = c6*dx*dy*data[iz].z;
-#ifdef USE_DOUBLE_PRECISION
-                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
-#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
-                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                    int gridIndex = index;
-                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z)/2);
-                    atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
-#else
-                    atomicAdd(&originalPmeGrid[index], add);
-#endif
-
-                }
-            }
-        }
-    }
-}
-
-
-extern "C" __global__ void finishSpreadC6(long long* __restrict__ originalPmeGrid) {
-    real* floatGrid = (real*) originalPmeGrid;
-    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
-    real scale = 1.0f/(real) 0x100000000;
-#ifdef USE_DOUBLE_PRECISION
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
-        floatGrid[index] = scale*originalPmeGrid[index];
-#else
-    for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) {
-        floatGrid[index] = scale*originalPmeGrid[index/2];
-        if (index+1 < gridSize)
-            floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
-    }
-#endif
-}
-
-
-// convolutes the dispersion grid on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
-extern "C" __global__ void 
-reciprocalDispersionConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer, 
-                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, 
-                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
-    // R2C stores into a half complex matrix where the last dimension is cut by half
-    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
-    const real scaleFactor =  -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
-
-    const real alpha = EWALD_DISPERSION_ALPHA;
-    real bfac = M_PI / alpha;
-    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
-    real fac2 = alpha*alpha*alpha;
-    real fac3 = -2*alpha*M_PI*M_PI;
-
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
-        // real indices
-        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
-        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
-        int ky = remainder/(GRID_SIZE_Z/2+1);
-        int kz = remainder-ky*(GRID_SIZE_Z/2+1);
-        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
-        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
-        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
-        real mhx = mx*recipBoxVecX.x;
-        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
-        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
-        real bx = pmeBsplineModuliX[kx];
-        real by = pmeBsplineModuliY[ky];
-        real bz = pmeBsplineModuliZ[kz];
-        real denom = scaleFactor/(bx*by*bz);
-        real2 grid = halfcomplex_pmeGrid[index];
-        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-        real m = SQRT(m2);
-        real m3 = m*m2;
-        real b = bfac*m;
-        real expfac = -b*b;
-        real expterm = EXP(expfac);
-#if FAST_ERFC
-        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
-        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
-        // error of 1.5e-7.  Stolen by ACS from the CUDA platform's AMOEBA plugin.
-        real t = 1.0f/(1.0f+0.3275911f*b);
-        real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
-#else
-        real erfcterm = ERFC(b);
-#endif
-        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
-        halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
-    }
-}
-
-
-extern "C" __global__ void
-gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
-                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
-                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
-    // R2C stores into a half complex matrix where the last dimension is cut by half
-    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
-    const real scaleFactor =  -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
-
-    const real alpha = EWALD_DISPERSION_ALPHA;
-    real bfac = M_PI / alpha;
-    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
-    real fac2 = alpha*alpha*alpha;
-    real fac3 = -2*alpha*M_PI*M_PI;
-
-    mixed energy = 0;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
-        // real indices
-        int kx = index/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z));
-        int remainder = index-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z);
-        int ky = remainder/(DISPERSION_GRID_SIZE_Z);
-        int kz = remainder-ky*(DISPERSION_GRID_SIZE_Z);
-        int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
-        int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
-        int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
-        real mhx = mx*recipBoxVecX.x;
-        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
-        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
-        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-        real bx = pmeBsplineModuliX[kx];
-        real by = pmeBsplineModuliY[ky];
-        real bz = pmeBsplineModuliZ[kz];
-        real denom = scaleFactor/(bx*by*bz);
-        real m = SQRT(m2);
-        real m3 = m*m2;
-        real b = bfac*m;
-        real expfac = -b*b;
-        real expterm = EXP(expfac);
-#if FAST_ERFC
-        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
-        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
-        // error of 1.5e-7.  Stolen by ACS from the CUDA platform's AMOEBA plugin.
-        real t = 1.0f/(1.0f+0.3275911f*b);
-        real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
-#else
-        real erfcterm = ERFC(b);
-#endif
-        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
-
-        if (kz >= (DISPERSION_GRID_SIZE_Z/2+1)) {
-            kx = ((kx == 0) ? kx : DISPERSION_GRID_SIZE_X-kx);
-            ky = ((ky == 0) ? ky : DISPERSION_GRID_SIZE_Y-ky);
-            kz = DISPERSION_GRID_SIZE_Z-kz;
-        } 
-        int indexInHalfComplexGrid = kz + ky*(DISPERSION_GRID_SIZE_Z/2+1)+kx*(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1));
-        real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
-        // N.B. We inlcude the 0,0,0 point for dispersion
-        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
-    }
-#ifdef USE_PME_STREAM
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
-#else
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
-#endif
-}
-
-
-extern "C" __global__
-void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, const real2* __restrict__ sigmaEpsilon) {
-    real3 data[PME_ORDER];
-    real3 ddata[PME_ORDER];
-    const real scale = RECIP(PME_ORDER-1);
-    
-    // Process the atoms in spatially sorted order.  This improves cache performance when loading
-    // the grid values.
-    
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        int atom = pmeAtomGridIndex[i].x;
-        real3 force = make_real3(0);
-        real4 pos = posq[atom];
-        APPLY_PERIODIC_TO_POS(pos)
-        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
-                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
-                             pos.z*recipBoxVecZ.z);
-        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
-        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
-        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
-        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
-                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
-                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
-        // Since we need the full set of thetas, it's faster to compute them here than load them
-        // from global memory.
-        
-        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
-        data[PME_ORDER-1] = make_real3(0);
-        data[1] = dr;
-        data[0] = make_real3(1)-dr;
-        for (int j = 3; j < PME_ORDER; j++) {
-            real div = RECIP(j-1);
-            data[j-1] = div*dr*data[j-2];
-            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
-            data[0] = div*(make_real3(1)-dr)*data[0];
-        }
-        ddata[0] = -data[0];
-        for (int j = 1; j < PME_ORDER; j++)
-            ddata[j] = data[j-1]-data[j];
-        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
-        data[0] = scale*(make_real3(1)-dr)*data[0];
-
-        
-        // Compute the force on this atom.
-        
-        for (int ix = 0; ix < PME_ORDER; ix++) {
-            int xbase = gridIndex.x+ix;
-            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
-            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
-            real dx = data[ix].x;
-            real ddx = ddata[ix].x;
-            
-            for (int iy = 0; iy < PME_ORDER; iy++) {
-                int ybase = gridIndex.y+iy;
-                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
-                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
-                real dy = data[iy].y;
-                real ddy = ddata[iy].y;
-                
-                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    int zindex = gridIndex.z+iz;
-                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
-                    int index = ybase + zindex;
-                    real gridvalue = originalPmeGrid[index];
-                    force.x += ddx*dy*data[iz].z*gridvalue;
-                    force.y += dx*ddy*data[iz].z*gridvalue;
-                    force.z += dx*dy*ddata[iz].z*gridvalue;
-                }
-            }
-        }
-        const real2 sigEps = sigmaEpsilon[atom];
-        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-        real forceX = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x);
-        real forceY = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y);
-        real forceZ = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+force.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z);
-        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (forceX*0x100000000)));
-        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceY*0x100000000)));
-        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceZ*0x100000000)));
-    }
-}
-
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int

 extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
+#ifdef USE_LJPME
+        , const real2* __restrict__ sigmaEpsilon
+#endif
+        ) {
    real3 data[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
    
@@ -63,6 +67,12 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
        
        // Spread the charge from this atom onto each grid point.
        
+#ifdef USE_LJPME
+        const real2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = ybase + zindex;

-                    real add = pos.w*dx*dy*data[iz].z;
+                    real add = charge*dx*dy*data[iz].z;
 #ifdef USE_DOUBLE_PRECISION
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+#ifdef USE_LJPME
+    const real recipScaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+#endif

    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = halfcomplex_pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = ERFC(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+        halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
-
        if (kx != 0 || ky != 0 || kz != 0) {
            halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
        }
+#endif
    }
 }

@@ -156,7 +185,15 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+ #ifdef USE_LJPME
+    const real recipScaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+#endif

    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = ERFC(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+#endif

        if (kz >= (GRID_SIZE_Z/2+1)) {
            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
@@ -185,10 +233,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
        } 
        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
-        if (kx != 0 || ky != 0 || kz != 0) {
+#ifndef USE_LJPME
+        if (kx != 0 || ky != 0 || kz != 0)
+#endif
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
-    }
 #ifdef USE_PME_STREAM
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
 #else
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
 extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
+#ifdef USE_LJPME
+        , const real2* __restrict__ sigmaEpsilon
+#endif
+        ) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
                }
            }
        }
+#ifdef USE_LJPME
+        const real2 sigEps = sigmaEpsilon[atom];
+        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
        real q = pos.w*EPSILON_FACTOR;
+#endif
        real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);