Created OpenCL implementation of dispersion PME

5ef1d4eb · Peter Eastman · edfcdec3 · 5ef1d4eb · 5ef1d4eb · 5ef1d4eb
Commit 5ef1d4eb authored Feb 08, 2017 by Peter Eastman
7 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1646,8 +1646,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
 void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
    cu.setAsCurrent();
-    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
    // Identify which exceptions are 1-4 interactions.
    vector<pair<int, int> > exclusions;
@@ -1664,7 +1662,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    // Initialize nonbonded interactions.
    int numParticles = force.getNumParticles();
-    doLJPME = (nonbondedMethod == LJPME);
    sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
    CudaArray& posq = cu.getPosq();
    vector<double4> temp(posq.getSize());
@@ -1683,8 +1680,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            posqd[i] = make_double4(0, 0, 0, charge);
        else
            posqf[i] = make_float4(0, 0, 0, (float) charge);
-        double sig = (float) (0.5*sigma);
+        double sig = 0.5*sigma;
-        double eps = (float) (2.0*sqrt(epsilon));
+        double eps = 2.0*sqrt(epsilon);
        sigmaEpsilonVector[i] = make_float2(sig, eps);
        exclusionList[i].push_back(i);
        sumSquaredCharges += charge*charge;
@@ -1701,8 +1698,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    }
    posq.upload(&temp[0]);
    sigmaEpsilon->upload(sigmaEpsilonVector);
+    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
    bool useCutoff = (nonbondedMethod != NoCutoff);
    bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
+    doLJPME = (nonbondedMethod == LJPME);
    map<string, string> defines;
    defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
    defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
@@ -1761,7 +1760,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    }
    else if (nonbondedMethod == PME || nonbondedMethod == LJPME) {
        // Compute the PME parameters.
-        //
        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
        gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
        gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
@@ -1907,6 +1906,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                }
                else {
                    fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
+                    if (doLJPME)
                        dispersionFft = new CudaFFT3D(cu, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
                }
@@ -1928,6 +1928,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
                }
                hasInitializedFFT = true;
                // Initialize the b-spline moduli.
                for (int grid = 0; grid < 2; grid++) {

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
@@ -76,7 +76,7 @@
            ljEnergy *= switchValue;
        }
        #endif
-        #if DO_LJPME
+#if DO_LJPME
        // The multiplicative grid term
        ljEnergy += coef*(1.0f - expDar2*eprefac);
        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
@@ -91,7 +91,7 @@
        ljEnergy += epssig6*(1.0f - sig6);
        // The multiplicative part of the potential shift
        ljEnergy += MULTSHIFT6*c6;
-        #endif
+#endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
 #else

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -576,8 +576,9 @@ class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform),
            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
-            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
+            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeDispersionBsplineModuliX(NULL),
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeDispersionBsplineModuliY(NULL), pmeDispersionBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeAtomRange(NULL),
+            pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), dispersionFft(NULL), pmeio(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
    /**
@@ -649,6 +650,9 @@ private:
    OpenCLArray* pmeBsplineModuliX;
    OpenCLArray* pmeBsplineModuliY;
    OpenCLArray* pmeBsplineModuliZ;
+    OpenCLArray* pmeDispersionBsplineModuliX;
+    OpenCLArray* pmeDispersionBsplineModuliY;
+    OpenCLArray* pmeDispersionBsplineModuliZ;
    OpenCLArray* pmeBsplineTheta;
    OpenCLArray* pmeAtomRange;
    OpenCLArray* pmeAtomGridIndex;
@@ -657,26 +661,35 @@ private:
    cl::CommandQueue pmeQueue;
    cl::Event pmeSyncEvent;
    OpenCLFFT3D* fft;
+    OpenCLFFT3D* dispersionFft;
    Kernel cpuPme;
+    Kernel cpuDispersionPme;
    PmeIO* pmeio;
    SyncQueuePostComputation* syncQueue;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
-    cl::Kernel pmeGridIndexKernel;
    cl::Kernel pmeAtomRangeKernel;
+    cl::Kernel pmeDispersionAtomRangeKernel;
    cl::Kernel pmeZIndexKernel;
+    cl::Kernel pmeDispersionZIndexKernel;
    cl::Kernel pmeUpdateBsplinesKernel;
+    cl::Kernel pmeDispersionUpdateBsplinesKernel;
    cl::Kernel pmeSpreadChargeKernel;
+    cl::Kernel pmeDispersionSpreadChargeKernel;
    cl::Kernel pmeFinishSpreadChargeKernel;
+    cl::Kernel pmeDispersionFinishSpreadChargeKernel;
    cl::Kernel pmeConvolutionKernel;
+    cl::Kernel pmeDispersionConvolutionKernel;
    cl::Kernel pmeEvalEnergyKernel;
+    cl::Kernel pmeDispersionEvalEnergyKernel;
    cl::Kernel pmeInterpolateForceKernel;
+    cl::Kernel pmeDispersionInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int gridSizeX, gridSizeY, gridSizeZ;
    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeQueue;
+    bool hasCoulomb, hasLJ, usePmeQueue, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
@@ -22,6 +22,26 @@
    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
 #endif
    real tempForce = 0;
+#if HAS_LENNARD_JONES
+    // The multiplicative term to correct for the multiplicative terms that are always
+    // present in reciprocal space.  The real terms have an additive contribution
+    // added in, but for excluded terms the multiplicative term is just subtracted.
+    // These factors are needed in both clauses of the needCorrection statement, so
+    // I declare them up here.
+    #if DO_LJPME
+        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+        const real dar2 = dispersionAlphaR*dispersionAlphaR;
+        const real dar4 = dar2*dar2;
+        const real dar6 = dar4*dar2;
+        const real invR2 = invR*invR;
+        const real expDar2 = EXP(-dar2);
+        const float2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
+        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+        const real coef = invR2*invR2*invR2*c6;
+        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+        const real dprefac = eprefac + dar6/6.0f;
+    #endif
+#endif
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
@@ -34,6 +54,13 @@
            includeInteraction = false;
            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
        }
+#if HAS_LENNARD_JONES
+        #if DO_LJPME
+            // The multiplicative grid term
+            tempEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        #endif
+#endif
    }
    else {
 #if HAS_LENNARD_JONES
@@ -41,7 +68,8 @@
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
        #if USE_LJ_SWITCH
@@ -53,6 +81,22 @@
            ljEnergy *= switchValue;
        }
        #endif
+#if DO_LJPME
+        // The multiplicative grid term
+        ljEnergy += coef*(1.0f - expDar2*eprefac);
+        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        // The potential shift accounts for the step at the cutoff introduced by the
+        // transition from additive to multiplicative combintion rules and is only
+        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+        sig2 = sig*sig;
+        sig6 = sig2*sig2*sig2*INVCUT6;
+        epssig6 = eps*sig6;
+        // The additive part of the potential shift
+        ljEnergy += epssig6*(1.0f - sig6);
+        // The multiplicative part of the potential shift
+        ljEnergy += MULTSHIFT6*c6;
+#endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
 #else

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
 __kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache,
        __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const real4 scale = 1/(real) (PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
@@ -33,7 +37,13 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4
            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
-            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
+#ifdef USE_LJPME
+            const float2 sigEps = sigmaEpsilon[atom];
+            const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+            const real charge = pos.w;
+#endif
+            data[j].w = charge; // Storing the charge here improves cache coherency in the charge spreading kernel
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
        }
 #endif
@@ -86,7 +96,11 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const real scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
@@ -128,6 +142,12 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
        // Spread the charge from this atom onto each grid point.
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -138,7 +158,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
+                    real add = charge*data[ix].x*data[iy].y*data[iz].z;
 #ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
                    // On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance.  Why?
                    // I have no idea!  And of course on AMD it produces slower performance.  GPUs are not meant to be understood.
@@ -167,7 +187,11 @@ __kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global rea
 #elif defined(DEVICE_IS_CPU)
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    if (firstx == lastx)
@@ -194,6 +218,12 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
        // Spread the charge from this atom onto each grid point.
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
        bool hasComputedThetas = false;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
@@ -229,7 +259,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
+                    pmeGrid[index] += EPSILON_FACTOR*charge*data[ix].x*data[iy].y*data[iz].z;
                }
            }
        }
@@ -237,7 +267,11 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
 }
 #else
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
        // Compute the charge on a grid point.
@@ -298,7 +332,15 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global c
        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+#ifdef USE_LJPME
+    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+#endif
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
        // real indices
@@ -317,11 +359,23 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global c
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = erfc(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
        if (kx != 0 || ky != 0 || kz != 0) {
            pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
        }
+#endif
    }
 }
@@ -330,7 +384,15 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
                      real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+ #ifdef USE_LJPME
+    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+#endif
    mixed energy = 0;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
@@ -349,8 +411,19 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = erfc(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+#endif
        if (kz >= (GRID_SIZE_Z/2+1)) {
            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
            ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
@@ -358,11 +431,12 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
        } 
        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        real2 grid = pmeGrid[indexInHalfComplexGrid];
-        if (kx != 0 || ky != 0 || kz != 0) {
+#ifndef USE_LJPME
+        if (kx != 0 || ky != 0 || kz != 0)
+#endif
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
-    }
+#if defined(USE_PME_STREAM) && !defined(USE_LJPME)
-#ifdef USE_PME_STREAM
    energyBuffer[get_global_id(0)] = 0.5f*energy;
 #else
    energyBuffer[get_global_id(0)] += 0.5f*energy;
@@ -371,7 +445,11 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
 __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX,
-        real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
+        real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const real scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
    real4 ddata[PME_ORDER];
@@ -436,7 +514,12 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
            }
        }
        real4 totalForce = forceBuffers[atom];
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
        real q = pos.w*EPSILON_FACTOR;
+#endif
        totalForce.x -= q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        totalForce.y -= q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        totalForce.z -= q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);

--- a/tests/TestDispersionPME.h
+++ b/tests/TestDispersionPME.h
@@ -222,6 +222,71 @@ void testPMEParameters() {
    force->getLJPMEParametersInContext(context2, alpha, gridx, gridy, gridz);
 }
+void testCoulombAndLJ() {
+    // Create a cloud of random particles.
+    const int numParticles = 200;
+    const double boxWidth = 5.0;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxWidth, 0, 0), Vec3(0, boxWidth, 0), Vec3(0, 0, boxWidth));
+    NonbondedForce* force = new NonbondedForce();
+    system.addForce(force);
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<double> charge(numParticles), sigma(numParticles), epsilon(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        charge[i] = -1.0+i*2.0/(numParticles-1);
+        sigma[i] = 0.1+0.2*genrand_real2(sfmt);
+        epsilon[i] = genrand_real2(sfmt);
+        force->addParticle(charge[i], 1.0, 0.0);
+        while (true) {
+            Vec3 pos = Vec3(boxWidth*genrand_real2(sfmt), boxWidth*genrand_real2(sfmt), boxWidth*genrand_real2(sfmt));
+            double minDist = boxWidth;
+            for (int j = 0; j < i; j++) {
+                Vec3 delta = pos-positions[j];
+                minDist = min(minDist, sqrt(delta.dot(delta)));
+            }
+            if (minDist > 0.1) {
+                positions[i] = pos;
+                break;
+            }
+        }
+    }
+    force->setNonbondedMethod(NonbondedForce::LJPME);
+    // Compute forces and energy with only Coulomb interactions.
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State state1 = context.getState(State::Forces | State::Energy);
+    // Now repeat with only LJ interactions.
+    for (int i = 0; i < numParticles; i++)
+        force->setParticleParameters(i, 0.0, sigma[i], epsilon[i]);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state2 = context.getState(State::Forces | State::Energy);
+    // Finally compute with both Coulomb and LJ.
+    for (int i = 0; i < numParticles; i++)
+        force->setParticleParameters(i, charge[i], sigma[i], epsilon[i]);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state3 = context.getState(State::Forces | State::Energy);
+    // Make sure the results agree.
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy()+state2.getPotentialEnergy(), state3.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i]+state2.getForces()[i], state3.getForces()[i], 1e-5);
+}
 void make_waterbox(int natoms, double boxEdgeLength, NonbondedForce *forceField,  vector<Vec3> &positions, vector<double>& eps, vector<double>& sig,
                   vector<pair<int, int> >& bonds, System &system, bool do_electrostatics) {
    const int RESSIZE = 3;
@@ -1862,6 +1927,7 @@ int main(int argc, char* argv[]) {
        testConvergence();
        testErrorTolerance();
        testPMEParameters();
+        testCoulombAndLJ();
        testWater2DpmeEnergiesForcesNoExclusions();
        testWater2DpmeEnergiesForcesWithExclusions();
        testWater125DpmeVsLongCutoffNoExclusions();