Merge pull request #1557 from andysim/dpme

Dispersion PME

Merge pull request #1557 from andysim/dpme
Dispersion PME
53f770f4 · peastman · GitHub · f36f36bf · c0850062 · 53f770f4
Commit 53f770f4 authored Feb 21, 2017 by peastman Committed by GitHub Feb 21, 2017
20 changed files
--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -628,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
 }
+void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
+}
 class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
 public:
    Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
        deviceName << name;
    }
+    size_t printfsize;
+    cuCtxGetLimit(&printfsize, CU_LIMIT_PRINTF_FIFO_SIZE);
+    cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 10*printfsize);
    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    disablePmeStream = (pmeStreamProperty == "true");
    deterministicForces = (deterministicForcesProperty == "true");

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
@@ -17,6 +17,26 @@
    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
 #endif
    real tempForce = 0.0f;
+#if HAS_LENNARD_JONES
+    // The multiplicative term to correct for the multiplicative terms that are always
+    // present in reciprocal space.  The real terms have an additive contribution
+    // added in, but for excluded terms the multiplicative term is just subtracted.
+    // These factors are needed in both clauses of the needCorrection statement, so
+    // I declare them up here.
+    #if DO_LJPME
+        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+        const real dar2 = dispersionAlphaR*dispersionAlphaR;
+        const real dar4 = dar2*dar2;
+        const real dar6 = dar4*dar2;
+        const real invR2 = invR*invR;
+        const real expDar2 = EXP(-dar2);
+        const float2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
+        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+        const real coef = invR2*invR2*invR2*c6;
+        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+        const real dprefac = eprefac + dar6/6.0f;
+    #endif
+#endif
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
@@ -29,6 +49,13 @@
            includeInteraction = false;
            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
        }
+#if HAS_LENNARD_JONES
+        #if DO_LJPME
+            // The multiplicative grid term
+            tempEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        #endif
+#endif
    }
    else {
 #if HAS_LENNARD_JONES
@@ -36,7 +63,8 @@
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
        #if USE_LJ_SWITCH
@@ -48,6 +76,22 @@
            ljEnergy *= switchValue;
        }
        #endif
+#if DO_LJPME
+        // The multiplicative grid term
+        ljEnergy += coef*(1.0f - expDar2*eprefac);
+        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        // The potential shift accounts for the step at the cutoff introduced by the
+        // transition from additive to multiplicative combintion rules and is only
+        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+        sig2 = sig*sig;
+        sig6 = sig2*sig2*sig2*INVCUT6;
+        epssig6 = eps*sig6;
+        // The additive part of the potential shift
+        ljEnergy += epssig6*(1.0f - sig6);
+        // The multiplicative part of the potential shift
+        ljEnergy += MULTSHIFT6*c6;
+#endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
 #else

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
 extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
+#ifdef USE_LJPME
+        , const float2* __restrict__ sigmaEpsilon
+#endif
+        ) {
    real3 data[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
@@ -62,7 +66,13 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
        data[0] = scale*(make_real3(1)-dr)*data[0];
        // Spread the charge from this atom onto each grid point.
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = ybase + zindex;
-                    real add = pos.w*dx*dy*data[iz].z;
+                    real add = charge*dx*dy*data[iz].z;
 #ifdef USE_DOUBLE_PRECISION
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+#ifdef USE_LJPME
+    const real recipScaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+#endif
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = halfcomplex_pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = ERFC(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+        halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
        if (kx != 0 || ky != 0 || kz != 0) {
            halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
        }
+#endif
    }
 }
@@ -156,8 +185,16 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+ #ifdef USE_LJPME
+    const real recipScaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+#endif
    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = ERFC(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+#endif
        if (kz >= (GRID_SIZE_Z/2+1)) {
            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
@@ -185,11 +233,12 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
        } 
        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
-        if (kx != 0 || ky != 0 || kz != 0) {
+#ifndef USE_LJPME
+        if (kx != 0 || ky != 0 || kz != 0)
+#endif
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
-        }
    }
-#ifdef USE_PME_STREAM
+#if defined(USE_PME_STREAM) && !defined(USE_LJPME)
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
 #else
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
 extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
+#ifdef USE_LJPME
+        , const float2* __restrict__ sigmaEpsilon
+#endif
+        ) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
                }
            }
        }
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
        real q = pos.w*EPSILON_FACTOR;
+#endif
        real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);

--- a/platforms/cuda/tests/TestCudaDispersionPME.cpp
+++ b/platforms/cuda/tests/TestCudaDispersionPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaTests.h"
+#include "TestDispersionPME.h"
+void runPlatformTests() {
+}
--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -591,8 +591,9 @@ class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform),
            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
-            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
+            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeDispersionBsplineModuliX(NULL),
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeDispersionBsplineModuliY(NULL), pmeDispersionBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeAtomRange(NULL),
+            pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), dispersionFft(NULL), pmeio(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
    /**
@@ -622,13 +623,22 @@ public:
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
    /**
     * Get the parameters being used for PME.
-     * 
+     *
     * @param alpha   the separation parameter
     * @param nx      the number of grid points along the X axis
     * @param ny      the number of grid points along the Y axis
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class SortTrait : public OpenCLSort::SortTrait {
        int getDataSize() const {return 8;}
@@ -657,6 +667,9 @@ private:
    OpenCLArray* pmeBsplineModuliX;
    OpenCLArray* pmeBsplineModuliY;
    OpenCLArray* pmeBsplineModuliZ;
+    OpenCLArray* pmeDispersionBsplineModuliX;
+    OpenCLArray* pmeDispersionBsplineModuliY;
+    OpenCLArray* pmeDispersionBsplineModuliZ;
    OpenCLArray* pmeBsplineTheta;
    OpenCLArray* pmeAtomRange;
    OpenCLArray* pmeAtomGridIndex;
@@ -665,25 +678,34 @@ private:
    cl::CommandQueue pmeQueue;
    cl::Event pmeSyncEvent;
    OpenCLFFT3D* fft;
+    OpenCLFFT3D* dispersionFft;
    Kernel cpuPme;
    PmeIO* pmeio;
    SyncQueuePostComputation* syncQueue;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
-    cl::Kernel pmeGridIndexKernel;
    cl::Kernel pmeAtomRangeKernel;
+    cl::Kernel pmeDispersionAtomRangeKernel;
    cl::Kernel pmeZIndexKernel;
+    cl::Kernel pmeDispersionZIndexKernel;
    cl::Kernel pmeUpdateBsplinesKernel;
+    cl::Kernel pmeDispersionUpdateBsplinesKernel;
    cl::Kernel pmeSpreadChargeKernel;
+    cl::Kernel pmeDispersionSpreadChargeKernel;
    cl::Kernel pmeFinishSpreadChargeKernel;
+    cl::Kernel pmeDispersionFinishSpreadChargeKernel;
    cl::Kernel pmeConvolutionKernel;
+    cl::Kernel pmeDispersionConvolutionKernel;
    cl::Kernel pmeEvalEnergyKernel;
+    cl::Kernel pmeDispersionEvalEnergyKernel;
    cl::Kernel pmeInterpolateForceKernel;
+    cl::Kernel pmeDispersionInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
-    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int gridSizeX, gridSizeY, gridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeQueue;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    bool hasCoulomb, hasLJ, usePmeQueue, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };

--- a/platforms/opencl/include/OpenCLParallelKernels.h
+++ b/platforms/opencl/include/OpenCLParallelKernels.h
@@ -431,13 +431,22 @@ public:
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
    /**
     * Get the parameters being used for PME.
-     * 
+     *
     * @param alpha   the separation parameter
     * @param nx      the number of grid points along the X axis
     * @param ny      the number of grid points along the Y axis
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class Task;
    OpenCLPlatform::PlatformData& data;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -583,6 +583,10 @@ void OpenCLParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int
    dynamic_cast<const OpenCLCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
 }
+void OpenCLParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const OpenCLCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
+}
 class OpenCLParallelCalcCustomNonbondedForceKernel::Task : public OpenCLContext::WorkTask {
 public:
    Task(ContextImpl& context, OpenCLCalcCustomNonbondedForceKernel& kernel, bool includeForce,

--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
@@ -22,6 +22,26 @@
    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
 #endif
    real tempForce = 0;
+#if HAS_LENNARD_JONES
+    // The multiplicative term to correct for the multiplicative terms that are always
+    // present in reciprocal space.  The real terms have an additive contribution
+    // added in, but for excluded terms the multiplicative term is just subtracted.
+    // These factors are needed in both clauses of the needCorrection statement, so
+    // I declare them up here.
+    #if DO_LJPME
+        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+        const real dar2 = dispersionAlphaR*dispersionAlphaR;
+        const real dar4 = dar2*dar2;
+        const real dar6 = dar4*dar2;
+        const real invR2 = invR*invR;
+        const real expDar2 = EXP(-dar2);
+        const float2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
+        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+        const real coef = invR2*invR2*invR2*c6;
+        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+        const real dprefac = eprefac + dar6/6.0f;
+    #endif
+#endif
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
@@ -34,6 +54,13 @@
            includeInteraction = false;
            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
        }
+#if HAS_LENNARD_JONES
+        #if DO_LJPME
+            // The multiplicative grid term
+            tempEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        #endif
+#endif
    }
    else {
 #if HAS_LENNARD_JONES
@@ -41,7 +68,8 @@
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
        #if USE_LJ_SWITCH
@@ -53,6 +81,22 @@
            ljEnergy *= switchValue;
        }
        #endif
+#if DO_LJPME
+        // The multiplicative grid term
+        ljEnergy += coef*(1.0f - expDar2*eprefac);
+        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        // The potential shift accounts for the step at the cutoff introduced by the
+        // transition from additive to multiplicative combintion rules and is only
+        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+        sig2 = sig*sig;
+        sig6 = sig2*sig2*sig2*INVCUT6;
+        epssig6 = eps*sig6;
+        // The additive part of the potential shift
+        ljEnergy += epssig6*(1.0f - sig6);
+        // The multiplicative part of the potential shift
+        ljEnergy += MULTSHIFT6*c6;
+#endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
 #else

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
 __kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache,
        __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const real4 scale = 1/(real) (PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
@@ -33,7 +37,13 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4
            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
-            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
+#ifdef USE_LJPME
+            const float2 sigEps = sigmaEpsilon[atom];
+            const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+            const real charge = pos.w;
+#endif
+            data[j].w = charge; // Storing the charge here improves cache coherency in the charge spreading kernel
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
        }
 #endif
@@ -86,7 +96,11 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const real scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
@@ -128,6 +142,12 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
        // Spread the charge from this atom onto each grid point.
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -138,7 +158,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
+                    real add = charge*data[ix].x*data[iy].y*data[iz].z;
 #ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
                    // On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance.  Why?
                    // I have no idea!  And of course on AMD it produces slower performance.  GPUs are not meant to be understood.
@@ -167,7 +187,11 @@ __kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global rea
 #elif defined(DEVICE_IS_CPU)
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    if (firstx == lastx)
@@ -194,6 +218,12 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
        // Spread the charge from this atom onto each grid point.
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
        bool hasComputedThetas = false;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
@@ -229,7 +259,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
+                    pmeGrid[index] += EPSILON_FACTOR*charge*data[ix].x*data[iy].y*data[iz].z;
                }
            }
        }
@@ -237,7 +267,11 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
 }
 #else
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
        // Compute the charge on a grid point.
@@ -298,7 +332,15 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global c
        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+#ifdef USE_LJPME
+    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+#endif
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
        // real indices
@@ -317,11 +359,23 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global c
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = erfc(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
        if (kx != 0 || ky != 0 || kz != 0) {
            pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
        }
+#endif
    }
 }
@@ -330,7 +384,15 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
                      real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+ #ifdef USE_LJPME
+    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+#endif
    mixed energy = 0;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
@@ -349,8 +411,19 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = erfc(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+#endif
        if (kz >= (GRID_SIZE_Z/2+1)) {
            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
            ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
@@ -358,11 +431,12 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
        } 
        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        real2 grid = pmeGrid[indexInHalfComplexGrid];
-        if (kx != 0 || ky != 0 || kz != 0) {
+#ifndef USE_LJPME
+        if (kx != 0 || ky != 0 || kz != 0)
+#endif
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
-        }
    }
-#ifdef USE_PME_STREAM
+#if defined(USE_PME_STREAM) && !defined(USE_LJPME)
    energyBuffer[get_global_id(0)] = 0.5f*energy;
 #else
    energyBuffer[get_global_id(0)] += 0.5f*energy;
@@ -371,7 +445,11 @@ __kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixe
 __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 recipBoxVecX,
-        real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
+        real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex
+#ifdef USE_LJPME
+        , __global const float2* restrict sigmaEpsilon
+#endif
+    ) {
    const real scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
    real4 ddata[PME_ORDER];
@@ -436,7 +514,12 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
            }
        }
        real4 totalForce = forceBuffers[atom];
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
        real q = pos.w*EPSILON_FACTOR;
+#endif
        totalForce.x -= q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        totalForce.y -= q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        totalForce.z -= q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);

--- a/platforms/opencl/tests/TestOpenCLDispersionPME.cpp
+++ b/platforms/opencl/tests/TestOpenCLDispersionPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "OpenCLTests.h"
+#include "TestDispersionPME.h"
+void runPlatformTests() {
+}
--- a/platforms/reference/include/ReferenceKernels.h
+++ b/platforms/reference/include/ReferenceKernels.h
@@ -604,12 +604,21 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the dispersion parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    int numParticles, num14;
    int **bonded14IndexArray;
    RealOpenMM **particleParamArray, **bonded14ParamArray;
-    RealOpenMM nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, dispersionCoefficient;
+    RealOpenMM nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, dispersionCoefficient;
-    int kmax[3], gridSize[3];
+    int kmax[3], gridSize[3], dispersionGridSize[3];
    bool useSwitchingFunction;
    std::vector<std::set<int> > exclusions;
    NonbondedMethod nonbondedMethod;

--- a/platforms/reference/include/ReferenceLJCoulombIxn.h
+++ b/platforms/reference/include/ReferenceLJCoulombIxn.h
@@ -38,14 +38,14 @@ class ReferenceLJCoulombIxn {
      bool useSwitch;
      bool periodic;
      bool ewald;
-      bool pme;
+      bool pme, ljpme;
      const OpenMM::NeighborList* neighborList;
      OpenMM::RealVec periodicBoxVectors[3];
      RealOpenMM cutoffDistance, switchingDistance;
      RealOpenMM krf, crf;
-      RealOpenMM alphaEwald;
+      RealOpenMM alphaEwald, alphaDispersionEwald;
      int numRx, numRy, numRz;
-      int meshDim[3];
+      int meshDim[3], dispersionMeshDim[3];
      // parameter indices
@@ -139,16 +139,28 @@ class ReferenceLJCoulombIxn {
      /**---------------------------------------------------------------------------------------
         Set the force to use Particle-Mesh Ewald (PME) summation.
         @param alpha    the Ewald separation parameter
         @param gridSize the dimensions of the mesh
         --------------------------------------------------------------------------------------- */
      void setUsePME(RealOpenMM alpha, int meshSize[3]);
+      /**---------------------------------------------------------------------------------------
+         Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+         @param dalpha    the dispersion Ewald separation parameter
+         @param dgridSize the dimensions of the dispersion mesh
+         --------------------------------------------------------------------------------------- */
+      void setUseLJPME(RealOpenMM dalpha, int dmeshSize[3]);
      /**---------------------------------------------------------------------------------------
         Calculate LJ Coulomb pair ixn

--- a/platforms/reference/include/ReferencePME.h
+++ b/platforms/reference/include/ReferencePME.h
@@ -87,6 +87,28 @@ pme_exec(pme_t       pme,
         RealOpenMM *    energy);
+/*
+ * Evaluate reciprocal space PME dispersion energy and forces.
+ *
+ * Args:
+ *
+ * pme         Opaque pme_t object, must have been initialized with pme_init()
+ * x           Pointer to coordinate data array (nm)
+ * f           Pointer to force data array (will be written as kJ/mol/nm)
+ * c6s         Array of c6 coefficients (units of sqrt(kJ/mol).nm^3 )
+ * box         Simulation cell dimensions (nm)
+ * energy      Total energy (will be written in units of kJ/mol)
+ */
+int OPENMM_EXPORT
+pme_exec_dpme(pme_t       pme,
+              const std::vector<OpenMM::RealVec>& atomCoordinates,
+              std::vector<OpenMM::RealVec>& forces,
+              const std::vector<RealOpenMM>& c6s,
+              const OpenMM::RealVec  periodicBoxVectors[3],
+              RealOpenMM *    energy);
 /* Release all memory in pme structure */
 int OPENMM_EXPORT
@@ -94,4 +116,4 @@ pme_destroy(pme_t    pme);
 } // namespace OpenMM
 #endif // __ReferencePME_H__
\ No newline at end of file
--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -969,9 +969,17 @@ void ReferenceCalcNonbondedForceKernel::initialize(const System& system, const N
    }
    else if (nonbondedMethod == PME) {
        double alpha;
-        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2]);
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
        ewaldAlpha = (RealOpenMM) alpha;
    }
+    else if (nonbondedMethod == LJPME) {
+        double alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
+        ewaldAlpha = (RealOpenMM) alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], true);
+        ewaldDispersionAlpha = (RealOpenMM) alpha;
+        useSwitchingFunction = false;
+    }
    rfDielectric = (RealOpenMM)force.getReactionFieldDielectric();
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
@@ -987,11 +995,12 @@ double ReferenceCalcNonbondedForceKernel::execute(ContextImpl& context, bool inc
    bool periodic = (nonbondedMethod == CutoffPeriodic);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
+    bool ljpme = (nonbondedMethod == LJPME);
    if (nonbondedMethod != NoCutoff) {
-        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, extractBoxVectors(context), periodic || ewald || pme, nonbondedCutoff, 0.0);
+        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, extractBoxVectors(context), periodic || ewald || pme || ljpme, nonbondedCutoff, 0.0);
        clj.setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
    }
-    if (periodic || ewald || pme) {
+    if (periodic || ewald || pme || ljpme) {
        RealVec* boxVectors = extractBoxVectors(context);
        double minAllowedSize = 1.999999*nonbondedCutoff;
        if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize)
@@ -1002,6 +1011,10 @@ double ReferenceCalcNonbondedForceKernel::execute(ContextImpl& context, bool inc
        clj.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
    if (pme)
        clj.setUsePME(ewaldAlpha, gridSize);
+    if (ljpme){
+        clj.setUsePME(ewaldAlpha, gridSize);
+        clj.setUseLJPME(ewaldDispersionAlpha, dispersionGridSize);
+    }
    if (useSwitchingFunction)
        clj.setUseSwitchingFunction(switchingDistance);
    clj.calculatePairIxn(numParticles, posData, particleParamArray, exclusions, 0, forceData, 0, includeEnergy ? &energy : NULL, includeDirect, includeReciprocal);
@@ -1059,14 +1072,23 @@ void ReferenceCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& con
 }
 void ReferenceCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
-    if (nonbondedMethod != PME)
+    if (nonbondedMethod != PME && nonbondedMethod != LJPME)
-        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
+        throw OpenMMException("getPMEParametersInContext: This Context is not using PME or LJPME");
    alpha = ewaldAlpha;
    nx = gridSize[0];
    ny = gridSize[1];
    nz = gridSize[2];
 }
+void ReferenceCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != LJPME)
+        throw OpenMMException("getPMEParametersInContext: This Context is not using LJPME");
+    alpha = ewaldDispersionAlpha;
+    nx = dispersionGridSize[0];
+    ny = dispersionGridSize[1];
+    nz = dispersionGridSize[2];
+}
 ReferenceCalcCustomNonbondedForceKernel::~ReferenceCalcCustomNonbondedForceKernel() {
    disposeRealArray(particleParamArray, numParticles);
    if (neighborList != NULL)

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
--- a/platforms/reference/src/SimTKReference/ReferencePME.cpp
+++ b/platforms/reference/src/SimTKReference/ReferencePME.cpp
@@ -513,6 +513,106 @@ pme_reciprocal_convolution(pme_t     pme,
 }
+static void
+dpme_reciprocal_convolution(pme_t     pme,
+                           const RealVec periodicBoxVectors[3],
+                           const RealVec recipBoxVectors[3],
+                           RealOpenMM *  energy)
+{
+    int kx,ky,kz;
+    int nx,ny,nz;
+    RealOpenMM mx,my,mz;
+    RealOpenMM mhx,mhy,mhz,m2;
+    RealOpenMM bx,by,bz;
+    RealOpenMM d1,d2;
+    RealOpenMM eterm,struct2,ets2;
+    RealOpenMM esum;
+    RealOpenMM denom;
+    RealOpenMM boxfactor;
+    RealOpenMM maxkx,maxky,maxkz;
+    t_complex *ptr;
+    nx = pme->ngrid[0];
+    ny = pme->ngrid[1];
+    nz = pme->ngrid[2];
+    boxfactor = (RealOpenMM) M_PI*sqrt(M_PI) / (6.0*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
+    esum = 0;
+    maxkx = (RealOpenMM) ((nx+1)/2);
+    maxky = (RealOpenMM) ((ny+1)/2);
+    maxkz = (RealOpenMM) ((nz+1)/2);
+    RealOpenMM bfac = M_PI / pme->ewaldcoeff;
+    RealOpenMM fac1 = 2.0*M_PI*M_PI*M_PI*sqrt(M_PI);
+    RealOpenMM fac2 = pme->ewaldcoeff*pme->ewaldcoeff*pme->ewaldcoeff;
+    RealOpenMM fac3 = -2.0*pme->ewaldcoeff*M_PI*M_PI;
+    RealOpenMM b, m, m3, expfac, expterm, erfcterm;
+    for (kx=0;kx<nx;kx++)
+    {
+        /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+        mx  = (RealOpenMM) ((kx<maxkx) ? kx : (kx-nx));
+        mhx = mx*recipBoxVectors[0][0];
+        bx  = pme->bsplines_moduli[0][kx];
+        for (ky=0;ky<ny;ky++)
+        {
+            /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+            my  = (RealOpenMM) ((ky<maxky) ? ky : (ky-ny));
+            mhy = mx*recipBoxVectors[1][0]+my*recipBoxVectors[1][1];
+            by  = pme->bsplines_moduli[1][ky];
+            for (kz=0;kz<nz;kz++)
+            {
+                /*
+                 * Unlike the Coulombic case, there's an m=0 term so all terms are considered here.
+                 */
+                /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+                mz        = (RealOpenMM) ((kz<maxkz) ? kz : (kz-nz));
+                mhz       = mx*recipBoxVectors[2][0]+my*recipBoxVectors[2][1]+mz*recipBoxVectors[2][2];
+                /* Pointer to the grid cell in question */
+                ptr       = pme->grid + kx*ny*nz + ky*nz + kz;
+                /* Get grid data for this frequency */
+                d1        = ptr->re;
+                d2        = ptr->im;
+                /* Calculate the convolution - see the Essman/Darden paper for the equation! */
+                m2        = mhx*mhx+mhy*mhy+mhz*mhz;
+                bz        = pme->bsplines_moduli[2][kz];
+                denom     = boxfactor / (bx*by*bz);
+                m = sqrt(m2);
+                m3 = m*m2;
+                b = bfac*m;
+                expfac = -b*b;
+                erfcterm = erfc(b);
+                expterm = exp(expfac);
+                eterm     = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+                /* write back convolution data to grid */
+                ptr->re   = d1*eterm;
+                ptr->im   = d2*eterm;
+                struct2   = (d1*d1+d2*d2);
+                /* Long-range PME contribution to the energy for this frequency */
+                ets2      = eterm*struct2;
+                esum     += ets2;
+            }
+        }
+    }
+    // Remember the C6 energy is attractive, hence the negative sign.
+    *energy = (RealOpenMM) (-esum);
+}
 static void
 pme_grid_interpolate_force(pme_t pme,
                           const RealVec recipBoxVectors[3],
@@ -704,6 +804,49 @@ int pme_exec(pme_t       pme,
 }
+int pme_exec_dpme(pme_t       pme,
+             const vector<RealVec>& atomCoordinates,
+             vector<RealVec>& forces,
+             const vector<RealOpenMM>& c6s,
+             const RealVec periodicBoxVectors[3],
+             RealOpenMM* energy)
+{
+    /* Routine is called with coordinates in x, a box, and charges in q */
+    RealVec recipBoxVectors[3];
+    invert_box_vectors(periodicBoxVectors, recipBoxVectors);
+    /* Before we can do the actual interpolation, we need to recalculate and update
+     * the indices for each particle in the charge grid (initialized in pme_init()),
+     * and what its fractional offset in this grid cell is.
+     */
+    /* Update charge grid indices and fractional offsets for each atom.
+     * The indices/fractions are stored internally in the pme datatype
+     */
+    pme_update_grid_index_and_fraction(pme,atomCoordinates,periodicBoxVectors,recipBoxVectors);
+    /* Calculate bsplines (and their differentials) from current fractional coordinates, store in pme structure */
+    pme_update_bsplines(pme);
+    /* Spread the charges on grid (using newly calculated bsplines in the pme structure) */
+    pme_grid_spread_charge(pme, c6s);
+    /* do 3d-fft */
+    fftpack_exec_3d(pme->fftplan,FFTPACK_FORWARD,pme->grid,pme->grid);
+    /* solve in k-space */
+    dpme_reciprocal_convolution(pme,periodicBoxVectors,recipBoxVectors,energy);
+    /* do 3d-invfft */
+    fftpack_exec_3d(pme->fftplan,FFTPACK_BACKWARD,pme->grid,pme->grid);
+    /* Get the particle forces from the grid and bsplines in the pme structure */
+    pme_grid_interpolate_force(pme,recipBoxVectors,c6s,forces);
+    return 0;
+}
 int
 pme_destroy(pme_t    pme)

--- a/platforms/reference/tests/TestReferenceDispersionPME.cpp
+++ b/platforms/reference/tests/TestReferenceDispersionPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "ReferenceTests.h"
+#include "TestDispersionPME.h"
+void runPlatformTests() {
+}
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1155,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            NonbondedForce nb;
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
-            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ);
+            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
            gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
            gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
            gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);