Merge branch 'dpme' of https://github.com/andysim/openmm into ljpme

a9f65649 · Peter Eastman · 9567ddb3 · 58b6e3b6 · a9f65649 · a9f65649
Commit a9f65649 authored Jan 23, 2017 by Peter Eastman
20 changed files
--- a/olla/include/openmm/kernels.h
+++ b/olla/include/openmm/kernels.h
@@ -555,7 +555,8 @@ public:
        CutoffNonPeriodic = 1,
        CutoffPeriodic = 2,
        Ewald = 3,
-        PME = 4
+        PME = 4,
+        LJPME = 5
    };
    static std::string Name() {
        return "CalcNonbondedForce";
@@ -589,13 +590,22 @@ public:
    virtual void copyParametersToContext(ContextImpl& context, const NonbondedForce& force) = 0;
    /**
     * Get the parameters being used for PME.
-     * 
+     *
     * @param alpha   the separation parameter
     * @param nx      the number of grid points along the X axis
     * @param ny      the number of grid points along the Y axis
     * @param nz      the number of grid points along the Z axis
     */
    virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
+    /**
+     * Get the parameters being used for the dispersion terms in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    virtual void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
 };

 /**
@@ -1335,6 +1345,57 @@ public:
 };


+/**
+ * This kernel performs the dispersion reciprocal space calculation for LJPME.  In most cases, this
+ * calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
+ * In some cases it may want to outsource the work to a different kernel.  In particular,
+ * GPU based platforms sometimes use a CPU based implementation provided by a separate
+ * plugin.
+ */
+class CalcDispersionPmeReciprocalForceKernel : public KernelImpl {
+public:
+    class IO;
+    static std::string Name() {
+        return "CalcDispersionPmeReciprocalForce";
+    }
+    CalcDispersionPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
+    }
+    /**
+     * Initialize the kernel.
+     * 
+     * @param gridx        the x size of the PME grid
+     * @param gridy        the y size of the PME grid
+     * @param gridz        the z size of the PME grid
+     * @param numParticles the number of particles in the system
+     * @param alpha        the Ewald blending parameter
+     */
+    virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
+    /**
+     * Begin computing the force and energy.
+     *
+     * @param io                  an object that coordinates data transfer
+     * @param periodicBoxVectors  the vectors defining the periodic box (measured in nm)
+     * @param includeEnergy       true if potential energy should be computed
+     */
+    virtual void beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) = 0;
+    /**
+     * Finish computing the force and energy.
+     * 
+     * @param io   an object that coordinates data transfer
+     * @return the potential energy due to the dispersion PME reciprocal space interactions
+     */
+    virtual double finishComputation(IO& io) = 0;
+    /**
+     * Get the parameters being used for dispersion PME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_KERNELS_H_*/
--- a/openmmapi/include/openmm/NonbondedForce.h
+++ b/openmmapi/include/openmm/NonbondedForce.h
@@ -109,7 +109,12 @@ public:
         * Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
         * with all periodic copies of every other particle.
         */
-        PME = 4
+        PME = 4,
+        /**
+         * Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
+         * with all periodic copies of every other particle for both electrostatics and dispersion.  No switching is used for either interaction.
+         */
+        LJPME = 5
    };
    /**
     * Create a NonbondedForce.
@@ -207,6 +212,16 @@ public:
     * @param[out] nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters to use for the dispersion term in LJPME calculations.  If alpha is 0 (the default),
+     * these parameters are ignored and instead their values are chosen based on the Ewald error tolerance.
+     *
+     * @param[out] alpha   the separation parameter
+     * @param[out] nx      the number of dispersion grid points along the X axis
+     * @param[out] ny      the number of dispersion grid points along the Y axis
+     * @param[out] nz      the number of dispersion grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
    /**
     * Set the parameters to use for PME calculations.  If alpha is 0 (the default), these parameters are
     * ignored and instead their values are chosen based on the Ewald error tolerance.
@@ -217,6 +232,16 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void setPMEParameters(double alpha, int nx, int ny, int nz);
+    /**
+     * Set the parameters to use for the dispersion term in LJPME calculations.  If alpha is 0 (the default),
+     * these parameters are ignored and instead their values are chosen based on the Ewald error tolerance.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void setLJPMEParameters(double alpha, int nx, int ny, int nz);
    /**
     * Get the parameters being used for PME in a particular Context.  Because some platforms have restrictions
     * on the allowed grid sizes, the values that are actually used may be slightly different from those
@@ -230,6 +255,19 @@ public:
     * @param[out] nz      the number of grid points along the Z axis
     */
    void getPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the PME parameters being used for the dispersion term for LJPME in a particular Context.  Because some
+     * platforms have restrictions on the allowed grid sizes, the values that are actually used may be slightly different
+     * from those specified with setLJPMEParameters(), or the standard values calculated based on the Ewald error tolerance.
+     * See the manual for details.
+     *
+     * @param context      the Context for which to get the parameters
+     * @param[out] alpha   the separation parameter
+     * @param[out] nx      the number of grid points along the X axis
+     * @param[out] ny      the number of grid points along the Y axis
+     * @param[out] nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const;
    /**
     * Add the nonbonded force parameters for a particle.  This should be called once for each particle
     * in the System.  When it is called for the i'th time, it specifies the parameters for the i'th particle.
@@ -382,9 +420,9 @@ private:
    class ParticleInfo;
    class ExceptionInfo;
    NonbondedMethod nonbondedMethod;
-    double cutoffDistance, switchingDistance, rfDielectric, ewaldErrorTol, alpha;
+    double cutoffDistance, switchingDistance, rfDielectric, ewaldErrorTol, alpha, dalpha;
    bool useSwitchingFunction, useDispersionCorrection;
-    int recipForceGroup, nx, ny, nz;
+    int recipForceGroup, nx, ny, nz, dnx, dny, dnz;
    void addExclusionsToSet(const std::vector<std::set<int> >& bonded12, std::set<int>& exclusions, int baseParticle, int fromParticle, int currentLevel) const;
    std::vector<ParticleInfo> particles;
    std::vector<ExceptionInfo> exceptions;

--- a/openmmapi/include/openmm/internal/NonbondedForceImpl.h
+++ b/openmmapi/include/openmm/internal/NonbondedForceImpl.h
@@ -65,6 +65,7 @@ public:
    std::vector<std::string> getKernelNames();
    void updateParametersInContext(ContextImpl& context);
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
    /**
     * This is a utility routine that calculates the values to use for alpha and kmax when using
     * Ewald summation.
@@ -74,7 +75,7 @@ public:
     * This is a utility routine that calculates the values to use for alpha and grid size when using
     * Particle Mesh Ewald.
     */
-    static void calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize);
+    static void calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool LJ);
    /**
     * Compute the coefficient which, when divided by the periodic box volume, gives the
     * long range dispersion correction to the energy.

--- a/openmmapi/src/NonbondedForce.cpp
+++ b/openmmapi/src/NonbondedForce.cpp
@@ -106,6 +106,13 @@ void NonbondedForce::getPMEParameters(double& alpha, int& nx, int& ny, int& nz)
    nz = this->nz;
 }

+void NonbondedForce::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const { // report the stored dispersion PME settings
+    alpha = dalpha; // separation parameter for the dispersion term
+    nx = dnx;       // dispersion grid dimensions; the arguments do not shadow the d-prefixed members
+    ny = dny;
+    nz = dnz;
+}
+
 void NonbondedForce::setPMEParameters(double alpha, int nx, int ny, int nz) {
    this->alpha = alpha;
    this->nx = nx;
@@ -113,10 +120,21 @@ void NonbondedForce::setPMEParameters(double alpha, int nx, int ny, int nz) {
    this->nz = nz;
 }

+void NonbondedForce::setLJPMEParameters(double alpha, int nx, int ny, int nz) { // store the requested dispersion PME settings
+    dalpha = alpha; // separation parameter for the dispersion term
+    dnx = nx;       // dispersion grid dimensions; the arguments do not shadow the d-prefixed members
+    dny = ny;
+    dnz = nz;
+}
+
 void NonbondedForce::getPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const {
    dynamic_cast<const NonbondedForceImpl&>(getImplInContext(context)).getPMEParameters(alpha, nx, ny, nz);
 }

+void NonbondedForce::getLJPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const NonbondedForceImpl&>(getImplInContext(context)).getLJPMEParameters(alpha, nx, ny, nz); // forward to this force's NonbondedForceImpl in the given context
+}
+
 int NonbondedForce::addParticle(double charge, double sigma, double epsilon) {
    particles.push_back(ParticleInfo(charge, sigma, epsilon));
    return particles.size()-1;

--- a/openmmapi/src/NonbondedForceImpl.cpp
+++ b/openmmapi/src/NonbondedForceImpl.cpp
@@ -151,8 +151,12 @@ void NonbondedForceImpl::calcEwaldParameters(const System& system, const Nonbond
        kmaxz++;
 }

-void NonbondedForceImpl::calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize) {
-    force.getPMEParameters(alpha, xsize, ysize, zsize);
+void NonbondedForceImpl::calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool LJ) {
+    if(LJ) {
+        force.getLJPMEParameters(alpha, xsize, ysize, zsize);
+    } else {
+        force.getPMEParameters(alpha, xsize, ysize, zsize);
+    }
    if (alpha == 0.0) {
        Vec3 boxVectors[3];
        system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
@@ -283,3 +287,7 @@ void NonbondedForceImpl::updateParametersInContext(ContextImpl& context) {
 void NonbondedForceImpl::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    kernel.getAs<CalcNonbondedForceKernel>().getPMEParameters(alpha, nx, ny, nz);
 }
+
+void NonbondedForceImpl::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    kernel.getAs<CalcNonbondedForceKernel>().getLJPMEParameters(alpha, nx, ny, nz); // forward to the nonbonded kernel, which fills in the dispersion PME values
+}
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -251,27 +251,37 @@ public:
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
    /**
     * Get the parameters being used for PME.
-     * 
+     *
     * @param alpha   the separation parameter
     * @param nx      the number of grid points along the X axis
     * @param ny      the number of grid points along the Y axis
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class PmeIO;
    CpuPlatform::PlatformData& data;
    int numParticles, num14;
    int **bonded14IndexArray;
    double **bonded14ParamArray;
-    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldSelfEnergy, dispersionCoefficient;
-    int kmax[3], gridSize[3];
-    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme;
+    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient;
+    int kmax[3], gridSize[3], dispersionGridSize[3];
+    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme;
    std::vector<std::set<int> > exclusions;
    std::vector<std::pair<float, float> > particleParams;
+    std::vector<float> C6params;
    NonbondedMethod nonbondedMethod;
    CpuNonbondedForce* nonbonded;
-    Kernel optimizedPme;
+    Kernel optimizedPme, optimizedDispersionPme;
    CpuBondForce bondForce;
 };


--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -104,16 +104,27 @@ class CpuNonbondedForce {

     
      /**---------------------------------------------------------------------------------------
-      
+
         Set the force to use Particle-Mesh Ewald (PME) summation.
-      
+
         @param alpha    the Ewald separation parameter
         @param gridSize the dimensions of the mesh
-      
+
         --------------------------------------------------------------------------------------- */
-      
+
      void setUsePME(float alpha, int meshSize[3]);

+      /**---------------------------------------------------------------------------------------
+
+         Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+
+         @param alpha    the Ewald separation parameter
+         @param meshSize the dimensions of the mesh
+
+         --------------------------------------------------------------------------------------- */
+
+      void setUseLJPME(float alpha, int meshSize[3]);
+
      /**---------------------------------------------------------------------------------------
      
         Calculate Ewald ixn
@@ -122,16 +133,17 @@ class CpuNonbondedForce {
         @param posq             atom coordinates and charges
         @param atomCoordinates  atom coordinates (in format needed by PME)
         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
+         @param C6params         C6 parameters for multiplicative representation of dispersion
         @param exclusions       atom exclusion indices
                                 exclusions[atomIndex] contains the list of exclusions for that atom
         @param forces           force array (forces added)
         @param totalEnergy      total energy
            
         --------------------------------------------------------------------------------------- */
-          
+
      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
-                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
-                            std::vector<RealVec>& forces, double* totalEnergy) const;
+                                  const std::vector<std::pair<float, float> >& atomParameters, const std::vector<float> &C6params,
+                                  const std::vector<std::set<int> >& exclusions, std::vector<RealVec>& forces, double* totalEnergy) const;
      
      /**---------------------------------------------------------------------------------------
      
@@ -150,7 +162,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
          
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+            const std::vector<float>& C6params, const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
@@ -163,28 +175,32 @@ protected:
        bool periodic;
        bool triclinic;
        bool ewald;
-        bool pme;
-        bool tableIsValid;
+        bool ljpme, pme;
+        bool tableIsValid, expTableIsValid;
        const CpuNeighborList* neighborList;
        float recipBoxSize[3];
        RealVec periodicBoxVectors[3];
        AlignedArray<fvec4> periodicBoxVec4;
        float cutoffDistance, switchingDistance;
        float krf, crf;
-        float alphaEwald;
+        float alphaEwald, alphaDispersionEwald;
        int numRx, numRy, numRz;
-        int meshDim[3];
+        int meshDim[3], dispersionMeshDim[3];
        std::vector<float> erfcTable, ewaldScaleTable;
-        float ewaldDX, ewaldDXInv, erfcDXInv;
+        std::vector<float> exptermsTable, dExptermsTable;
+        float ewaldDX, ewaldDXInv, erfcDXInv, exptermsDX, exptermsDXInv;
        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
        float* posq;
        RealVec const* atomCoordinates;
-        std::pair<float, float> const* atomParameters;        
+        std::pair<float, float> const* atomParameters;
+        float const *C6params;
        std::set<int> const* exclusions;
        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
+        float inverseRcut6;
+        float inverseRcut6Expterm;
        void* atomicCounter;

        static const float TWO_OVER_SQRT_PI;
@@ -238,10 +254,29 @@ protected:
       */
      void tabulateEwaldScaleFactor();

+      /**
+       * Create a lookup table for the scale factor used with dispersion PME.
+       */
+      void tabulateExpTerms();
+
      /**
       * Compute a fast approximation to erfc(x).
       */
      float erfcApprox(float x);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      float exptermsApprox(float R);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      float dExptermsApprox(float R);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
@@ -88,11 +88,25 @@ protected:
       * Compute a fast approximation to erfc(x).
       */
      fvec4 erfcApprox(const fvec4& x);
-      
+
      /**
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
      fvec4 ewaldScaleFunction(const fvec4& x);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      fvec4 exptermsApprox(const fvec4& R);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      fvec4 dExptermsApprox(const fvec4& R);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
@@ -92,6 +92,21 @@ protected:
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
      fvec8 ewaldScaleFunction(const fvec8& x);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      fvec8 exptermsApprox(const fvec8& R);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      fvec8 dExptermsApprox(const fvec8& R);
+
 };

 } // namespace OpenMM

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -50,6 +50,7 @@
 #include "lepton/CustomFunction.h"
 #include "lepton/Operation.h"
 #include "lepton/Parser.h"
+#include <iostream>
 #include "lepton/ParsedExpression.h"

 using namespace OpenMM;
@@ -528,7 +529,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec4();
 CpuNonbondedForce* createCpuNonbondedForceVec8();

 CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), nonbonded(NULL) {
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
    if (isVec8Supported())
        nonbonded = createCpuNonbondedForceVec8();
    else
@@ -575,12 +576,14 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    for (int i = 0; i < num14; i++)
        bonded14ParamArray[i] = new double[3];
    particleParams.resize(numParticles);
+    C6params.resize(numParticles);
    double sumSquaredCharges = 0.0;
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
+        C6params[i] = 8.0*pow(particleParams[i].first, 3.0) * particleParams[i].second;
        sumSquaredCharges += charge*charge;
    }
    
@@ -616,19 +619,35 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    }
    else if (nonbondedMethod == PME) {
        double alpha;
-        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2]);
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
        ewaldAlpha = alpha;
    }
-    if (nonbondedMethod == Ewald || nonbondedMethod == PME)
+    else if (nonbondedMethod == LJPME) {
+        double alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
+        ewaldAlpha = (RealOpenMM) alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], true);
+        ewaldDispersionAlpha = (RealOpenMM) alpha;
+        useSwitchingFunction = false;
+    }
+
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
        ewaldSelfEnergy = -ONE_4PI_EPS0*ewaldAlpha*sumSquaredCharges/sqrt(M_PI);
-    else
+        if(nonbondedMethod == LJPME){
+            for (int atom = 0; atom < numParticles; atom++) {
+                // Dispersion self term
+                ewaldSelfEnergy += pow(ewaldDispersionAlpha, 6.0) * C6params[atom]*C6params[atom] / 12.0;
+            }
+        }
+    } else {
        ewaldSelfEnergy = 0.0;
+    }
    rfDielectric = force.getReactionFieldDielectric();
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
-    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
+    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME);
 }

 double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
@@ -646,6 +665,20 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
            }
        }
+        if (nonbondedMethod == LJPME) {
+            // If available, use the optimized PME implementation.
+
+            vector<string> kernelNames;
+            kernelNames.push_back("CalcPmeReciprocalForce");
+            useOptimizedPme = getPlatform().supportsKernels(kernelNames);
+            if (useOptimizedPme) {
+                optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
+                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
+                optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
+                optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
+                                                                                                  dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
+            }
+        }
    }
    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
@@ -654,6 +687,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
+    bool ljpme = (nonbondedMethod == LJPME);
    if (nonbondedMethod != NoCutoff)
        nonbonded->setUseCutoff(nonbondedCutoff, *data.neighborList, rfDielectric);
    if (data.isPeriodic) {
@@ -669,9 +703,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded->setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
        nonbonded->setUseSwitchingFunction(switchingDistance);
+    if (ljpme){
+        nonbonded->setUsePME(ewaldAlpha, gridSize);
+        nonbonded->setUseLJPME(ewaldDispersionAlpha, dispersionGridSize);
+    }
    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
+        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
@@ -680,13 +718,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
-            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
+            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
    if (includeDirect) {
        ReferenceLJCoulomb14 nonbonded14;
        bondForce.calculateForce(posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
-        if (data.isPeriodic)
+        if (data.isPeriodic && nonbondedMethod != LJPME)
            energy += dispersionCoefficient/(boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2]);
    }
    return energy;
@@ -739,7 +777,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 }

 void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
-    if (nonbondedMethod != PME)
+    if (nonbondedMethod != PME && nonbondedMethod != LJPME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    if (useOptimizedPme)
        optimizedPme.getAs<const CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
@@ -751,6 +789,19 @@ void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
    }
 }

+void CpuCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const { // report the dispersion PME parameters in use; throws OpenMMException unless the method is LJPME
+    if (nonbondedMethod != LJPME)
+        throw OpenMMException("getLJPMEParametersInContext: This Context is not using LJPME");
+    if (useOptimizedPme) // the dispersion kernel is created as a CalcDispersionPmeReciprocalForceKernel, so it must be queried as that type
+        optimizedDispersionPme.getAs<const CalcDispersionPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+    else {
+        alpha = ewaldDispersionAlpha; // values computed in initialize() for the dispersion grid
+        nx = dispersionGridSize[0];
+        ny = dispersionGridSize[1];
+        nz = dispersionGridSize[2];
+    }
+}
+
 CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) :
            CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) {
 }

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -30,6 +30,7 @@
 #include "ReferencePME.h"
 #include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
+#include <iostream>

 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -57,7 +58,8 @@ public:

   --------------------------------------------------------------------------------------- */

-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), ljpme(false), pme(false), tableIsValid(false), expTableIsValid(false),
+    cutoffDistance(0.0f), alphaEwald(0.0f), alphaDispersionEwald(0.0f), inverseRcut6(0.0f), inverseRcut6Expterm(0.0f) { // initializers follow the member declaration order; the cutoff caches start at zero instead of being left uninitialized
 }

 CpuNonbondedForce::~CpuNonbondedForce() {
@@ -78,10 +80,21 @@ void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neig
        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
+    inverseRcut6 = pow(cutoffDistance, -6);
    neighborList = &neighbors;
    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
-  }
+    if(alphaDispersionEwald != 0.0f){
+        // We set this here, in case setUseCutoff is called after the dispersion alpha is set.
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+    }
+
+}

 /**---------------------------------------------------------------------------------------

@@ -96,7 +109,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    switchingDistance = distance;
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use periodic boundary conditions.  This requires that a cutoff has
     also been set, and the smallest side of the periodic box is at least twice the cutoff
@@ -106,7 +119,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {

    assert(cutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
@@ -124,11 +137,11 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    periodicBoxVec4[1] = fvec4(periodicBoxVectors[1][0], periodicBoxVectors[1][1], periodicBoxVectors[1][2], 0);
    periodicBoxVec4[2] = fvec4(periodicBoxVectors[2][0], periodicBoxVectors[2][1], periodicBoxVectors[2][2], 0);
    triclinic = (periodicBoxVectors[0][1] != 0.0 || periodicBoxVectors[0][2] != 0.0 ||
-                 periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
-                 periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
-  }
+            periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
+            periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Ewald summation.

@@ -139,18 +152,18 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
-      if (alpha != alphaEwald)
-          tableIsValid = false;
-      alphaEwald = alpha;
-      numRx = kmaxx;
-      numRy = kmaxy;
-      numRz = kmaxz;
-      ewald = true;
-      tabulateEwaldScaleFactor();
-  }
+void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+    // The cached table stays valid only if the separation parameter is unchanged.
+    tableIsValid = tableIsValid && (alpha == alphaEwald);
+    ewald = true;
+    alphaEwald = alpha;
+    numRx = kmaxx;
+    numRy = kmaxy;
+    numRz = kmaxz;
+    tabulateEwaldScaleFactor();
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Particle-Mesh Ewald (PME) summation.

@@ -159,19 +172,49 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
-      if (alpha != alphaEwald)
-          tableIsValid = false;
-      alphaEwald = alpha;
-      meshDim[0] = meshSize[0];
-      meshDim[1] = meshSize[1];
-      meshDim[2] = meshSize[2];
-      pme = true;
-      tabulateEwaldScaleFactor();
-  }
-
-  
-  void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+    // The cached table stays valid only if the separation parameter is unchanged.
+    tableIsValid = tableIsValid && (alpha == alphaEwald);
+    pme = true;
+    alphaEwald = alpha;
+    for (int axis = 0; axis < 3; axis++)
+        meshDim[axis] = meshSize[axis];
+    // Returns immediately when the table is still marked valid.
+    tabulateEwaldScaleFactor();
+}
+
+
+/**---------------------------------------------------------------------------------------
+
+     Set the force to use Particle-Mesh Ewald (PME) summation for dispersion (LJPME).
+
+     @param alpha     the Ewald separation parameter for the dispersion terms
+     @param meshSize  the dimensions of the dispersion mesh
+
+     --------------------------------------------------------------------------------------- */
+
+void CpuNonbondedForce::setUseLJPME(float alpha, int meshSize[3]) {
+    if (alpha != alphaDispersionEwald)
+        expTableIsValid = false; // the tabulated exponential terms depend on alpha
+    alphaDispersionEwald = alpha;
+    dispersionMeshDim[0] = meshSize[0];
+    dispersionMeshDim[1] = meshSize[1];
+    dispersionMeshDim[2] = meshSize[2];
+    ljpme = true;
+    tabulateExpTerms(); // NOTE(review): the table spacing is derived from cutoffDistance; confirm the cutoff is always set first.
+    if(cutoffDistance != 0.0f){
+        // We set this here, in case setUseLJPME is called after the cutoff is set.
+        // It is inverseRcut6 scaled by the expression stored in exptermsTable, evaluated at the cutoff distance.
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+    }
+}
+
+
+void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    if (tableIsValid)
        return;
    tableIsValid = true;
@@ -187,10 +230,30 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
        ewaldScaleTable[i] = erfcTable[i] + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
 }
-  
+
+// Build the lookup tables read by exptermsApprox() and dExptermsApprox(); does nothing while they are marked valid.
+void CpuNonbondedForce::tabulateExpTerms() {
+    if (expTableIsValid) return;
+    expTableIsValid = true;
+    exptermsDX = cutoffDistance/NUM_TABLE_POINTS; // spacing between tabulated distances
+    exptermsDXInv = 1.0f/exptermsDX;
+    exptermsTable.resize(NUM_TABLE_POINTS+4);
+    dExptermsTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        double r = i*exptermsDX; // sample at this table's own spacing (lookups scale by exptermsDXInv), not ewaldDX
+        double dalphaR = alphaDispersionEwald*r;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        exptermsTable[i]  = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4)); // factor applied to the energy term
+        dExptermsTable[i] = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4 + dar6/6.0)); // factor applied to the force term
+    }
+}
+
 void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
-                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
-                                             vector<RealVec>& forces, double* totalEnergy) const {
+                                               const vector<pair<float, float> >& atomParameters, const vector<float> &C6params, const vector<set<int> >& exclusions,
+                                               vector<RealVec>& forces, double* totalEnergy) const {
    typedef std::complex<float> d_complex;

    static const float epsilon     =  1.0;
@@ -211,6 +274,29 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        if (totalEnergy)
            *totalEnergy += recipEnergy;
        pme_destroy(pmedata);
+
+        if (ljpme) {
+            // Dispersion reciprocal space terms
+            pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
+
+            std::vector<RealVec> dpmeforces;
+            for (int i = 0; i < numberOfAtoms; i++){
+                charges[i] = (RealOpenMM)C6params[i];
+                dpmeforces.push_back(RealVec());
+            }
+            RealOpenMM recipDispersionEnergy    = 0.0;
+            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
+            for (int i = 0; i < numberOfAtoms; i++){
+                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i][1] -= 2.0*dpmeforces[i][1];
+                forces[i][2] -= 2.0*dpmeforces[i][2];
+            }
+            if (totalEnergy)
+                *totalEnergy += recipDispersionEnergy;
+
+            pme_destroy(pmedata);
+        }
+
    }

    // Ewald method
@@ -224,7 +310,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c

        // setup K-vectors

-        #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
        vector<d_complex> eir(kmax*numberOfAtoms*3);
        vector<d_complex> tab_xy(numberOfAtoms);
        vector<d_complex> tab_qxyz(numberOfAtoms);
@@ -232,15 +318,15 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        for (int i = 0; (i < numberOfAtoms); i++) {
            float* pos = posq+4*i;
            for (int m = 0; (m < 3); m++)
-              EIR(0, i, m) = d_complex(1,0);
+                EIR(0, i, m) = d_complex(1,0);

            for (int m=0; (m<3); m++)
-              EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
-                                       sin(pos[m]*recipBoxSize[m]));
+                EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
+                                         sin(pos[m]*recipBoxSize[m]));

            for (int j=2; (j<kmax); j++)
-              for (int m=0; (m<3); m++)
-                EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
+                for (int m=0; (m<3); m++)
+                    EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
        }

        // calculate reciprocal space energy and forces
@@ -254,11 +340,11 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
                float ky = ry * recipBoxSize[1];
                if (ry >= 0) {
                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
+                        tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
                }
                else {
                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
+                        tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
                }
                for (int rz = lowrz; rz < numRz; rz++) {
                    if (rz >= 0) {
@@ -301,13 +387,14 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c


 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                                           const vector<float>& C6params, const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
    this->posq = posq;
    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
+    this->C6params = &C6params[0];
    this->exclusions = &exclusions[0];
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
@@ -319,7 +406,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    // Signal the threads to start running and wait for them to finish.
    
    ComputeDirectTask task(*this);
-    threads.execute(task);
+    threads.execute(task); // ACS calls threadcomputedirect
    threads.waitForThreads();
    
    // Signal the threads to subtract the exclusions.
@@ -350,9 +437,8 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    float* forces = &(*threadForce)[threadIndex][0];
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (ewald || pme) {
+    if (ewald || pme || ljpme) {
        // Compute the interactions from the neighbor list.
-
        while (true) {
            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
            if (nextBlock >= neighborList->getNumBlocks())
@@ -370,7 +456,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                break;
            int end = min(start+groupSize, numberOfAtoms);
            for (int i = start; i < end; i++) {
-               fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
+                fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
                float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
                for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
                    if (*iter > i) {
@@ -394,7 +480,18 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                                threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
                        }
                        else if (includeEnergy)
-                           threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                            threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                        if (ljpme) {
+                            float C6ij = C6params[i]*C6params[j];
+                            float inverseR2 = 1.0f/r2;
+                            float emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                            if(includeEnergy)
+                                threadEnergy[threadIndex] += emult;
+                            float dEdR = -6.0f*C6ij*inverseR2*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                            fvec4 result = deltaR*dEdR;
+                            (fvec4(forces+4*i)-result).store(forces+4*i);
+                            (fvec4(forces+4*j)+result).store(forces+4*j);
+                        }
                    }
                }
            }
@@ -444,7 +541,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    }
    float sig       = atomParameters[ii].first + atomParameters[jj].first;
    float sig2      = inverseR*sig;
-          sig2     *= sig2;
+    sig2     *= sig2;
    float sig6      = sig2*sig2*sig2;

    float eps       = atomParameters[ii].second*atomParameters[jj].second;
@@ -476,7 +573,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    fvec4 result = deltaR*dEdR;
    (fvec4(forces+4*ii)+result).store(forces+4*ii);
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
-  }
+}

 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
@@ -502,3 +599,18 @@ float CpuNonbondedForce::erfcApprox(float x) {
    return coeff1*erfcTable[index] + coeff2*erfcTable[index+1];
 }

+float CpuNonbondedForce::exptermsApprox(float x) {
+    float scaled = x*exptermsDXInv; // x measured in table spacings
+    int lower = min((int) floor(scaled), NUM_TABLE_POINTS); // never start past entry NUM_TABLE_POINTS
+    float upperWeight = scaled-lower;
+    float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*exptermsTable[lower] + upperWeight*exptermsTable[lower+1]; // linear interpolation
+}
+
+float CpuNonbondedForce::dExptermsApprox(float x) {
+    float scaled = x*exptermsDXInv; // x measured in table spacings
+    int lower = min((int) floor(scaled), NUM_TABLE_POINTS); // never start past entry NUM_TABLE_POINTS
+    float upperWeight = scaled-lower;
+    float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*dExptermsTable[lower] + upperWeight*dExptermsTable[lower+1]; // linear interpolation
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -25,6 +25,7 @@
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec4.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -213,7 +214,6 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,

 void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.
-    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -263,7 +263,6 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
 template <int PERIODIC_TYPE>
 void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
-    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
@@ -278,9 +277,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    
+
    // Loop over neighbors for this block.
    
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
@@ -318,7 +318,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec4 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec4 eps = blockAtomEpsilon*atomEpsilon;
+            fvec4 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -328,6 +329,17 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+
+            if (ljpme) {
+                fvec4 C6ij = C6s*C6params[atom];
+                fvec4 inverseR2 = inverseR*inverseR;
+                fvec4 mysig2 = sig*sig;
+                fvec4 mysig6 = mysig2*mysig2*mysig2;
+                fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
        }
        else {
            energy = 0.0f;
@@ -362,7 +374,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    }
    
    // Record the forces on the block atoms.
-    
+
    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
    transpose(f[0], f[1], f[2], f[3]);
    for (int j = 0; j < 4; j++)
@@ -420,3 +432,30 @@ fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
    transpose(t1, t2, t3, t4);
    return coeff1*t1 + coeff2*t2;
 }
+
+fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
+    fvec4 scaled = r*exptermsDXInv; // distances scaled by the inverse table spacing
+    ivec4 slot = min(floor(scaled), NUM_TABLE_POINTS); // table slot for each of the four distances
+    fvec4 row0(&exptermsTable[slot[0]]);
+    fvec4 row1(&exptermsTable[slot[1]]);
+    fvec4 row2(&exptermsTable[slot[2]]);
+    fvec4 row3(&exptermsTable[slot[3]]);
+    transpose(row0, row1, row2, row3); // afterwards row0 and row1 serve as the lower and upper entries
+    fvec4 upperWeight = scaled-slot;
+    fvec4 lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*row0 + upperWeight*row1;
+}
+
+fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
+    fvec4 scaled = r*exptermsDXInv; // distances scaled by the inverse table spacing
+    ivec4 slot = min(floor(scaled), NUM_TABLE_POINTS); // table slot for each of the four distances
+    fvec4 row0(&dExptermsTable[slot[0]]);
+    fvec4 row1(&dExptermsTable[slot[1]]);
+    fvec4 row2(&dExptermsTable[slot[2]]);
+    fvec4 row3(&dExptermsTable[slot[3]]);
+    transpose(row0, row1, row2, row3); // afterwards row0 and row1 serve as the lower and upper entries
+    fvec4 upperWeight = scaled-slot;
+    fvec4 lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*row0 + upperWeight*row1;
+}
+
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -27,6 +27,7 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/hardware.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -80,8 +81,7 @@ CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
 enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};

 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Determine whether we need to apply periodic boundary conditions.
-    
+    // Determine whether we need to apply periodic boundary conditions.    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -308,6 +308,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    fvec8 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]], C6params[blockAtom[4]], C6params[blockAtom[5]], C6params[blockAtom[6]], C6params[blockAtom[7]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
@@ -348,7 +349,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec8 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec8 sig6 = sig2*sig2*sig2;
-            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec8 eps = blockAtomEpsilon*atomEpsilon;
+            fvec8 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -358,6 +360,17 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+            if (ljpme) {
+                fvec8 C6ij = C6s*C6params[atom];
+                fvec8 inverseR2 = inverseR*inverseR;
+                fvec8 mysig2 = sig*sig;
+                fvec8 mysig6 = mysig2*mysig2*mysig2;
+                fvec8 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec8 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
+
        }
        else {
            energy = 0.0f;
@@ -464,4 +477,45 @@ fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
    return coeff1*s1 + coeff2*s2;
 }
+
+fvec8 CpuNonbondedForceVec8::exptermsApprox(const fvec8& r) {
+    fvec8 scaled = r*exptermsDXInv; // distances scaled by the inverse table spacing
+    ivec8 slot = min(floor(scaled), NUM_TABLE_POINTS); // table slot for each of the eight distances
+    ivec4 slotLo = slot.lowerVec();
+    ivec4 slotHi = slot.upperVec();
+    fvec4 row0(&exptermsTable[slotLo[0]]);
+    fvec4 row1(&exptermsTable[slotLo[1]]);
+    fvec4 row2(&exptermsTable[slotLo[2]]);
+    fvec4 row3(&exptermsTable[slotLo[3]]);
+    fvec4 row4(&exptermsTable[slotHi[0]]);
+    fvec4 row5(&exptermsTable[slotHi[1]]);
+    fvec4 row6(&exptermsTable[slotHi[2]]);
+    fvec4 row7(&exptermsTable[slotHi[3]]);
+    fvec8 entry0, entry1, entry2, entry3;
+    transpose(row0, row1, row2, row3, row4, row5, row6, row7, entry0, entry1, entry2, entry3);
+    fvec8 upperWeight = scaled-slot;
+    fvec8 lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*entry0 + upperWeight*entry1; // only the first two transposed vectors are blended
+}
+
+fvec8 CpuNonbondedForceVec8::dExptermsApprox(const fvec8& r) {
+    fvec8 scaled = r*exptermsDXInv; // distances scaled by the inverse table spacing
+    ivec8 slot = min(floor(scaled), NUM_TABLE_POINTS); // table slot for each of the eight distances
+    ivec4 slotLo = slot.lowerVec();
+    ivec4 slotHi = slot.upperVec();
+    fvec4 row0(&dExptermsTable[slotLo[0]]);
+    fvec4 row1(&dExptermsTable[slotLo[1]]);
+    fvec4 row2(&dExptermsTable[slotLo[2]]);
+    fvec4 row3(&dExptermsTable[slotLo[3]]);
+    fvec4 row4(&dExptermsTable[slotHi[0]]);
+    fvec4 row5(&dExptermsTable[slotHi[1]]);
+    fvec4 row6(&dExptermsTable[slotHi[2]]);
+    fvec4 row7(&dExptermsTable[slotHi[3]]);
+    fvec8 entry0, entry1, entry2, entry3;
+    transpose(row0, row1, row2, row3, row4, row5, row6, row7, entry0, entry1, entry2, entry3);
+    fvec8 upperWeight = scaled-slot;
+    fvec8 lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*entry0 + upperWeight*entry1; // only the first two transposed vectors are blended
+}
+
 #endif
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -598,8 +598,10 @@ private:
 class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
-            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), C6s(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
+            directDispersionPmeGrid(NULL), reciprocalDispersionPmeGrid(NULL),
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeAtomDispersionGridIndex(NULL),
+            pmeEnergyBuffer(NULL), dispersionPmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), dispersionPmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -636,6 +638,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   receives the separation parameter
+     * @param nx      receives the number of grid points along the X axis
+     * @param ny      receives the number of grid points along the Y axis
+     * @param nz      receives the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class SortTrait : public CudaSort::SortTrait {
        int getDataSize() const {return 8;}
@@ -655,38 +666,55 @@ private:
    CudaContext& cu;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
+    CudaArray* C6s;
    CudaArray* exceptionParams;
    CudaArray* cosSinSums;
    CudaArray* directPmeGrid;
    CudaArray* reciprocalPmeGrid;
+    CudaArray* directDispersionPmeGrid;
+    CudaArray* reciprocalDispersionPmeGrid;
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
+    CudaArray* pmeAtomDispersionGridIndex;
    CudaArray* pmeEnergyBuffer;
+    CudaArray* dispersionPmeEnergyBuffer;
    CudaSort* sort;
    Kernel cpuPme;
+    Kernel cpuDispersionPme;
    PmeIO* pmeio;
-    CUstream pmeStream;
-    CUevent pmeSyncEvent;
+    PmeIO* dispersionPmeio;
+    CUstream pmeStream, dispersionPmeStream;
+    CUevent pmeSyncEvent, dispersionPmeSyncEvent;
    CudaFFT3D* fft;
    cufftHandle fftForward;
    cufftHandle fftBackward;
+    CudaFFT3D* dispersionFft;
+    cufftHandle dispersionFftForward;
+    cufftHandle dispersionFftBackward;
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
+    CUfunction pmeDispersionGridIndexKernel;
    CUfunction pmeSpreadChargeKernel;
+    CUfunction pmeDispersionSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
+    CUfunction pmeDispersionFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
+    CUfunction pmeEvalDispersionEnergyKernel;
    CUfunction pmeConvolutionKernel;
+    CUfunction pmeDispersionConvolutionKernel;
    CUfunction pmeInterpolateForceKernel;
+    CUfunction pmeInterpolateDispersionForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
-    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    double ewaldSelfEnergy, dispersionSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int interpolateForceThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -439,6 +439,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   receives the separation parameter
+     * @param nx      receives the number of grid points along the X axis
+     * @param ny      receives the number of grid points along the Y axis
+     * @param nz      receives the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class Task;
    CudaPlatform::PlatformData& data;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1593,14 +1593,20 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
    cu.setAsCurrent();
    if (sigmaEpsilon != NULL)
        delete sigmaEpsilon;
+    if (C6s != NULL)
+        delete C6s;
    if (exceptionParams != NULL)
        delete exceptionParams;
    if (cosSinSums != NULL)
        delete cosSinSums;
    if (directPmeGrid != NULL)
        delete directPmeGrid;
+    if (directDispersionPmeGrid != NULL)
+        delete directDispersionPmeGrid;
    if (reciprocalPmeGrid != NULL)
        delete reciprocalPmeGrid;
+    if (reciprocalDispersionPmeGrid != NULL)
+        delete reciprocalDispersionPmeGrid;
    if (pmeBsplineModuliX != NULL)
        delete pmeBsplineModuliX;
    if (pmeBsplineModuliY != NULL)
@@ -1611,14 +1617,20 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeAtomRange;
    if (pmeAtomGridIndex != NULL)
        delete pmeAtomGridIndex;
+    if (pmeAtomDispersionGridIndex != NULL)
+        delete pmeAtomDispersionGridIndex;
    if (pmeEnergyBuffer != NULL)
        delete pmeEnergyBuffer;
+    if (dispersionPmeEnergyBuffer != NULL)
+        delete dispersionPmeEnergyBuffer;
    if (sort != NULL)
        delete sort;
    if (fft != NULL)
        delete fft;
    if (pmeio != NULL)
        delete pmeio;
+    if (dispersionPmeio != NULL)
+        delete dispersionPmeio;
    if (hasInitializedFFT) {
        if (useCudaFFT) {
            cufftDestroy(fftForward);
@@ -1627,6 +1639,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        if (usePmeStream) {
            cuStreamDestroy(pmeStream);
            cuEventDestroy(pmeSyncEvent);
+            if(doLJPME){
+                cuStreamDestroy(dispersionPmeStream);
+                cuEventDestroy(dispersionPmeSyncEvent);
+            }
        }
    }
 }
@@ -1634,6 +1650,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
 void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
    cu.setAsCurrent();

+    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
+
    // Identify which exceptions are 1-4 interactions.

    vector<pair<int, int> > exclusions;
@@ -1650,14 +1668,29 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    // Initialize nonbonded interactions.

    int numParticles = force.getNumParticles();
+    // When LJPME is being used, the per-particle C6 coefficients are kept in their own array
+    // (C6s), allocated below next to sigmaEpsilon; no C6 storage is created otherwise.
+    doLJPME = nonbondedMethod == LJPME;
    sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
+    if(doLJPME){
+        if (cu.getUseDoublePrecision())
+            C6s = CudaArray::create<double>(cu, cu.getPaddedNumAtoms(), "C6s");
+        else
+            C6s = CudaArray::create<float>(cu, cu.getPaddedNumAtoms(), "C6s");
+    }
    CudaArray& posq = cu.getPosq();
    vector<double4> temp(posq.getSize());
    float4* posqf = (float4*) &temp[0];
    double4* posqd = (double4*) &temp[0];
+    // The C6 coefficients for LJPME could be computed from sigma and epsilon, but it
+    // seems like a good idea to cache them and avoid many recomputations.
+    vector<double> tmpc6(posq.getSize());
+    float* c6f = (float*) &tmpc6[0];
+    double* c6d = (double*) &tmpc6[0];
    vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
    vector<vector<int> > exclusionList(numParticles);
    double sumSquaredCharges = 0.0;
+    double sumSquaredC6 = 0.0;
    hasCoulomb = false;
    hasLJ = false;
    for (int i = 0; i < numParticles; i++) {
@@ -1667,7 +1700,17 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            posqd[i] = make_double4(0, 0, 0, charge);
        else
            posqf[i] = make_float4(0, 0, 0, (float) charge);
-        sigmaEpsilonVector[i] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
+        double sig = (float) (0.5*sigma);
+        double eps = (float) (2.0*sqrt(epsilon));
+        sigmaEpsilonVector[i] = make_float2(sig, eps);
+        if(doLJPME){
+            float C6 = (float) 8.0*pow(sig, 3) * eps;
+            sumSquaredC6 += C6*C6;
+            if (cu.getUseDoublePrecision())
+                c6d[i] = 8.0*pow(sig,3)*eps;
+            else
+                c6f[i] = 8.0f*pow((float)sig,3)*eps;
+        }
        exclusionList[i].push_back(i);
        sumSquaredCharges += charge*charge;
        if (charge != 0.0)
@@ -1681,7 +1724,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    }
    posq.upload(&temp[0]);
    sigmaEpsilon->upload(sigmaEpsilonVector);
-    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
+    if(doLJPME)
+        C6s->upload(&tmpc6[0]);
    bool useCutoff = (nonbondedMethod != NoCutoff);
    bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
    map<string, string> defines;
@@ -1705,12 +1749,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
        }
    }
-    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0)
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
    ewaldSelfEnergy = 0.0;
+    dispersionSelfEnergy = 0.0;
    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.

@@ -1740,19 +1785,39 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
        }
    }
-    else if (nonbondedMethod == PME) {
+    else if (nonbondedMethod == PME || nonbondedMethod == LJPME) {
        // Compute the PME parameters.
-
-        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
+        //
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
        gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
        gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
        gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
+        if(doLJPME){
+            NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
+                                                  dispersionGridSizeY, dispersionGridSizeZ, true);
+            dispersionGridSizeX = CudaFFT3D::findLegalDimension(dispersionGridSizeX);
+            dispersionGridSizeY = CudaFFT3D::findLegalDimension(dispersionGridSizeY);
+            dispersionGridSizeZ = CudaFFT3D::findLegalDimension(dispersionGridSizeZ);
+        }

        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+        if(doLJPME) {
+            defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            double invRCut6 = pow(force.getCutoffDistance(), -6);
+            double dalphaR  = dispersionAlpha * force.getCutoffDistance();
+            double dar2     = dalphaR*dalphaR;
+            double dar4     = dar2*dar2;
+            double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
+            defines["INVCUT6"] = cu.doubleToString(invRCut6);
+            defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
+        }
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
+        defines["DO_LJPME"] = doLJPME ? "1" : "0";
        if (cu.getContextIndex() == 0) {
            ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+            if(doLJPME) dispersionSelfEnergy = pow(dispersionAlpha, 6) * sumSquaredC6 / 12.0;
+
            char deviceName[100];
            cuDeviceGetName(deviceName, 100, cu.getDevice());
            usePmeStream = (!cu.getPlatformData().disablePmeStream && string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
@@ -1771,7 +1836,19 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeDefines["USE_PME_STREAM"] = "1";
            if (cu.getPlatformData().deterministicForces)
                pmeDefines["USE_DETERMINISTIC_FORCES"] = "1";
-            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+            if (doLJPME){
+                pmeDefines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+                pmeDefines["DISPERSION_GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
+                pmeDefines["DISPERSION_GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
+                pmeDefines["DISPERSION_GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
+                pmeDefines["RECIP_DISPERSION_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
+            }
+            CUmodule module;
+            if(doLJPME){
+                module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme+CudaKernelSources::ljpme, pmeDefines);
+            }else{
+                module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+            }
            if (cu.getPlatformData().useCpuPme) {
                // Create the CPU PME kernel.

@@ -1782,6 +1859,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    pmeio = new PmeIO(cu, addForcesKernel);
                    cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
                    cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+                    if(doLJPME){
+                        cpuDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
+                        cpuDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSizeX, dispersionGridSizeY,
+                                                     dispersionGridSizeZ, numParticles, dispersionAlpha);
+                        cu.addPreComputation(new PmePreComputation(cu, cpuDispersionPme, *pmeio));
+                        cu.addPostComputation(new PmePostComputation(cpuDispersionPme, *pmeio));
+                    }
                }
                catch (OpenMMException& ex) {
                    // The CPU PME plugin isn't available.
@@ -1796,6 +1880,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
                cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
                cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
+                if(doLJPME){
+                    pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadC6");
+                    pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomDispersionGridIndex");
+                    pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadC6");
+                    pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalDispersionConvolution");
+                    pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateDispersionEnergy");
+                    pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateDispersionForce");
+                    cuFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
+                }

                // Create required data structures.

@@ -1803,14 +1896,28 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
                reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid");
                cu.addAutoclearBuffer(*directPmeGrid);
+                if(doLJPME){
+                    directDispersionPmeGrid = new CudaArray(cu, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ,
+                                            cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalDispersionPmeGrid");
+                    cu.addAutoclearBuffer(*directDispersionPmeGrid);
+                    reciprocalDispersionPmeGrid = new CudaArray(cu, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ,
+                                                                2*elementSize, "reciprocalDispersionPmeGrid");
+                }
                pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
                pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
                pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
                pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
+                if(doLJPME)
+                    pmeAtomDispersionGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomDispersionGridIndex");
                int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
                pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
                cu.clearBuffer(*pmeEnergyBuffer);
+                if(doLJPME){
+                    dispersionPmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize,
+                                                              "dispersionPmeEnergyBuffer");
+                    cu.clearBuffer(*dispersionPmeEnergyBuffer);
+                }
                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
                int cufftVersion;
                cufftGetVersion(&cufftVersion);
@@ -1822,12 +1929,24 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
                    if (result != CUFFT_SUCCESS)
                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+                    if(doLJPME){
+                        result = cufftPlan3d(&dispersionFftForward, dispersionGridSizeX, dispersionGridSizeY, 
+                                                dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
+                        if (result != CUFFT_SUCCESS)
+                            throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
+                        result = cufftPlan3d(&dispersionFftBackward, dispersionGridSizeX, dispersionGridSizeY,
+                                             dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
+                        if (result != CUFFT_SUCCESS)
+                            throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
+                    }
                }
-                else
+                else {
                    fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
-                
+                    dispersionFft = new CudaFFT3D(cu, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
+                }
+
                // Prepare for doing PME on its own stream.
-                
+
                if (usePmeStream) {
                    cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
                    if (useCudaFFT) {
@@ -1840,9 +1959,21 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                        recipForceGroup = force.getForceGroup();
                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
+                    if(doLJPME){
+                        cuStreamCreate(&dispersionPmeStream, CU_STREAM_NON_BLOCKING);
+                        // Dispersion terms use yet another stream.
+                        if (useCudaFFT) {
+                            cufftSetStream(dispersionFftForward, dispersionPmeStream);
+                            cufftSetStream(dispersionFftBackward, dispersionPmeStream);
+                        }
+                        CHECK_RESULT(cuEventCreate(&dispersionPmeSyncEvent, CU_EVENT_DISABLE_TIMING),
+                                                   "Error creating event for Dispersion term of NonbondedForce");
+                        // The force group is the same as the electrostatic PME reciprocal force group.
+                        cu.addPreComputation(new SyncStreamPreComputation(cu, dispersionPmeStream, dispersionPmeSyncEvent, recipForceGroup));
+                        cu.addPostComputation(new SyncStreamPostComputation(cu, dispersionPmeSyncEvent, cu.getKernel(module, "addEnergy"), *dispersionPmeEnergyBuffer, recipForceGroup));
+                    }
                }
                hasInitializedFFT = true;
-
                // Initialize the b-spline moduli.

                int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
@@ -1916,13 +2047,22 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            }
        }
    }
-
    // Add the interaction to the default nonbonded kernel.
-   
-    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
-    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
-    if (hasLJ)
-        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
+
+   string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
+   cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
+   if (hasLJ){
+       cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2,
+                                               sizeof(float2), sigmaEpsilon->getDevicePointer()));
+       if(doLJPME){
+            if (cu.getUseDoublePrecision())
+                cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("C6s", "double", 1,
+                                                        sizeof(double), C6s->getDevicePointer()));
+            else
+                cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("C6s", "float", 1,
+                                                        sizeof(float), C6s->getDevicePointer()));
+       }
+   }

    // Initialize the exceptions.

@@ -1959,9 +2099,9 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
    if (directPmeGrid != NULL && includeReciprocal) {
        if (usePmeStream)
            cu.setCurrentStream(pmeStream);
-        
+
        // Invert the periodic box vectors.
-        
+
        Vec3 boxVectors[3];
        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
        double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
@@ -1985,7 +2125,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
            recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
        }
-        
+
        // Execute the reciprocal space kernels.

        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
@@ -2045,8 +2185,80 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            cuEventRecord(pmeSyncEvent, pmeStream);
            cu.restoreDefaultStream();
        }
+
+        // As written, we check only the Electrostatic grid pointer to get here.  We could separate them out, but for
+        // now we assume that LJPME can only be used if electrostatic PME is also active.
+        if(doLJPME){
+            if (usePmeStream)
+                cu.setCurrentStream(dispersionPmeStream);
+
+            void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomDispersionGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
+
+            sort->sort(*pmeAtomDispersionGridIndex);
+
+            void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directDispersionPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomDispersionGridIndex->getDevicePointer(),
+                    &C6s->getDevicePointer()};
+            cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
+
+            if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
+                void* finishSpreadArgs[] = {&directDispersionPmeGrid->getDevicePointer()};
+                cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, directDispersionPmeGrid->getSize(), 256);
+            }
+
+
+            if (useCudaFFT) {
+                if (cu.getUseDoublePrecision())
+                    cufftExecD2Z(dispersionFftForward, (double*) directDispersionPmeGrid->getDevicePointer(), (double2*) reciprocalDispersionPmeGrid->getDevicePointer());
+                else
+                    cufftExecR2C(dispersionFftForward, (float*) directDispersionPmeGrid->getDevicePointer(), (float2*) reciprocalDispersionPmeGrid->getDevicePointer());
+            }
+            else {
+                fft->execFFT(*directDispersionPmeGrid, *reciprocalDispersionPmeGrid, true);
+            }
+
+            if (includeEnergy) {
+                void* computeEnergyArgs[] = {&reciprocalDispersionPmeGrid->getDevicePointer(), usePmeStream ? &dispersionPmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
+                        &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
+                        cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
+                        &C6s->getDevicePointer()};
+                cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
+            }
+
+            void* convolutionArgs[] = {&reciprocalDispersionPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
+                    &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
+                    cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
+
+            if (useCudaFFT) {
+                if (cu.getUseDoublePrecision())
+                    cufftExecZ2D(dispersionFftBackward, (double2*) reciprocalDispersionPmeGrid->getDevicePointer(), (double*) directDispersionPmeGrid->getDevicePointer());
+                else
+                    cufftExecC2R(dispersionFftBackward, (float2*) reciprocalDispersionPmeGrid->getDevicePointer(), (float*)  directDispersionPmeGrid->getDevicePointer());
+            }
+            else {
+                fft->execFFT(*reciprocalDispersionPmeGrid, *directDispersionPmeGrid, false);
+            }
+
+            void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directDispersionPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomDispersionGridIndex->getDevicePointer(),
+                    &C6s->getDevicePointer()};
+            cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
+            if (usePmeStream) {
+                cuEventRecord(dispersionPmeSyncEvent, dispersionPmeStream);
+                cu.restoreDefaultStream();
+            }
+        }
+
    }
+
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
+    if(includeReciprocal) energy += dispersionSelfEnergy;
    if (dispersionCoefficient != 0.0 && includeDirect) {
        double4 boxSize = cu.getPeriodicBoxSize();
        energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
@@ -2092,7 +2304,9 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    float4* posqf = (float4*) cu.getPinnedBuffer();
    double4* posqd = (double4*) cu.getPinnedBuffer();
    vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
+    vector<float3> sigmaEpsilonC6Vector(cu.getPaddedNumAtoms(), make_float3(0, 0, 0));
    double sumSquaredCharges = 0.0;
+    double sumSquaredC6 = 0.0;
    const vector<int>& order = cu.getAtomIndex();
    for (int i = 0; i < force.getNumParticles(); i++) {
        int index = order[i];
@@ -2102,11 +2316,23 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
            posqd[i].w = charge;
        else
            posqf[i].w = (float) charge;
-        sigmaEpsilonVector[index] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
+        float sig = (float) (0.5*sigma);
+        float eps = (float) (2.0*sqrt(epsilon));
+        if(doLJPME){
+            float C6 = (float) 8.0*pow(sig, 3) * eps;
+            sumSquaredC6 += C6*C6;
+            sigmaEpsilonC6Vector[index] = make_float3(sig, eps, C6);
+        }else{
+            sigmaEpsilonVector[index] = make_float2(sig, eps);
+        }
        sumSquaredCharges += charge*charge;
    }
    posq.upload(cu.getPinnedBuffer());
-    sigmaEpsilon->upload(sigmaEpsilonVector);
+    if(doLJPME){
+        sigmaEpsilon->upload(sigmaEpsilonC6Vector);
+    }else{
+        sigmaEpsilon->upload(sigmaEpsilonVector);
+    }
    
    // Record the exceptions.
    
@@ -2125,6 +2351,8 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    
    if (nonbondedMethod == Ewald || nonbondedMethod == PME)
        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+    if (nonbondedMethod == LJPME)
+        dispersionSelfEnergy = (cu.getContextIndex() == 0 ?  pow(dispersionAlpha, 6) * sumSquaredC6 / 12.0 : 0);
    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
    cu.invalidateMolecules();
@@ -2143,6 +2371,20 @@ void CudaCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
    }
 }

+// Reports the separation parameter and grid dimensions used for the LJPME dispersion term.
+// Throws OpenMMException if LJPME is not in use, or if the CPU PME path is enabled (it
+// cannot report dispersion parameters yet).
+void CudaCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (!doLJPME)
+        throw OpenMMException("getLJPMEParametersInContext: This Context is not using LJPME");
+    if (cu.getPlatformData().useCpuPme)
+        throw OpenMMException("getLJPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
+    alpha = dispersionAlpha;
+    nx = dispersionGridSizeX;
+    ny = dispersionGridSizeY;
+    nz = dispersionGridSizeZ;
+}
+
 class CudaCustomNonbondedForceInfo : public CudaForceInfo {
 public:
    CudaCustomNonbondedForceInfo(const CustomNonbondedForce& force) : force(force) {

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -628,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
 }

+// Delegates the LJPME parameter query to the CudaCalcNonbondedForceKernel held in kernels[0].
+void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    const CudaCalcNonbondedForceKernel& delegate = dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl());
+    delegate.getLJPMEParameters(alpha, nx, ny, nz);
+}
+
 class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
 public:
    Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
        deviceName << name;
    }
+    size_t printfsize;
+    cuCtxGetLimit(&printfsize, CU_LIMIT_PRINTF_FIFO_SIZE);
+    cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 10*printfsize);
+
    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    disablePmeStream = (pmeStreamProperty == "true");
    deterministicForces = (deterministicForcesProperty == "true");

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
@@ -17,6 +17,25 @@
    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
 #endif
    real tempForce = 0.0f;
+#if HAS_LENNARD_JONES
+        // The multiplicative term to correct for the multiplicative terms that are always
+        // present in reciprocal space.  The real terms have an additive contribution
+        // added in, but for excluded terms the multiplicative term is just subtracted.
+        // These factors are needed in both clauses of the needCorrection statement, so
+        // I declare them up here.
+        #if DO_LJPME
+            const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+            const real dar2 = dispersionAlphaR*dispersionAlphaR;
+            const real dar4 = dar2*dar2;
+            const real dar6 = dar4*dar2;
+            const real invR2 = invR*invR;
+            const real expDar2 = EXP(-dar2);
+            const real c6 = C6s1*C6s2;
+            const real coef = invR2*invR2*invR2*c6;
+            const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+            const real dprefac = eprefac + dar6/6.0f;
+        #endif
+#endif
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

@@ -29,6 +48,13 @@
            includeInteraction = false;
            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
        }
+#if HAS_LENNARD_JONES
+        #if DO_LJPME
+            // The multiplicative grid term
+            tempEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        #endif
+#endif
    }
    else {
 #if HAS_LENNARD_JONES
@@ -36,7 +62,8 @@
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
        #if USE_LJ_SWITCH
@@ -48,6 +75,22 @@
            ljEnergy *= switchValue;
        }
        #endif
+        #if DO_LJPME
+            // The multiplicative grid term
+            ljEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+            // The potential shift accounts for the step at the cutoff introduced by the
+            // transition from additive to multiplicative combination rules and is only
+            // needed for the real (not excluded) terms.  By adding these terms to ljEnergy
+            // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+            sig2 = sig*sig;
+            sig6 = sig2*sig2*sig2*INVCUT6;
+            epssig6 = eps*sig6;
+            // The additive part of the potential shift
+            ljEnergy += epssig6*(1.0f - sig6);
+            // The multiplicative part of the potential shift
+            ljEnergy += MULTSHIFT6*c6;
+        #endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
 #else

--- a/platforms/cuda/src/kernels/ljpme.cu
+++ b/platforms/cuda/src/kernels/ljpme.cu
+/**
+ * For each atom, determine which cell of the dispersion PME grid it falls in.
+ *
+ * The entry written for atom i is the pair (i, flattened cell index), where the flattened
+ * index has z varying fastest, then y, then x.
+ *
+ * The periodic box parameters are not referenced by name in this body; presumably they are
+ * consumed by the APPLY_PERIODIC_TO_POS macro -- TODO confirm against its definition.
+ */
+extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
+            real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+            real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+    const unsigned int numThreads = blockDim.x*gridDim.x;
+    int atom = blockIdx.x*blockDim.x+threadIdx.x;
+    while (atom < NUM_ATOMS) {
+        real4 atomPos = posq[atom];
+        APPLY_PERIODIC_TO_POS(atomPos)
+
+        // Project the position onto the reciprocal box vectors.
+        const real fracX = atomPos.x*recipBoxVecX.x+atomPos.y*recipBoxVecY.x+atomPos.z*recipBoxVecZ.x;
+        const real fracY = atomPos.y*recipBoxVecY.y+atomPos.z*recipBoxVecZ.y;
+        const real fracZ = atomPos.z*recipBoxVecZ.z;
+
+        // Keep only the fractional part and scale it to grid units.
+        const real gridX = (fracX-floor(fracX))*DISPERSION_GRID_SIZE_X;
+        const real gridY = (fracY-floor(fracY))*DISPERSION_GRID_SIZE_Y;
+        const real gridZ = (fracZ-floor(fracZ))*DISPERSION_GRID_SIZE_Z;
+
+        // Truncate to the containing cell; the modulo guards against landing exactly on the grid size.
+        const int cellX = ((int) gridX) % DISPERSION_GRID_SIZE_X;
+        const int cellY = ((int) gridY) % DISPERSION_GRID_SIZE_Y;
+        const int cellZ = ((int) gridZ) % DISPERSION_GRID_SIZE_Z;
+
+        const int flatCell = cellX*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z+cellY*DISPERSION_GRID_SIZE_Z+cellZ;
+        pmeAtomGridIndex[atom] = make_int2(atom, flatCell);
+        atom += numThreads;
+    }
+}
+
+/**
+ * Spread each atom's C6 coefficient onto the dispersion PME grid.
+ *
+ * For every atom, PME_ORDER interpolation weights are computed along each axis and the product
+ * C6s[atom]*wx*wy*wz is atomically accumulated into the PME_ORDER^3 grid points starting at the
+ * atom's cell, wrapping around at the grid edges.
+ *
+ * @param posq               atom positions (only x, y, z are read here)
+ * @param originalPmeGrid    grid being accumulated into; depending on the build it is treated as
+ *                           real values or as 64-bit fixed point values (see the #if chain below)
+ * @param recipBoxVecX       reciprocal box vector, used to convert positions to grid coordinates
+ * @param recipBoxVecY       reciprocal box vector, used to convert positions to grid coordinates
+ * @param recipBoxVecZ       reciprocal box vector, used to convert positions to grid coordinates
+ * @param pmeAtomGridIndex   per-atom (atom index, grid cell) pairs; only the atom index is read
+ * @param C6s                per-atom C6 coefficients
+ *
+ * The remaining box parameters are not referenced by name in this body; presumably they are
+ * consumed by the APPLY_PERIODIC_TO_POS macro -- TODO confirm against its definition.
+ */
+extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
+        const real* __restrict__ C6s) {
+    real3 data[PME_ORDER];
+    const real scale = RECIP(PME_ORDER-1);
+    
+    // Process the atoms in spatially sorted order.  This improves efficiency when writing
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
+        real4 pos = posq[atom];
+        APPLY_PERIODIC_TO_POS(pos)
+        // Project the position onto the reciprocal box vectors, keep the fractional part,
+        // and scale to grid units.
+        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
+                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
+                             pos.z*recipBoxVecZ.z);
+        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
+        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
+        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
+        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
+                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
+                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
+
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        
+        // dr is the offset of the atom within its grid cell along each axis.
+        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
+        // Start from the two-point (linear) weights and raise the order one step at a time,
+        // updating data[] in place from the highest index down.
+        data[PME_ORDER-1] = make_real3(0);
+        data[1] = dr;
+        data[0] = make_real3(1)-dr;
+        for (int j = 3; j < PME_ORDER; j++) {
+            real div = RECIP(j-1);
+            data[j-1] = div*dr*data[j-2];
+            for (int k = 1; k < (j-1); k++)
+                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
+            data[0] = div*(make_real3(1)-dr)*data[0];
+        }
+        // Final step of the same recursion, bringing the weights up to PME_ORDER points.
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
+        data[0] = scale*(make_real3(1)-dr)*data[0];
+        
+        // Spread the C6 coefficient of this atom onto each grid point.
+         
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndex.x+ix;
+            // Wrap around the periodic grid edge.
+            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
+            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+            real dx = data[ix].x;
+            
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndex.y+iy;
+                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
+                real dy = data[iy].y;
+                
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;
+
+                    // We need to grab the C6 coefficient from the array
+                    real add = C6s[atom]*dx*dy*data[iz].z;
+#ifdef USE_DOUBLE_PRECISION
+                    // Accumulate as 64-bit fixed point (value scaled by 2^32) with an integer atomic.
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
+                    // Same fixed point accumulation, but with permuted slots: even grid indices go to
+                    // slot index/2 and odd ones to slot (index+gridSize)/2.  finishSpreadC6 reads the
+                    // slots back with the matching mapping.
+                    // Note: the int gridIndex declared here shadows the int3 gridIndex above.
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    int gridIndex = index;
+                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z)/2);
+                    atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+#else
+                    // Accumulate directly into the real-valued grid.
+                    atomicAdd(&originalPmeGrid[index], add);
+#endif
+
+                }
+            }
+        }
+    }
+}
+
+
+/**
+ * Convert a grid of 64-bit fixed point values into real values, in place.
+ *
+ * Each fixed point value is multiplied by 2^-32 and stored into the same buffer reinterpreted
+ * as an array of real.  The slot layout read here matches the integer-atomic accumulation
+ * paths of gridSpreadC6.
+ */
+extern "C" __global__ void finishSpreadC6(long long* __restrict__ originalPmeGrid) {
+    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+    const real fixedToReal = 1.0f/(real) 0x100000000;
+    real* realGrid = (real*) originalPmeGrid;
+#ifdef USE_DOUBLE_PRECISION
+    // One fixed point slot per grid point, at the same index.
+    const unsigned int stride = blockDim.x*gridDim.x;
+    int point = blockIdx.x*blockDim.x+threadIdx.x;
+    while (point < gridSize) {
+        realGrid[point] = fixedToReal*originalPmeGrid[point];
+        point += stride;
+    }
+#else
+    // Each thread converts a pair of adjacent grid points.  The even point is stored in slot
+    // point/2 and the odd point in slot (point+gridSize)/2.
+    const unsigned int stride = 2*blockDim.x*gridDim.x;
+    int evenPoint = 2*(blockIdx.x*blockDim.x+threadIdx.x);
+    while (evenPoint < gridSize) {
+        const int oddPoint = evenPoint+1;
+        realGrid[evenPoint] = fixedToReal*originalPmeGrid[evenPoint/2];
+        if (oddPoint < gridSize)
+            realGrid[oddPoint] = fixedToReal*originalPmeGrid[(oddPoint+gridSize)/2];
+        evenPoint += stride;
+    }
+#endif
+}
+
+
+/**
+ * Convolve the transformed dispersion grid with the reciprocal space dispersion kernel.
+ *
+ * The grid is stored in half-complex form, of size NX*NY*(NZ/2+1), because F(Q) is conjugate
+ * symmetric.  Every stored element is multiplied in place by a real factor that depends on
+ * the wave vector and on the B-spline moduli.
+ *
+ * The grid dimensions used here are those of the dispersion grid (DISPERSION_GRID_SIZE_*),
+ * consistent with the other kernels in this file that build and read the same buffer.
+ *
+ * @param halfcomplex_pmeGrid   the half-complex grid, scaled in place
+ * @param energyBuffer          not used by this kernel
+ * @param pmeBsplineModuliX     B-spline moduli along X, indexed by kx
+ * @param pmeBsplineModuliY     B-spline moduli along Y, indexed by ky
+ * @param pmeBsplineModuliZ     B-spline moduli along Z, indexed by kz
+ * @param periodicBoxSize       box dimensions; their product is used as the volume
+ * @param recipBoxVecX          reciprocal box vector used to build the wave vector
+ * @param recipBoxVecY          reciprocal box vector used to build the wave vector
+ * @param recipBoxVecZ          reciprocal box vector used to build the wave vector
+ */
+extern "C" __global__ void 
+reciprocalDispersionConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer, 
+                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, 
+                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1);
+    const real scaleFactor =  -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+
+    const real alpha = EWALD_DISPERSION_ALPHA;
+    real bfac = M_PI / alpha;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = alpha*alpha*alpha;
+    real fac3 = -2*alpha*M_PI*M_PI;
+
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        // Decode the flat index into (kx, ky, kz) on the half-complex dispersion grid.
+        int kx = index/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1));
+        int remainder = index-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1);
+        int ky = remainder/(DISPERSION_GRID_SIZE_Z/2+1);
+        int kz = remainder-ky*(DISPERSION_GRID_SIZE_Z/2+1);
+        // Map indices in the upper half of each dimension to negative frequencies.
+        int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
+        int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
+        int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
+        real mhx = mx*recipBoxVecX.x;
+        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
+        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real denom = scaleFactor/(bx*by*bz);
+        real2 grid = halfcomplex_pmeGrid[index];
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+#if FAST_ERFC
+        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+        // error of 1.5e-7.  Stolen by ACS from the CUDA platform's AMOEBA plugin.
+        real t = 1.0f/(1.0f+0.3275911f*b);
+        real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
+#else
+        real erfcterm = ERFC(b);
+#endif
+        // The (0,0,0) element is scaled like every other one; no special case is made for it.
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+        halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
+    }
+}
+
+
+/**
+ * Sum the reciprocal space dispersion energy over the transformed dispersion grid.
+ *
+ * The loop visits every point of the full NX*NY*NZ grid.  Points whose z index lies beyond
+ * the stored half are read from the mirrored element of the half-complex grid.  Each thread
+ * writes half of its partial sum to its own slot of energyBuffer (overwriting it when
+ * USE_PME_STREAM is defined, adding to it otherwise).
+ */
+extern "C" __global__ void
+gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
+                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
+                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+    const unsigned int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+    const unsigned int numThreads = blockDim.x*gridDim.x;
+
+    // The loop covers the full grid; only the storage is half-complex.
+    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+    const int storedSizeZ = DISPERSION_GRID_SIZE_Z/2+1;
+
+    const real scaleFactor =  -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    const real alpha = EWALD_DISPERSION_ALPHA;
+    const real bfac = M_PI / alpha;
+    const real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    const real fac2 = alpha*alpha*alpha;
+    const real fac3 = -2*alpha*M_PI*M_PI;
+
+    mixed threadEnergy = 0;
+    for (int point = threadIndex; point < gridSize; point += numThreads) {
+        // Decode the flat index of the full grid into (kx, ky, kz).
+        const int kx = point/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z));
+        const int inPlane = point-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z);
+        const int ky = inPlane/(DISPERSION_GRID_SIZE_Z);
+        const int kz = inPlane-ky*(DISPERSION_GRID_SIZE_Z);
+
+        // Indices in the upper half of each dimension correspond to negative frequencies.
+        const int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
+        const int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
+        const int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
+        const real mhx = mx*recipBoxVecX.x;
+        const real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
+        const real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
+        const real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+
+        const real bx = pmeBsplineModuliX[kx];
+        const real by = pmeBsplineModuliY[ky];
+        const real bz = pmeBsplineModuliZ[kz];
+        const real denom = scaleFactor/(bx*by*bz);
+
+        const real m = SQRT(m2);
+        const real m3 = m*m2;
+        const real b = bfac*m;
+        const real expfac = -b*b;
+        const real expterm = EXP(expfac);
+#if FAST_ERFC
+        // Polynomial approximation for erfc from Abramowitz and Stegun (1964) p. 299, who cite
+        // C. Hastings, Jr., Approximations for Digital Computers (1955) as the original source.
+        // It has a maximum error of 1.5e-7.
+        const real u = 1.0f/(1.0f+0.3275911f*b);
+        const real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*u)*u)*u)*u)*u*expterm;
+#else
+        const real erfcterm = ERFC(b);
+#endif
+        const real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+
+        // Locate the element in the half-complex grid, mirroring points that are not stored.
+        int storedX = kx;
+        int storedY = ky;
+        int storedZ = kz;
+        if (kz >= storedSizeZ) {
+            storedX = (kx == 0) ? kx : DISPERSION_GRID_SIZE_X-kx;
+            storedY = (ky == 0) ? ky : DISPERSION_GRID_SIZE_Y-ky;
+            storedZ = DISPERSION_GRID_SIZE_Z-kz;
+        }
+        const int storedIndex = storedZ + storedY*storedSizeZ+storedX*(DISPERSION_GRID_SIZE_Y*storedSizeZ);
+        const real2 grid = halfcomplex_pmeGrid[storedIndex];
+
+        // N.B. We include the 0,0,0 point for dispersion
+        threadEnergy += eterm*(grid.x*grid.x + grid.y*grid.y);
+    }
+#ifdef USE_PME_STREAM
+    energyBuffer[threadIndex] = 0.5f*threadEnergy;
+#else
+    energyBuffer[threadIndex] += 0.5f*threadEnergy;
+#endif
+}
+
+
+/**
+ * Interpolate the dispersion PME grid at each atom's position and accumulate the resulting force.
+ *
+ * For every atom, the per-axis interpolation weights (data) and their differences (ddata) are
+ * computed, the grid is sampled over the PME_ORDER^3 points starting at the atom's cell, and the
+ * result, scaled by the atom's C6 coefficient, is atomically added to forceBuffers as 64-bit
+ * fixed point (value scaled by 2^32).
+ *
+ * @param posq               atom positions (only x, y, z are read here)
+ * @param forceBuffers       fixed point force accumulators; x, y and z components are stored
+ *                           PADDED_NUM_ATOMS entries apart
+ * @param originalPmeGrid    real-valued grid to interpolate from (presumably the result of the
+ *                           convolution after the inverse transform -- TODO confirm with the caller)
+ * @param recipBoxVecX       reciprocal box vector, used for grid coordinates and the force transform
+ * @param recipBoxVecY       reciprocal box vector, used for grid coordinates and the force transform
+ * @param recipBoxVecZ       reciprocal box vector, used for grid coordinates and the force transform
+ * @param pmeAtomGridIndex   per-atom (atom index, grid cell) pairs; only the atom index is read
+ * @param C6s                per-atom C6 coefficients
+ *
+ * The remaining box parameters are not referenced by name in this body; presumably they are
+ * consumed by the APPLY_PERIODIC_TO_POS macro -- TODO confirm against its definition.
+ */
+extern "C" __global__
+void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ C6s) {
+    real3 data[PME_ORDER];
+    real3 ddata[PME_ORDER];
+    const real scale = RECIP(PME_ORDER-1);
+    
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
+        real3 force = make_real3(0);
+        real4 pos = posq[atom];
+        APPLY_PERIODIC_TO_POS(pos)
+        // Project the position onto the reciprocal box vectors, keep the fractional part,
+        // and scale to grid units.
+        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
+                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
+                             pos.z*recipBoxVecZ.z);
+        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
+        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
+        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
+        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
+                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
+                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        
+        // dr is the offset of the atom within its grid cell along each axis.
+        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
+        // Start from the two-point (linear) weights and raise the order one step at a time,
+        // updating data[] in place from the highest index down.
+        data[PME_ORDER-1] = make_real3(0);
+        data[1] = dr;
+        data[0] = make_real3(1)-dr;
+        for (int j = 3; j < PME_ORDER; j++) {
+            real div = RECIP(j-1);
+            data[j-1] = div*dr*data[j-2];
+            for (int k = 1; k < (j-1); k++)
+                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
+            data[0] = div*(make_real3(1)-dr)*data[0];
+        }
+        // ddata must be formed here, from the weights one order below the final one, before the
+        // last recursion step overwrites data[].  It is used below in place of the weight along
+        // the axis being differentiated.
+        ddata[0] = -data[0];
+        for (int j = 1; j < PME_ORDER; j++)
+            ddata[j] = data[j-1]-data[j];
+        // Final step of the recursion, bringing the weights up to PME_ORDER points.
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
+        data[0] = scale*(make_real3(1)-dr)*data[0];
+
+        
+        // Compute the force on this atom.
+         
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndex.x+ix;
+            // Wrap around the periodic grid edge.
+            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
+            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+            real dx = data[ix].x;
+            real ddx = ddata[ix].x;
+            
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndex.y+iy;
+                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
+                real dy = data[iy].y;
+                real ddy = ddata[iy].y;
+                
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;
+                    real gridvalue = originalPmeGrid[index];
+                    // Each component uses the difference weight along its own axis and the
+                    // plain weights along the other two.
+                    force.x += ddx*dy*data[iz].z*gridvalue;
+                    force.y += dx*ddy*data[iz].z*gridvalue;
+                    force.z += dx*dy*ddata[iz].z*gridvalue;
+                }
+            }
+        }
+        // Despite its name, q is this atom's C6 coefficient.
+        real q = C6s[atom];
+        // Convert from grid coordinates to Cartesian components using the grid sizes and the
+        // reciprocal box vectors.
+        real forceX = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x);
+        real forceY = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y);
+        real forceZ = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+force.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z);
+        // Accumulate as 64-bit fixed point (value scaled by 2^32); x, y and z components are
+        // stored PADDED_NUM_ATOMS entries apart.
+        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (forceX*0x100000000)));
+        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceY*0x100000000)));
+        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceZ*0x100000000)));
+    }
+}
+