Merge pull request #1 from peastman/ljpme

Cleanup to LJ PME code

Merge pull request #1 from peastman/ljpme
Cleanup to LJ PME code
3b6925ae · Andy Simmonett · GitHub · 5a8a8aa9 · f7a102fb · 3b6925ae
Commit 3b6925ae authored Jan 26, 2017 by Andy Simmonett Committed by GitHub Jan 26, 2017
20 changed files
--- a/platforms/cpu/include/CpuCustomNonbondedForce.h
+++ b/platforms/cpu/include/CpuCustomNonbondedForce.h
@@ -129,6 +129,7 @@ private:
    bool useSwitch;
    bool periodic;
    bool triclinic;
+    bool useInteractionGroups;
    const CpuNeighborList* neighborList;
    float recipBoxSize[3];
    RealVec periodicBoxVectors[3];
@@ -183,8 +184,8 @@ public:
    Lepton::CompiledExpression forceExpression;
    std::vector<Lepton::CompiledExpression> energyParamDerivExpressions;
    CompiledExpressionSet expressionSet;
-    std::vector<int> particleParamIndex;
-    int rIndex;
+    std::vector<double> particleParam;
+    double r;
    std::vector<RealOpenMM> energyParamDerivs; 
 };


--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -258,20 +258,30 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class PmeIO;
    CpuPlatform::PlatformData& data;
    int numParticles, num14;
    int **bonded14IndexArray;
    double **bonded14ParamArray;
-    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldSelfEnergy, dispersionCoefficient;
-    int kmax[3], gridSize[3];
-    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme;
+    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient;
+    int kmax[3], gridSize[3], dispersionGridSize[3];
+    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme;
    std::vector<std::set<int> > exclusions;
    std::vector<std::pair<float, float> > particleParams;
+    std::vector<float> C6params;
    NonbondedMethod nonbondedMethod;
    CpuNonbondedForce* nonbonded;
-    Kernel optimizedPme;
+    Kernel optimizedPme, optimizedDispersionPme;
    CpuBondForce bondForce;
 };


--- a/platforms/cpu/include/CpuLangevinDynamics.h
+++ b/platforms/cpu/include/CpuLangevinDynamics.h

-/* Portions copyright (c) 2013-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2013-2016 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -43,12 +43,12 @@ public:
     *
     * @param numberOfAtoms  number of atoms
     * @param deltaT         delta t for dynamics
-     * @param tau            viscosity
+     * @param friction       friction coefficient
     * @param temperature    temperature
     * @param threads        thread pool for parallelizing computation
     * @param random         random number generator
     */
-    CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random);
+    CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM friction, RealOpenMM temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random);

    /**
     * Destructor.

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -114,6 +114,17 @@ class CpuNonbondedForce {

      void setUsePME(float alpha, int meshSize[3]);

+      /**---------------------------------------------------------------------------------------
+
+         Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+
+         @param alpha    the Ewald separation parameter
+         @param meshSize the dimensions of the mesh
+
+         --------------------------------------------------------------------------------------- */
+
+      void setUseLJPME(float alpha, int meshSize[3]);
+
      /**---------------------------------------------------------------------------------------
      
         Calculate Ewald ixn
@@ -122,6 +133,7 @@ class CpuNonbondedForce {
         @param posq             atom coordinates and charges
         @param atomCoordinates  atom coordinates (in format needed by PME)
         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
+         @param C6params         C6 parameters for multiplicative representation of dispersion
         @param exclusions       atom exclusion indices
                                 exclusions[atomIndex] contains the list of exclusions for that atom
         @param forces           force array (forces added)
@@ -130,8 +142,8 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */

      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
-                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
-                            std::vector<RealVec>& forces, double* totalEnergy) const;
+                                  const std::vector<std::pair<float, float> >& atomParameters, const std::vector<float> &C6params,
+                                  const std::vector<std::set<int> >& exclusions, std::vector<RealVec>& forces, double* totalEnergy) const;
      
      /**---------------------------------------------------------------------------------------
      
@@ -150,7 +162,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
          
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+            const std::vector<float>& C6params, const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
@@ -163,28 +175,32 @@ protected:
        bool periodic;
        bool triclinic;
        bool ewald;
-        bool pme;
-        bool tableIsValid;
+        bool ljpme, pme;
+        bool tableIsValid, expTableIsValid;
        const CpuNeighborList* neighborList;
        float recipBoxSize[3];
        RealVec periodicBoxVectors[3];
        AlignedArray<fvec4> periodicBoxVec4;
        float cutoffDistance, switchingDistance;
        float krf, crf;
-        float alphaEwald;
+        float alphaEwald, alphaDispersionEwald;
        int numRx, numRy, numRz;
-        int meshDim[3];
+        int meshDim[3], dispersionMeshDim[3];
        std::vector<float> erfcTable, ewaldScaleTable;
-        float ewaldDX, ewaldDXInv, erfcDXInv;
+        std::vector<float> exptermsTable, dExptermsTable;
+        float ewaldDX, ewaldDXInv, erfcDXInv, exptermsDX, exptermsDXInv;
        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
        float* posq;
        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;
+        float const *C6params;
        std::set<int> const* exclusions;
        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
+        float inverseRcut6;
+        float inverseRcut6Expterm;
        void* atomicCounter;

        static const float TWO_OVER_SQRT_PI;
@@ -238,10 +254,29 @@ protected:
       */
      void tabulateEwaldScaleFactor();

+      /**
+       * Create a lookup table for the scale factor used with dispersion PME.
+       */
+      void tabulateExpTerms();
+
      /**
       * Compute a fast approximation to erfc(x).
       */
      float erfcApprox(float x);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      float exptermsApprox(float R);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      float dExptermsApprox(float R);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
@@ -93,6 +93,20 @@ protected:
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
      fvec4 ewaldScaleFunction(const fvec4& x);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      fvec4 exptermsApprox(const fvec4& R);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      fvec4 dExptermsApprox(const fvec4& R);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
@@ -92,6 +92,21 @@ protected:
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
      fvec8 ewaldScaleFunction(const fvec8& x);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      fvec8 exptermsApprox(const fvec8& R);
+
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      fvec8 dExptermsApprox(const fvec8& R);
+
 };

 } // namespace OpenMM

--- a/platforms/cpu/src/CpuCustomGBForce.cpp
+++ b/platforms/cpu/src/CpuCustomGBForce.cpp
@@ -59,47 +59,68 @@ CpuCustomGBForce::ThreadData::ThreadData(int numAtoms, int numThreads, int threa
            energyGradientExpressions(energyGradientExpressions), energyParamDerivExpressions(energyParamDerivExpressions) {
    firstAtom = (threadIndex*(long long) numAtoms)/numThreads;
    lastAtom = ((threadIndex+1)*(long long) numAtoms)/numThreads;
-    for (int i = 0; i < (int) valueExpressions.size(); i++)
+    map<string, double*> variableLocations;
+    variableLocations["x"] = &x;
+    variableLocations["y"] = &y;
+    variableLocations["z"] = &z;
+    variableLocations["r"] = &r;
+    param.resize(parameterNames.size());
+    particleParam.resize(parameterNames.size()*2);
+    for (int i = 0; i < (int) parameterNames.size(); i++) {
+        variableLocations[parameterNames[i]] = &param[i];
+        for (int j = 0; j < 2; j++) {
+            stringstream name;
+            name << parameterNames[i] << (j+1);
+            variableLocations[name.str()] = &particleParam[2*i+j];
+        }
+    }
+    value.resize(valueNames.size());
+    particleValue.resize(valueNames.size()*2);
+    for (int i = 0; i < (int) valueNames.size(); i++) {
+        variableLocations[valueNames[i]] = &value[i];
+        for (int j = 0; j < 2; j++) {
+            stringstream name;
+            name << valueNames[i] << (j+1);
+            variableLocations[name.str()] = &particleValue[2*i+j];
+        }
+    }
+    for (int i = 0; i < (int) valueExpressions.size(); i++) {
+        this->valueExpressions[i].setVariableLocations(variableLocations);
        expressionSet.registerExpression(this->valueExpressions[i]);
+    }
    for (int i = 0; i < (int) valueDerivExpressions.size(); i++)
-        for (int j = 0; j < (int) valueDerivExpressions[i].size(); j++)
+        for (int j = 0; j < (int) valueDerivExpressions[i].size(); j++) {
+            this->valueDerivExpressions[i][j].setVariableLocations(variableLocations);
            expressionSet.registerExpression(this->valueDerivExpressions[i][j]);
+        }
    for (int i = 0; i < (int) valueGradientExpressions.size(); i++)
-        for (int j = 0; j < (int) valueGradientExpressions[i].size(); j++)
+        for (int j = 0; j < (int) valueGradientExpressions[i].size(); j++) {
+            this->valueGradientExpressions[i][j].setVariableLocations(variableLocations);
            expressionSet.registerExpression(this->valueGradientExpressions[i][j]);
+        }
    for (int i = 0; i < (int) valueParamDerivExpressions.size(); i++)
-        for (int j = 0; j < (int) valueParamDerivExpressions[i].size(); j++)
+        for (int j = 0; j < (int) valueParamDerivExpressions[i].size(); j++) {
+            this->valueParamDerivExpressions[i][j].setVariableLocations(variableLocations);
            expressionSet.registerExpression(this->valueParamDerivExpressions[i][j]);
-    for (int i = 0; i < (int) energyExpressions.size(); i++)
+        }
+    for (int i = 0; i < (int) energyExpressions.size(); i++) {
+        this->energyExpressions[i].setVariableLocations(variableLocations);
        expressionSet.registerExpression(this->energyExpressions[i]);
+    }
    for (int i = 0; i < (int) energyDerivExpressions.size(); i++)
-        for (int j = 0; j < (int) energyDerivExpressions[i].size(); j++)
+        for (int j = 0; j < (int) energyDerivExpressions[i].size(); j++) {
+            this->energyDerivExpressions[i][j].setVariableLocations(variableLocations);
            expressionSet.registerExpression(this->energyDerivExpressions[i][j]);
+        }
    for (int i = 0; i < (int) energyGradientExpressions.size(); i++)
-        for (int j = 0; j < (int) energyGradientExpressions[i].size(); j++)
+        for (int j = 0; j < (int) energyGradientExpressions[i].size(); j++) {
+            this->energyGradientExpressions[i][j].setVariableLocations(variableLocations);
            expressionSet.registerExpression(this->energyGradientExpressions[i][j]);
+        }
    for (int i = 0; i < (int) energyParamDerivExpressions.size(); i++)
-        for (int j = 0; j < (int) energyParamDerivExpressions[i].size(); j++)
+        for (int j = 0; j < (int) energyParamDerivExpressions[i].size(); j++) {
+            this->energyParamDerivExpressions[i][j].setVariableLocations(variableLocations);
            expressionSet.registerExpression(this->energyParamDerivExpressions[i][j]);
-    xindex = expressionSet.getVariableIndex("x");
-    yindex = expressionSet.getVariableIndex("y");
-    zindex = expressionSet.getVariableIndex("z");
-    rindex = expressionSet.getVariableIndex("r");
-    for (int i = 0; i < (int) parameterNames.size(); i++) {
-        paramIndex.push_back(expressionSet.getVariableIndex(parameterNames[i]));
-        for (int j = 1; j < 3; j++) {
-            stringstream name;
-            name << parameterNames[i] << j;
-            particleParamIndex.push_back(expressionSet.getVariableIndex(name.str()));
-        }
-    }
-    for (int i = 0; i < (int) valueNames.size(); i++) {
-        valueIndex.push_back(expressionSet.getVariableIndex(valueNames[i]));
-        for (int j = 1; j < 3; j++) {
-            stringstream name;
-            name << valueNames[i] << j;
-            particleValueIndex.push_back(expressionSet.getVariableIndex(name.str()));
-        }
        }
    value0.resize(numAtoms);
    dEdV.resize(valueNames.size());
@@ -283,13 +304,13 @@ void CpuCustomGBForce::threadComputeForce(ThreadPool& threads, int threadIndex)
        for (int j = 0; j < (int) threadData.size(); j++)
            sum += threadData[j]->value0[atom];
        values[0][atom] = sum;
-        data.expressionSet.setVariable(data.xindex, posq[4*atom]);
-        data.expressionSet.setVariable(data.yindex, posq[4*atom+1]);
-        data.expressionSet.setVariable(data.zindex, posq[4*atom+2]);
+        data.x = posq[4*atom];
+        data.y = posq[4*atom+1];
+        data.z = posq[4*atom+2];
        for (int j = 0; j < numParams; j++)
-            data.expressionSet.setVariable(data.paramIndex[j], atomParameters[atom][j]);
+            data.param[j] = atomParameters[atom][j];
        for (int i = 1; i < numValues; i++) {
-            data.expressionSet.setVariable(data.valueIndex[i-1], values[i-1][atom]);
+            data.value[i-1] = values[i-1][atom];
            values[i][atom] = (float) data.valueExpressions[i].evaluate();

            // Calculate derivatives with respect to parameters.
@@ -397,15 +418,14 @@ void CpuCustomGBForce::calculateOnePairValue(int index, int atom1, int atom2, Th
    getDeltaR(pos2, pos1, deltaR, r2, periodic, boxSize, invBoxSize);
    if (cutoff && r2 >= cutoffDistance2)
        return;
-    float r = sqrtf(r2);
+    data.r = sqrtf(r2);
    for (int i = 0; i < numParams; i++) {
-        data.expressionSet.setVariable(data.particleParamIndex[i*2], atomParameters[atom1][i]);
-        data.expressionSet.setVariable(data.particleParamIndex[i*2+1], atomParameters[atom2][i]);
+        data.particleParam[i*2] = atomParameters[atom1][i];
+        data.particleParam[i*2+1] = atomParameters[atom2][i];
    }
-    data.expressionSet.setVariable(data.rindex, r);
    for (int i = 0; i < index; i++) {
-        data.expressionSet.setVariable(data.particleValueIndex[i*2], values[i][atom1]);
-        data.expressionSet.setVariable(data.particleValueIndex[i*2+1], values[i][atom2]);
+        data.particleValue[i*2] = values[i][atom1];
+        data.particleValue[i*2+1] = values[i][atom2];
    }
    valueArray[atom1] += (float) data.valueExpressions[index].evaluate();
    
@@ -418,13 +438,13 @@ void CpuCustomGBForce::calculateOnePairValue(int index, int atom1, int atom2, Th
 void CpuCustomGBForce::calculateSingleParticleEnergyTerm(int index, ThreadData& data, int numAtoms, float* posq,
        RealOpenMM** atomParameters, float* forces, double& totalEnergy) {
    for (int i = data.firstAtom; i < data.lastAtom; i++) {
-        data.expressionSet.setVariable(data.xindex, posq[4*i]);
-        data.expressionSet.setVariable(data.yindex, posq[4*i+1]);
-        data.expressionSet.setVariable(data.zindex, posq[4*i+2]);
+        data.x = posq[4*i];
+        data.y = posq[4*i+1];
+        data.z = posq[4*i+2];
        for (int j = 0; j < numParams; j++)
-            data.expressionSet.setVariable(data.paramIndex[j], atomParameters[i][j]);
+            data.param[j] = atomParameters[i][j];
        for (int j = 0; j < (int) values.size(); j++)
-            data.expressionSet.setVariable(data.valueIndex[j], values[j][i]);
+            data.value[j] = values[j][i];
        if (includeEnergy)
            totalEnergy += (float) data.energyExpressions[index].evaluate();
        for (int j = 0; j < (int) values.size(); j++)
@@ -494,17 +514,17 @@ void CpuCustomGBForce::calculateOnePairEnergyTerm(int index, int atom1, int atom
    if (cutoff && r2 >= cutoffDistance2)
        return;
    float r = sqrtf(r2);
+    data.r = r;

    // Record variables for evaluating expressions.

    for (int i = 0; i < numParams; i++) {
-        data.expressionSet.setVariable(data.particleParamIndex[i*2], atomParameters[atom1][i]);
-        data.expressionSet.setVariable(data.particleParamIndex[i*2+1], atomParameters[atom2][i]);
+        data.particleParam[i*2] = atomParameters[atom1][i];
+        data.particleParam[i*2+1] = atomParameters[atom2][i];
    }
-    data.expressionSet.setVariable(data.rindex, r);
    for (int i = 0; i < (int) values.size(); i++) {
-        data.expressionSet.setVariable(data.particleValueIndex[i*2], values[i][atom1]);
-        data.expressionSet.setVariable(data.particleValueIndex[i*2+1], values[i][atom2]);
+        data.particleValue[i*2] = values[i][atom1];
+        data.particleValue[i*2+1] = values[i][atom2];
    }

    // Evaluate the energy and its derivatives.
@@ -571,13 +591,13 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms,
    // Compute chain rule terms for computed values that depend explicitly on particle coordinates.

    for (int i = data.firstAtom; i < data.lastAtom; i++) {
-        data.expressionSet.setVariable(data.xindex, posq[4*i]);
-        data.expressionSet.setVariable(data.yindex, posq[4*i+1]);
-        data.expressionSet.setVariable(data.zindex, posq[4*i+2]);
+        data.x = posq[4*i];
+        data.y = posq[4*i+1];
+        data.z = posq[4*i+2];
        for (int j = 0; j < numParams; j++)
-            data.expressionSet.setVariable(data.paramIndex[j], atomParameters[i][j]);
+            data.param[j] = atomParameters[i][j];
        for (int j = 1; j < (int) values.size(); j++) {
-            data.expressionSet.setVariable(data.valueIndex[j-1], values[j-1][i]);
+            data.value[j-1] = values[j-1][i];
            data.dVdX[j] = 0.0;
            data.dVdY[j] = 0.0;
            data.dVdZ[j] = 0.0;
@@ -599,7 +619,7 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms,
    // Compute chain rule terms for derivatives with respect to parameters.

    for (int i = data.firstAtom; i < data.lastAtom; i++)
-        for (int j = 0; j < data.valueIndex.size(); j++)
+        for (int j = 0; j < data.value.size(); j++)
            for (int k = 0; k < dValuedParam[j].size(); k++)
                data.energyParamDerivs[k] += dEdV[j][i]*dValuedParam[j][k][i];
 }
@@ -616,21 +636,21 @@ void CpuCustomGBForce::calculateOnePairChainRule(int atom1, int atom2, ThreadDat
    if (cutoff && r2 >= cutoffDistance2)
        return;
    float r = sqrtf(r2);
+    data.r = r;

    // Record variables for evaluating expressions.

    for (int i = 0; i < numParams; i++) {
-        data.expressionSet.setVariable(data.particleParamIndex[i*2], atomParameters[atom1][i]);
-        data.expressionSet.setVariable(data.particleParamIndex[i*2+1], atomParameters[atom2][i]);
-        data.expressionSet.setVariable(data.paramIndex[i], atomParameters[atom1][i]);
-    }
-    data.expressionSet.setVariable(data.valueIndex[0], values[0][atom1]);
-    data.expressionSet.setVariable(data.xindex, posq[4*atom1]);
-    data.expressionSet.setVariable(data.yindex, posq[4*atom1+1]);
-    data.expressionSet.setVariable(data.zindex, posq[4*atom1+2]);
-    data.expressionSet.setVariable(data.rindex, r);
-    data.expressionSet.setVariable(data.particleValueIndex[0], values[0][atom1]);
-    data.expressionSet.setVariable(data.particleValueIndex[1], values[0][atom2]);
+        data.particleParam[i*2] = atomParameters[atom1][i];
+        data.particleParam[i*2+1] = atomParameters[atom2][i];
+        data.param[i] = atomParameters[atom1][i];
+    }
+    data.value[0] = values[0][atom1];
+    data.x = posq[4*atom1];
+    data.y = posq[4*atom1+1];
+    data.z = posq[4*atom1+2];
+    data.particleValue[0] = values[0][atom1];
+    data.particleValue[1] = values[0][atom2];

    // Evaluate the derivative of each parameter with respect to position and apply forces.

@@ -644,7 +664,7 @@ void CpuCustomGBForce::calculateOnePairChainRule(int atom1, int atom2, ThreadDat
        f2 -= deltaR*(dEdV[0][atom1]*data.dVdR2[0]);
    }
    for (int i = 1; i < (int) values.size(); i++) {
-        data.expressionSet.setVariable(data.valueIndex[i], values[i][atom1]);
+        data.value[i] = values[i][atom1];
        data.dVdR1[i] = 0.0;
        data.dVdR2[i] = 0.0;
        for (int j = 0; j < i; j++) {

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
@@ -46,25 +46,31 @@ public:
 CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression,
            const vector<string>& parameterNames, const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions) :
            energyExpression(energyExpression), forceExpression(forceExpression), energyParamDerivExpressions(energyParamDerivExpressions) {
-    expressionSet.registerExpression(this->energyExpression);
-    expressionSet.registerExpression(this->forceExpression);
-    for (int i = 0; i < this->energyParamDerivExpressions.size(); i++)
-        expressionSet.registerExpression(this->energyParamDerivExpressions[i]);
-    rIndex = expressionSet.getVariableIndex("r");
+    map<string, double*> variableLocations;
+    variableLocations["r"] = &r;
+    particleParam.resize(2*parameterNames.size());
    for (int i = 0; i < (int) parameterNames.size(); i++) {
-        for (int j = 1; j < 3; j++) {
+        for (int j = 0; j < 2; j++) {
            stringstream name;
-            name << parameterNames[i] << j;
-            particleParamIndex.push_back(expressionSet.getVariableIndex(name.str()));
+            name << parameterNames[i] << (j+1);
+            variableLocations[name.str()] = &particleParam[i*2+j];
        }
    }
    energyParamDerivs.resize(energyParamDerivExpressions.size());
+    this->energyExpression.setVariableLocations(variableLocations);
+    this->forceExpression.setVariableLocations(variableLocations);
+    expressionSet.registerExpression(this->energyExpression);
+    expressionSet.registerExpression(this->forceExpression);
+    for (int i = 0; i < this->energyParamDerivExpressions.size(); i++) {
+        this->energyParamDerivExpressions[i].setVariableLocations(variableLocations);
+        expressionSet.registerExpression(this->energyParamDerivExpressions[i]);
+    }
 }

 CpuCustomNonbondedForce::CpuCustomNonbondedForce(const Lepton::CompiledExpression& energyExpression,
            const Lepton::CompiledExpression& forceExpression, const vector<string>& parameterNames, const vector<set<int> >& exclusions,
            const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions, ThreadPool& threads) :
-            cutoff(false), useSwitch(false), periodic(false), paramNames(parameterNames), exclusions(exclusions), threads(threads) {
+            cutoff(false), useSwitch(false), periodic(false), useInteractionGroups(false), paramNames(parameterNames), exclusions(exclusions), threads(threads) {
    for (int i = 0; i < threads.getNumThreads(); i++)
        threadData.push_back(new ThreadData(energyExpression, forceExpression, parameterNames, energyParamDerivExpressions));
 }
@@ -81,6 +87,7 @@ void CpuCustomNonbondedForce::setUseCutoff(RealOpenMM distance, const CpuNeighbo
  }

 void CpuCustomNonbondedForce::setInteractionGroups(const vector<pair<set<int>, set<int> > >& groups) {
+    useInteractionGroups = true;
    for (int group = 0; group < (int) groups.size(); group++) {
        const set<int>& set1 = groups[group].first;
        const set<int>& set2 = groups[group].second;
@@ -177,7 +184,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
        data.energyParamDerivs[i] = 0.0;
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (groupInteractions.size() > 0) {
+    if (useInteractionGroups) {
        // The user has specified interaction groups, so compute only the requested interactions.
        
        while (true) {
@@ -187,8 +194,8 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
            int atom1 = groupInteractions[i].first;
            int atom2 = groupInteractions[i].second;
            for (int j = 0; j < (int) paramNames.size(); j++) {
-                data.expressionSet.setVariable(data.particleParamIndex[j*2], atomParameters[atom1][j]);
-                data.expressionSet.setVariable(data.particleParamIndex[j*2+1], atomParameters[atom2][j]);
+                data.particleParam[j*2] = atomParameters[atom1][j];
+                data.particleParam[j*2+1] = atomParameters[atom2][j];
            }
            calculateOneIxn(atom1, atom2, data, forces, energy, boxSize, invBoxSize);
        }
@@ -207,12 +214,12 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
            for (int i = 0; i < (int) neighbors.size(); i++) {
                int first = neighbors[i];
                for (int j = 0; j < (int) paramNames.size(); j++)
-                    data.expressionSet.setVariable(data.particleParamIndex[j*2], atomParameters[first][j]);
+                    data.particleParam[j*2] = atomParameters[first][j];
                for (int k = 0; k < blockSize; k++) {
                    if ((exclusions[i] & (1<<k)) == 0) {
                        int second = blockAtom[k];
                        for (int j = 0; j < (int) paramNames.size(); j++)
-                            data.expressionSet.setVariable(data.particleParamIndex[j*2+1], atomParameters[second][j]);
+                            data.particleParam[j*2+1] = atomParameters[second][j];
                        calculateOneIxn(first, second, data, forces, energy, boxSize, invBoxSize);
                    }
                }
@@ -229,8 +236,8 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
            for (int jj = ii+1; jj < numberOfAtoms; jj++) {
                if (exclusions[jj].find(ii) == exclusions[jj].end()) {
                    for (int j = 0; j < (int) paramNames.size(); j++) {
-                        data.expressionSet.setVariable(data.particleParamIndex[j*2], atomParameters[ii][j]);
-                        data.expressionSet.setVariable(data.particleParamIndex[j*2+1], atomParameters[jj][j]);
+                        data.particleParam[j*2] = atomParameters[ii][j];
+                        data.particleParam[j*2+1] = atomParameters[jj][j];
                    }
                    calculateOneIxn(ii, jj, data, forces, energy, boxSize, invBoxSize);
                }
@@ -251,10 +258,10 @@ void CpuCustomNonbondedForce::calculateOneIxn(int ii, int jj, ThreadData& data,
    if (cutoff && r2 >= cutoffDistance*cutoffDistance)
        return;
    float r = sqrtf(r2);
+    data.r = r;

    // accumulate forces

-    data.expressionSet.setVariable(data.rIndex, r);
    double dEdR = (includeForce ? data.forceExpression.evaluate()/r : 0.0);
    double energy = (includeEnergy ? data.energyExpression.evaluate() : 0.0);
    double switchValue = 1.0;

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
-
-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -89,6 +88,10 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
    particleParams = params;
    bornRadii.resize(params.size()+3);
    obcChain.resize(params.size()+3);
+    for (int i = bornRadii.size()-3; i < bornRadii.size(); i++) {
+        bornRadii[i] = 0;
+        obcChain[i] = 0;
+    }
 }

 void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -50,6 +50,7 @@
 #include "lepton/CustomFunction.h"
 #include "lepton/Operation.h"
 #include "lepton/Parser.h"
+#include <iostream>
 #include "lepton/ParsedExpression.h"

 using namespace OpenMM;
@@ -528,7 +529,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec4();
 CpuNonbondedForce* createCpuNonbondedForceVec8();

 CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), nonbonded(NULL) {
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
    if (isVec8Supported())
        nonbonded = createCpuNonbondedForceVec8();
    else
@@ -575,12 +576,14 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    for (int i = 0; i < num14; i++)
        bonded14ParamArray[i] = new double[3];
    particleParams.resize(numParticles);
+    C6params.resize(numParticles);
    double sumSquaredCharges = 0.0;
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
+        C6params[i] = 8.0*pow(particleParams[i].first, 3.0) * particleParams[i].second;
        sumSquaredCharges += charge*charge;
    }
    
@@ -616,19 +619,35 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    }
    else if (nonbondedMethod == PME) {
        double alpha;
-        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2]);
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
        ewaldAlpha = alpha;
    }
-    if (nonbondedMethod == Ewald || nonbondedMethod == PME)
+    else if (nonbondedMethod == LJPME) {
+        double alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
+        ewaldAlpha = (RealOpenMM) alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], true);
+        ewaldDispersionAlpha = (RealOpenMM) alpha;
+        useSwitchingFunction = false;
+    }
+
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
        ewaldSelfEnergy = -ONE_4PI_EPS0*ewaldAlpha*sumSquaredCharges/sqrt(M_PI);
-    else
+        if(nonbondedMethod == LJPME){
+            for (int atom = 0; atom < numParticles; atom++) {
+                // Dispersion self term
+                ewaldSelfEnergy += pow(ewaldDispersionAlpha, 6.0) * C6params[atom]*C6params[atom] / 12.0;
+            }
+        }
+    } else {
        ewaldSelfEnergy = 0.0;
+    }
    rfDielectric = force.getReactionFieldDielectric();
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
-    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
+    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME);
 }

 double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
@@ -646,6 +665,20 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
            }
        }
+        if (nonbondedMethod == LJPME) {
+            // If available, use the optimized PME implementation.
+
+            vector<string> kernelNames;
+            kernelNames.push_back("CalcPmeReciprocalForce");
+            useOptimizedPme = getPlatform().supportsKernels(kernelNames);
+            if (useOptimizedPme) {
+                optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
+                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
+                optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
+                optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
+                                                                                                  dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
+            }
+        }
    }
    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
@@ -654,6 +687,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
+    bool ljpme = (nonbondedMethod == LJPME);
    if (nonbondedMethod != NoCutoff)
        nonbonded->setUseCutoff(nonbondedCutoff, *data.neighborList, rfDielectric);
    if (data.isPeriodic) {
@@ -669,9 +703,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded->setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
        nonbonded->setUseSwitchingFunction(switchingDistance);
+    if (ljpme){
+        nonbonded->setUsePME(ewaldAlpha, gridSize);
+        nonbonded->setUseLJPME(ewaldDispersionAlpha, dispersionGridSize);
+    }
    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
+        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
@@ -680,13 +718,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
-            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
+            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
    if (includeDirect) {
        ReferenceLJCoulomb14 nonbonded14;
        bondForce.calculateForce(posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
-        if (data.isPeriodic)
+        if (data.isPeriodic && nonbondedMethod != LJPME)
            energy += dispersionCoefficient/(boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2]);
    }
    return energy;
@@ -739,7 +777,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 }

 void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
-    if (nonbondedMethod != PME)
+    if (nonbondedMethod != PME && nonbondedMethod != LJPME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    if (useOptimizedPme)
        optimizedPme.getAs<const CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
@@ -751,6 +789,19 @@ void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
    }
 }

+/**
+ * Report the parameters used for the dispersion term in LJPME.
+ *
+ * @param alpha   receives the dispersion separation parameter
+ * @param nx      receives the number of dispersion grid points along the X axis
+ * @param ny      receives the number of dispersion grid points along the Y axis
+ * @param nz      receives the number of dispersion grid points along the Z axis
+ * @throws OpenMMException if the nonbonded method is not LJPME
+ */
+void CpuCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != LJPME)
+        throw OpenMMException("getLJPMEParametersInContext: This Context is not using LJPME");
+    if (useOptimizedPme) {
+        // optimizedDispersionPme is created from CalcDispersionPmeReciprocalForceKernel::Name()
+        // in execute(), so it has to be retrieved as that kernel type, not the electrostatic one.
+        optimizedDispersionPme.getAs<const CalcDispersionPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+    }
+    else {
+        // Reference path: report the values computed in initialize().
+        alpha = ewaldDispersionAlpha;
+        nx = dispersionGridSize[0];
+        ny = dispersionGridSize[1];
+        nz = dispersionGridSize[2];
+    }
+}
+
 CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) :
            CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) {
 }
@@ -1285,8 +1336,7 @@ void CpuIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langevi
        
        if (dynamics)
            delete dynamics;
-        RealOpenMM tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-        dynamics = new CpuLangevinDynamics(context.getSystem().getNumParticles(), stepSize, tau, temperature, data.threads, data.random);
+        dynamics = new CpuLangevinDynamics(context.getSystem().getNumParticles(), stepSize, friction, temperature, data.threads, data.random);
        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
        prevTemp = temperature;
        prevFriction = friction;

--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp

-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -59,8 +59,8 @@ public:
    CpuLangevinDynamics& owner;
 };

-CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
-           ReferenceStochasticDynamics(numberOfAtoms, deltaT, tau, temperature), threads(threads), random(random) {
+CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM friction, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
+           ReferenceStochasticDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
 }

 CpuLangevinDynamics::~CpuLangevinDynamics() {
@@ -120,11 +120,12 @@ void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<RealVec>& atomCo
 }

 void CpuLangevinDynamics::threadUpdate1(int threadIndex) {
-    const RealOpenMM tau = getTau();
-    const RealOpenMM vscale = EXP(-getDeltaT()/tau);
-    const RealOpenMM fscale = (1-vscale)*tau;
+    RealOpenMM dt = getDeltaT();
+    RealOpenMM friction = getFriction();
+    const RealOpenMM vscale = EXP(-dt*friction);
+    const RealOpenMM fscale = (friction == 0 ? dt : (1-vscale)/friction);
    const RealOpenMM kT = BOLTZ*getTemperature();
-    const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
+    const RealOpenMM noisescale = SQRT(kT*(1-vscale*vscale));
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -30,6 +30,7 @@
 #include "ReferencePME.h"
 #include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
+#include <iostream>

 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -57,7 +58,8 @@ public:

   --------------------------------------------------------------------------------------- */

-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), ljpme(false), tableIsValid(false), expTableIsValid(false),
+    cutoffDistance(0.0f), alphaDispersionEwald(0.0f), alphaEwald(0.0f) {
 }

 CpuNonbondedForce::~CpuNonbondedForce() {
@@ -78,11 +80,22 @@ void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neig
        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
+    inverseRcut6 = pow(cutoffDistance, -6);
    neighborList = &neighbors;
    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
+    if(alphaDispersionEwald != 0.0f){
+        // We set this here, in case setUseCutoff is called after the dispersion alpha is set.
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
    }

+}
+
 /**---------------------------------------------------------------------------------------

   Set the force to use a switching function on the Lennard-Jones interaction.
@@ -96,7 +109,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    switchingDistance = distance;
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use periodic boundary conditions.  This requires that a cutoff has
     also been set, and the smallest side of the periodic box is at least twice the cutoff
@@ -106,7 +119,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {

    assert(cutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
@@ -126,9 +139,9 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    triclinic = (periodicBoxVectors[0][1] != 0.0 || periodicBoxVectors[0][2] != 0.0 ||
            periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
            periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
-  }
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Ewald summation.

@@ -139,7 +152,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
    if (alpha != alphaEwald)
        tableIsValid = false;
    alphaEwald = alpha;
@@ -148,9 +161,9 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    numRz = kmaxz;
    ewald = true;
    tabulateEwaldScaleFactor();
-  }
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Particle-Mesh Ewald (PME) summation.

@@ -159,7 +172,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
    if (alpha != alphaEwald)
        tableIsValid = false;
    alphaEwald = alpha;
@@ -168,10 +181,40 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    meshDim[2] = meshSize[2];
    pme = true;
    tabulateEwaldScaleFactor();
+}
+
+
+/**---------------------------------------------------------------------------------------
+
+     Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+
+     @param alpha     the Ewald separation parameter for the dispersion term
+     @param meshSize  the dimensions of the dispersion mesh
+
+     --------------------------------------------------------------------------------------- */
+
+void CpuNonbondedForce::setUseLJPME(float alpha, int meshSize[3]) {
+    // A different separation parameter makes the tabulated exponential terms stale.
+    if (alpha != alphaDispersionEwald)
+        expTableIsValid = false;
+    alphaDispersionEwald = alpha;
+    dispersionMeshDim[0] = meshSize[0];
+    dispersionMeshDim[1] = meshSize[1];
+    dispersionMeshDim[2] = meshSize[2];
+    ljpme = true;
+    tabulateExpTerms();
+    if(cutoffDistance != 0.0f){
+        // We set this here, in case setUseLJPME is called after the cutoff is set
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double expterm = EXP(-dar2);
+        // Value of (1 - exp(-x)*(1 + x + x^2/2))/rc^6 at the cutoff, with x = (alpha*rc)^2.
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
    }
+}


-  void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    if (tableIsValid)
        return;
    tableIsValid = true;
@@ -188,8 +231,28 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    }
 }

+/**
+ * Build the lookup tables read by exptermsApprox() and dExptermsApprox().
+ *
+ * With x = (alphaDispersionEwald*r)^2 and r = i*exptermsDX, entry i holds
+ *     exptermsTable[i]  = 1 - exp(-x)*(1 + x + x^2/2)
+ *     dExptermsTable[i] = 1 - exp(-x)*(1 + x + x^2/2 + x^3/6)
+ * Returns immediately if the tables are already marked valid.
+ *
+ * NOTE(review): the spacing depends on cutoffDistance, but the only invalidation visible
+ * here is on a change of alpha (in setUseLJPME) -- confirm a cutoff change also resets it.
+ */
+void CpuNonbondedForce::tabulateExpTerms() {
+    if (expTableIsValid)
+        return;
+    expTableIsValid = true;
+    exptermsDX = cutoffDistance/NUM_TABLE_POINTS;
+    exptermsDXInv = 1.0f/exptermsDX;
+    // Extra entries past NUM_TABLE_POINTS; presumably padding for the lookups that read
+    // several consecutive entries starting at the clamped index.
+    exptermsTable.resize(NUM_TABLE_POINTS+4);
+    dExptermsTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        // Sample on this table's own spacing: the lookups index with exptermsDXInv, so using
+        // the erfc table's ewaldDX here would only be right if that table had been built first.
+        double r = i*exptermsDX;
+        double dalphaR = alphaDispersionEwald*r;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        exptermsTable[i]  = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+        dExptermsTable[i] = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4 + dar6/6.0));
+    }
+}
+
 void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
-                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
+                                               const vector<pair<float, float> >& atomParameters, const vector<float> &C6params, const vector<set<int> >& exclusions,
                                               vector<RealVec>& forces, double* totalEnergy) const {
    typedef std::complex<float> d_complex;

@@ -211,6 +274,29 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        if (totalEnergy)
            *totalEnergy += recipEnergy;
        pme_destroy(pmedata);
+
+        if (ljpme) {
+            // Dispersion reciprocal space terms
+            pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
+
+            std::vector<RealVec> dpmeforces;
+            for (int i = 0; i < numberOfAtoms; i++){
+                charges[i] = (RealOpenMM)C6params[i];
+                dpmeforces.push_back(RealVec());
+            }
+            RealOpenMM recipDispersionEnergy    = 0.0;
+            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
+            for (int i = 0; i < numberOfAtoms; i++){
+                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i][1] -= 2.0*dpmeforces[i][1];
+                forces[i][2] -= 2.0*dpmeforces[i][2];
+            }
+            if (totalEnergy)
+                *totalEnergy += recipDispersionEnergy;
+
+            pme_destroy(pmedata);
+        }
+
    }

    // Ewald method
@@ -224,7 +310,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c

        // setup K-vectors

-        #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
        vector<d_complex> eir(kmax*numberOfAtoms*3);
        vector<d_complex> tab_xy(numberOfAtoms);
        vector<d_complex> tab_qxyz(numberOfAtoms);
@@ -301,13 +387,14 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c


 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                                           const vector<float>& C6params, const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
    this->posq = posq;
    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
+    this->C6params = &C6params[0];
    this->exclusions = &exclusions[0];
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
@@ -319,7 +406,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    // Signal the threads to start running and wait for them to finish.
    
    ComputeDirectTask task(*this);
-    threads.execute(task);
+    threads.execute(task); // ACS calls threadcomputedirect
    threads.waitForThreads();
    
    // Signal the threads to subtract the exclusions.
@@ -350,9 +437,8 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    float* forces = &(*threadForce)[threadIndex][0];
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (ewald || pme) {
+    if (ewald || pme || ljpme) {
        // Compute the interactions from the neighbor list.
-
        while (true) {
            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
            if (nextBlock >= neighborList->getNumBlocks())
@@ -395,6 +481,17 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                        }
                        else if (includeEnergy)
                            threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                        if (ljpme) {
+                            float C6ij = C6params[i]*C6params[j];
+                            float inverseR2 = 1.0f/r2;
+                            float emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                            if(includeEnergy)
+                                threadEnergy[threadIndex] += emult;
+                            float dEdR = -6.0f*C6ij*inverseR2*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                            fvec4 result = deltaR*dEdR;
+                            (fvec4(forces+4*i)-result).store(forces+4*i);
+                            (fvec4(forces+4*j)+result).store(forces+4*j);
+                        }
                    }
                }
            }
@@ -476,7 +573,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    fvec4 result = deltaR*dEdR;
    (fvec4(forces+4*ii)+result).store(forces+4*ii);
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
-  }
+}

 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
@@ -502,3 +599,18 @@ float CpuNonbondedForce::erfcApprox(float x) {
    return coeff1*erfcTable[index] + coeff2*erfcTable[index+1];
 }

+// Look up exptermsTable at distance x, interpolating linearly between the two
+// neighbouring entries. The lower index is clamped to NUM_TABLE_POINTS.
+float CpuNonbondedForce::exptermsApprox(float x) {
+    const float scaled = x*exptermsDXInv;
+    const int slot = min((int) floor(scaled), NUM_TABLE_POINTS);
+    const float upperWeight = scaled-slot;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*exptermsTable[slot] + upperWeight*exptermsTable[slot+1];
+}
+
+// Look up dExptermsTable at distance x, interpolating linearly between the two
+// neighbouring entries. The lower index is clamped to NUM_TABLE_POINTS.
+float CpuNonbondedForce::dExptermsApprox(float x) {
+    const float scaled = x*exptermsDXInv;
+    const int slot = min((int) floor(scaled), NUM_TABLE_POINTS);
+    const float upperWeight = scaled-slot;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*dExptermsTable[slot] + upperWeight*dExptermsTable[slot+1];
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -25,6 +25,7 @@
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec4.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -213,7 +214,6 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,

 void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.
-    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -263,7 +263,6 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
 template <int PERIODIC_TYPE>
 void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
-    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
@@ -278,6 +277,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);

@@ -318,7 +318,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec4 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec4 eps = blockAtomEpsilon*atomEpsilon;
+            fvec4 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -328,6 +329,17 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+
+            if (ljpme) {
+                fvec4 C6ij = C6s*C6params[atom];
+                fvec4 inverseR2 = inverseR*inverseR;
+                fvec4 mysig2 = sig*sig;
+                fvec4 mysig6 = mysig2*mysig2*mysig2;
+                fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
        }
        else {
            energy = 0.0f;
@@ -420,3 +432,30 @@ fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
    transpose(t1, t2, t3, t4);
    return coeff1*t1 + coeff2*t2;
 }
+
+// Four-lane lookup into exptermsTable with linear interpolation: each lane of r is
+// converted to a table position, the index is clamped to NUM_TABLE_POINTS, and the
+// two neighbouring entries are blended by the fractional part.
+fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
+    fvec4 scaled = r*exptermsDXInv;
+    ivec4 slot = min(floor(scaled), NUM_TABLE_POINTS);
+    fvec4 upperWeight = scaled-slot;
+    fvec4 lowerWeight = 1.0f-upperWeight;
+    // One vector per lane, each starting at that lane's table index.
+    fvec4 lane0(&exptermsTable[slot[0]]);
+    fvec4 lane1(&exptermsTable[slot[1]]);
+    fvec4 lane2(&exptermsTable[slot[2]]);
+    fvec4 lane3(&exptermsTable[slot[3]]);
+    // After the transpose the first two vectors are used as the lower/upper entries.
+    transpose(lane0, lane1, lane2, lane3);
+    return lowerWeight*lane0 + upperWeight*lane1;
+}
+
+// Four-lane lookup into dExptermsTable with linear interpolation: each lane of r is
+// converted to a table position, the index is clamped to NUM_TABLE_POINTS, and the
+// two neighbouring entries are blended by the fractional part.
+fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
+    fvec4 scaled = r*exptermsDXInv;
+    ivec4 slot = min(floor(scaled), NUM_TABLE_POINTS);
+    fvec4 upperWeight = scaled-slot;
+    fvec4 lowerWeight = 1.0f-upperWeight;
+    // One vector per lane, each starting at that lane's table index.
+    fvec4 lane0(&dExptermsTable[slot[0]]);
+    fvec4 lane1(&dExptermsTable[slot[1]]);
+    fvec4 lane2(&dExptermsTable[slot[2]]);
+    fvec4 lane3(&dExptermsTable[slot[3]]);
+    // After the transpose the first two vectors are used as the lower/upper entries.
+    transpose(lane0, lane1, lane2, lane3);
+    return lowerWeight*lane0 + upperWeight*lane1;
+}
+
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -27,6 +27,7 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/hardware.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -81,7 +82,6 @@ enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, Periodic

 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.    
-    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -308,6 +308,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    fvec8 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]], C6params[blockAtom[4]], C6params[blockAtom[5]], C6params[blockAtom[6]], C6params[blockAtom[7]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
@@ -348,7 +349,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec8 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec8 sig6 = sig2*sig2*sig2;
-            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec8 eps = blockAtomEpsilon*atomEpsilon;
+            fvec8 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -358,6 +360,17 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+            if (ljpme) {
+                fvec8 C6ij = C6s*C6params[atom];
+                fvec8 inverseR2 = inverseR*inverseR;
+                fvec8 mysig2 = sig*sig;
+                fvec8 mysig6 = mysig2*mysig2*mysig2;
+                fvec8 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec8 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
+
        }
        else {
            energy = 0.0f;
@@ -464,4 +477,45 @@ fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
    return coeff1*s1 + coeff2*s2;
 }
+
+fvec8 CpuNonbondedForceVec8::exptermsApprox(const fvec8& r) {  // approximates a tabulated function of r by linear interpolation in exptermsTable, eight lanes at a time
+    fvec8 r1 = r*exptermsDXInv;  // r rescaled to table-index units (exptermsDXInv is presumably 1/table spacing -- confirm)
+    ivec8 index = min(floor(r1), NUM_TABLE_POINTS);  // per-lane table index, capped at NUM_TABLE_POINTS
+    fvec8 coeff2 = r1-index;  // weight of the next entry; exceeds 1 (i.e. extrapolates) in lanes where the index was capped
+    fvec8 coeff1 = 1.0f-coeff2;  // weight of the indexed entry
+    ivec4 indexLower = index.lowerVec();  // presumably lanes 0-3 of index -- confirm
+    ivec4 indexUpper = index.upperVec();  // presumably lanes 4-7 of index -- confirm
+    fvec4 t1(&exptermsTable[indexLower[0]]);  // NOTE(review): each fvec4 presumably loads 4 consecutive floats starting at the index, so the table must be readable up to index+3 -- confirm padding
+    fvec4 t2(&exptermsTable[indexLower[1]]);
+    fvec4 t3(&exptermsTable[indexLower[2]]);
+    fvec4 t4(&exptermsTable[indexLower[3]]);
+    fvec4 t5(&exptermsTable[indexUpper[0]]);
+    fvec4 t6(&exptermsTable[indexUpper[1]]);
+    fvec4 t7(&exptermsTable[indexUpper[2]]);
+    fvec4 t8(&exptermsTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;  // only s1 and s2 are used below; s3 and s4 are filled by transpose() and then ignored
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);  // presumably leaves table[index] of every lane in s1 and table[index+1] in s2 -- confirm
+    return coeff1*s1 + coeff2*s2;  // linear blend of the two neighbouring table entries
+}
+
+fvec8 CpuNonbondedForceVec8::dExptermsApprox(const fvec8& r) {  // same interpolation scheme as exptermsApprox(), but reading dExptermsTable
+    fvec8 r1 = r*exptermsDXInv;  // r rescaled to table-index units; the scale factor is shared with exptermsTable
+    ivec8 index = min(floor(r1), NUM_TABLE_POINTS);  // per-lane table index, capped at NUM_TABLE_POINTS
+    fvec8 coeff2 = r1-index;  // weight of the next entry; exceeds 1 (i.e. extrapolates) in lanes where the index was capped
+    fvec8 coeff1 = 1.0f-coeff2;  // weight of the indexed entry
+    ivec4 indexLower = index.lowerVec();  // presumably lanes 0-3 of index -- confirm
+    ivec4 indexUpper = index.upperVec();  // presumably lanes 4-7 of index -- confirm
+    fvec4 t1(&dExptermsTable[indexLower[0]]);  // NOTE(review): each fvec4 presumably loads 4 consecutive floats starting at the index, so the table must be readable up to index+3 -- confirm padding
+    fvec4 t2(&dExptermsTable[indexLower[1]]);
+    fvec4 t3(&dExptermsTable[indexLower[2]]);
+    fvec4 t4(&dExptermsTable[indexLower[3]]);
+    fvec4 t5(&dExptermsTable[indexUpper[0]]);
+    fvec4 t6(&dExptermsTable[indexUpper[1]]);
+    fvec4 t7(&dExptermsTable[indexUpper[2]]);
+    fvec4 t8(&dExptermsTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;  // only s1 and s2 are used below; s3 and s4 are filled by transpose() and then ignored
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);  // presumably leaves table[index] of every lane in s1 and table[index+1] in s2 -- confirm
+    return coeff1*s1 + coeff2*s2;  // linear blend of the two neighbouring table entries
+}
+
 #endif
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -127,6 +127,8 @@ void CpuPlatform::contextDestroyed(ContextImpl& context) const {
    PlatformData* data = contextData[&context];
    delete data;
    contextData.erase(&context);
+    ReferencePlatform::PlatformData* refPlatformData = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    delete refPlatformData;
 }

 CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -599,7 +599,8 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL),
+            pmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL) {  // NOTE(review): members are initialized in declaration order, not list order; the declarations shown in this diff appear to place pmeio and fft before dispersionFft, so compilers may warn (-Wreorder). Harmless here because every initializer is a constant.
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -636,6 +637,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class SortTrait : public CudaSort::SortTrait {
        int getDataSize() const {return 8;}
@@ -667,26 +677,36 @@ private:
    CudaArray* pmeEnergyBuffer;
    CudaSort* sort;
    Kernel cpuPme;
+    Kernel cpuDispersionPme;
    PmeIO* pmeio;
    CUstream pmeStream;
    CUevent pmeSyncEvent;
    CudaFFT3D* fft;
    cufftHandle fftForward;
    cufftHandle fftBackward;
+    CudaFFT3D* dispersionFft;
+    cufftHandle dispersionFftForward;
+    cufftHandle dispersionFftBackward;
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
+    CUfunction pmeDispersionGridIndexKernel;
    CUfunction pmeSpreadChargeKernel;
+    CUfunction pmeDispersionSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
+    CUfunction pmeDispersionFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
+    CUfunction pmeEvalDispersionEnergyKernel;
    CUfunction pmeConvolutionKernel;
+    CUfunction pmeDispersionConvolutionKernel;
    CUfunction pmeInterpolateForceKernel;
-    std::map<std::string, std::string> pmeDefines;
+    CUfunction pmeInterpolateDispersionForceKernel;
    std::vector<std::pair<int, int> > exceptionAtoms;
-    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int interpolateForceThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };
@@ -1432,7 +1452,7 @@ private:
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
-    void recordGlobalValue(double value, GlobalTarget target);
+    void recordGlobalValue(double value, GlobalTarget target, CustomIntegrator& integrator);
    void recordChangedParameters(ContextImpl& context);
    bool evaluateCondition(int step);
    CudaContext& cu;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -78,8 +78,9 @@ public:
     * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
     * @param kernel           the code to evaluate the interaction
     * @param forceGroup       the force group in which the interaction should be calculated
+     * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
     */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList=false);
    /**
     * Add a per-atom parameter that the default interaction kernel may depend on.
     */
@@ -189,6 +190,12 @@ public:
    CudaArray& getInteractingAtoms() {
        return *interactingAtoms;
    }
+    /**
+     * Get the array containing single pairs in the neighbor list.
+     */
+    CudaArray& getSinglePairs() {
+        return *singlePairs;  // dereferences the singlePairs member pointer; NOTE(review): assumes it has been allocated before this is called -- confirm
+    }
    /**
     * Get the array containing exclusion flags.
     */
@@ -270,6 +277,8 @@ private:
    CudaArray* interactingTiles;
    CudaArray* interactingAtoms;
    CudaArray* interactionCount;
+    CudaArray* singlePairs;
+    CudaArray* singlePairCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
    CudaArray* sortedBlocks;
@@ -288,8 +297,8 @@ private:
    std::map<int, double> groupCutoff;
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
-    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
-    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
 };

 /**

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -83,7 +83,7 @@ private:
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
    std::vector<double> contextNonbondedFractions;
-    int* tileCounts;
+    int2* interactionCounts;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;
@@ -439,6 +439,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class Task;
    CudaPlatform::PlatformData& data;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -52,6 +52,7 @@
 #include <set>
 #include <sstream>
 #include <typeinfo>
+#include <sys/stat.h>
 #include <cudaProfiler.h>
 #ifndef WIN32
  #include <unistd.h>
@@ -127,9 +128,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    string testCompilerCommand = this->compiler+" --version > /dev/null 2> /dev/null";
    int res = std::system(testCompilerCommand.c_str());
 #endif
-    isNvccAvailable = (res == 0);
+    struct stat info;
+    isNvccAvailable = (res == 0 && stat(tempDir.c_str(), &info) == 0);
+    int cudaDriverVersion;
+    cuDriverGetVersion(&cudaDriverVersion);
    static bool hasShownNvccWarning = false;
-    if (hasCompilerKernel && !isNvccAvailable && !hasShownNvccWarning) {
+    if (hasCompilerKernel && !isNvccAvailable && !hasShownNvccWarning && cudaDriverVersion < 8000) {
        hasShownNvccWarning = true;
        printf("Could not find nvcc.  Using runtime compiler, which may produce slower performance.  ");
 #ifdef WIN32
@@ -205,14 +209,15 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking

    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-#if __CUDA_API_VERSION < 7000
+    int numThreadBlocksPerComputeUnit = (major >= 6 ? 4 : 6);
+    if (cudaDriverVersion < 7000) {
        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
        // its compute capability as 5.2, but the compiler doesn't support
        // anything beyond 5.0.
        if (major == 5)
            minor = 0;
-#endif
-#if __CUDA_API_VERSION < 8000
+    }
+    if (cudaDriverVersion < 8000) {
        // This is a workaround to support Pascal with CUDA 7.5.  It reports
        // its compute capability as 6.x, but the compiler doesn't support
        // anything beyond 5.3.
@@ -220,7 +225,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            major = 5;
            minor = 3;
        }
-#endif
+    }
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;

@@ -241,7 +246,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
-    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
    if (useDoublePrecision) {
        posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp