Merged changes from main branch

cd874b2b · peastman · a783b996 · b84e22ba · cd874b2b · cd874b2b
Commit cd874b2b authored Feb 22, 2017 by peastman
20 changed files
--- a/platforms/cpu/include/CpuCustomNonbondedForce.h
+++ b/platforms/cpu/include/CpuCustomNonbondedForce.h
-/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -122,7 +122,6 @@ class CpuCustomNonbondedForce {
                          double* fixedParameters, const std::map<std::string, double>& globalParameters,
                          std::vector<AlignedArray<float> >& threadForce, bool includeForce, bool includeEnergy, double& totalEnergy, double* energyParamDerivs);
 private:
-    class ComputeForceTask;
    class ThreadData;
    bool cutoff;

--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -36,7 +36,6 @@ namespace OpenMM {
 class CpuGBSAOBCForce {
 public:
-    class ComputeTask;
    CpuGBSAOBCForce();
    /**

--- a/platforms/cpu/include/CpuGayBerneForce.h
+++ b/platforms/cpu/include/CpuGayBerneForce.h
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Portions copyright (c) 2016-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -45,7 +45,6 @@ namespace OpenMM {
 class CpuGayBerneForce {
 public:
    struct Matrix;
-    class ComputeTask;
    /**
     * Constructor.

--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -54,8 +54,6 @@ namespace OpenMM {
 */
 class CpuCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
 public:
-    class InitForceTask;
-    class SumForceTask;
    CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context);
    /**
     * Initialize the kernel.
@@ -251,27 +249,37 @@ public:
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
    /**
     * Get the parameters being used for PME.
-     * 
+     *
     * @param alpha   the separation parameter
     * @param nx      the number of grid points along the X axis
     * @param ny      the number of grid points along the Y axis
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class PmeIO;
    CpuPlatform::PlatformData& data;
    int numParticles, num14;
    int **bonded14IndexArray;
    double **bonded14ParamArray;
-    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldSelfEnergy, dispersionCoefficient;
+    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient;
-    int kmax[3], gridSize[3];
+    int kmax[3], gridSize[3], dispersionGridSize[3];
-    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme;
+    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme;
    std::vector<std::set<int> > exclusions;
    std::vector<std::pair<float, float> > particleParams;
+    std::vector<float> C6params;
    NonbondedMethod nonbondedMethod;
    CpuNonbondedForce* nonbonded;
-    Kernel optimizedPme;
+    Kernel optimizedPme, optimizedDispersionPme;
    CpuBondForce bondForce;
 };

--- a/platforms/cpu/include/CpuLangevinDynamics.h
+++ b/platforms/cpu/include/CpuLangevinDynamics.h
-/* Portions copyright (c) 2013-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2013-2017 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -35,9 +35,6 @@ namespace OpenMM {
 class CpuLangevinDynamics : public ReferenceStochasticDynamics {
 public:
-    class Update1Task;
-    class Update2Task;
-    class Update3Task;
    /**
     * Constructor.
     *

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -45,7 +45,6 @@ namespace OpenMM {
 class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
-    class ThreadTask;
    class Voxels;
    CpuNeighborList(int blockSize);
    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -39,7 +39,6 @@ namespace OpenMM {
 class CpuNonbondedForce {
    public:
-        class ComputeDirectTask;
      /**---------------------------------------------------------------------------------------
@@ -104,16 +103,27 @@ class CpuNonbondedForce {
      /**---------------------------------------------------------------------------------------
         Set the force to use Particle-Mesh Ewald (PME) summation.
         @param alpha    the Ewald separation parameter
         @param gridSize the dimensions of the mesh
         --------------------------------------------------------------------------------------- */
      void setUsePME(float alpha, int meshSize[3]);
+      /**---------------------------------------------------------------------------------------
+         Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+         @param alpha    the Ewald separation parameter
+         @param meshSize the dimensions of the mesh
+         --------------------------------------------------------------------------------------- */
+      void setUseLJPME(float alpha, int meshSize[3]);
      /**---------------------------------------------------------------------------------------
         Calculate Ewald ixn
@@ -122,16 +132,17 @@ class CpuNonbondedForce {
         @param posq             atom coordinates and charges
         @param atomCoordinates  atom coordinates (in format needed by PME)
         @param atomParameters   atom parameters (sigma/2, 2*sqrt(epsilon))
+         @param C6params         C6 parameters for multiplicative representation of dispersion
         @param exclusions       atom exclusion indices
                                 exclusions[atomIndex] contains the list of exclusions for that atom
         @param forces           force array (forces added)
         @param totalEnergy      total energy
         --------------------------------------------------------------------------------------- */
      void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<Vec3>& atomCoordinates,
-                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
+                                  const std::vector<std::pair<float, float> >& atomParameters, const std::vector<float> &C6params,
-                            std::vector<Vec3>& forces, double* totalEnergy) const;
+                                  const std::vector<std::set<int> >& exclusions, std::vector<Vec3>& forces, double* totalEnergy) const;
      /**---------------------------------------------------------------------------------------
@@ -150,7 +161,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<Vec3>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+            const std::vector<float>& C6params, const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
    /**
     * This routine contains the code executed by each thread.
@@ -163,28 +174,32 @@ protected:
        bool periodic;
        bool triclinic;
        bool ewald;
-        bool pme;
+        bool ljpme, pme;
-        bool tableIsValid;
+        bool tableIsValid, expTableIsValid;
        const CpuNeighborList* neighborList;
        float recipBoxSize[3];
        Vec3 periodicBoxVectors[3];
        AlignedArray<fvec4> periodicBoxVec4;
        float cutoffDistance, switchingDistance;
        float krf, crf;
-        float alphaEwald;
+        float alphaEwald, alphaDispersionEwald;
        int numRx, numRy, numRz;
-        int meshDim[3];
+        int meshDim[3], dispersionMeshDim[3];
        std::vector<float> erfcTable, ewaldScaleTable;
-        float ewaldDX, ewaldDXInv, erfcDXInv;
+        std::vector<float> exptermsTable, dExptermsTable;
+        float ewaldDX, ewaldDXInv, erfcDXInv, exptermsDX, exptermsDXInv;
        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
        float* posq;
        Vec3 const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
+        float const *C6params;
        std::set<int> const* exclusions;
        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
+        float inverseRcut6;
+        float inverseRcut6Expterm;
        void* atomicCounter;
        static const float TWO_OVER_SQRT_PI;
@@ -238,10 +253,29 @@ protected:
       */
      void tabulateEwaldScaleFactor();
+      /**
+       * Create a lookup table for the scale factor used with dispersion PME.
+       */
+      void tabulateExpTerms();
      /**
       * Compute a fast approximation to erfc(x).
       */
      float erfcApprox(float x);
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      float exptermsApprox(float R);
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      float dExptermsApprox(float R);
 };
 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
@@ -88,11 +88,25 @@ protected:
       * Compute a fast approximation to erfc(x).
       */
      fvec4 erfcApprox(const fvec4& x);
      /**
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
      fvec4 ewaldScaleFunction(const fvec4& x);
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      fvec4 exptermsApprox(const fvec4& R);
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      fvec4 dExptermsApprox(const fvec4& R);
 };
 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
@@ -92,6 +92,21 @@ protected:
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
      fvec8 ewaldScaleFunction(const fvec8& x);
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME energies.
+       */
+      fvec8 exptermsApprox(const fvec8& R);
+      /**
+       * Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
+       * where dar = (dispersionAlpha * R)
+       * needed for LJPME forces.
+       */
+      fvec8 dExptermsApprox(const fvec8& R);
 };
 } // namespace OpenMM

--- a/platforms/cpu/include/CpuSETTLE.h
+++ b/platforms/cpu/include/CpuSETTLE.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -45,8 +45,6 @@ namespace OpenMM {
 */
 class OPENMM_EXPORT_CPU CpuSETTLE : public ReferenceConstraintAlgorithm {
 public:
-    class ApplyToPositionsTask;
-    class ApplyToVelocitiesTask;
    CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads);
    ~CpuSETTLE();

--- a/platforms/cpu/src/CpuBondForce.cpp
+++ b/platforms/cpu/src/CpuBondForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2014-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2014-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,25 +35,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuBondForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuBondForce& owner, vector<Vec3>& atomCoordinates, double** parameters, vector<Vec3>& forces, 
-        vector<double>& threadEnergy, double* totalEnergy, ReferenceBondIxn& referenceBondIxn) : owner(owner), atomCoordinates(atomCoordinates),
-        parameters(parameters), forces(forces), threadEnergy(threadEnergy), totalEnergy(totalEnergy), referenceBondIxn(referenceBondIxn) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        double* energy = (totalEnergy == NULL ? NULL : &threadEnergy[threadIndex]);
-        owner.threadComputeForce(threads, threadIndex, atomCoordinates, parameters, forces, energy, referenceBondIxn);
-    }
-    CpuBondForce& owner;
-    vector<Vec3>& atomCoordinates;
-    double** parameters;
-    vector<Vec3>& forces;
-    vector<double>& threadEnergy;
-    double* totalEnergy;
-    ReferenceBondIxn& referenceBondIxn;
-};
 CpuBondForce::CpuBondForce() {
 }
@@ -188,8 +169,10 @@ void CpuBondForce::calculateForce(vector<Vec3>& atomCoordinates, double** parame
    // Have the worker threads compute their forces.
    vector<double> threadEnergy(threads->getNumThreads(), 0);
-    ComputeForceTask task(*this, atomCoordinates, parameters, forces, threadEnergy, totalEnergy, referenceBondIxn);
+    threads->execute([&] (ThreadPool& threads, int threadIndex) {
-    threads->execute(task);
+        double* energy = (totalEnergy == NULL ? NULL : &threadEnergy[threadIndex]);
+        threadComputeForce(threads, threadIndex, atomCoordinates, parameters, forces, energy, referenceBondIxn);
+    });
    threads->waitForThreads();
    // Compute any "extra" bonds.

--- a/platforms/cpu/src/CpuCustomGBForce.cpp
+++ b/platforms/cpu/src/CpuCustomGBForce.cpp
-/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -33,16 +33,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomGBForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomGBForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomGBForce& owner;
-};
 CpuCustomGBForce::ThreadData::ThreadData(int numAtoms, int numThreads, int threadIndex,
                      const vector<Lepton::CompiledExpression>& valueExpressions,
                      const vector<vector<Lepton::CompiledExpression> >& valueDerivExpressions,
@@ -206,7 +196,7 @@ void CpuCustomGBForce::calculateIxn(int numberOfAtoms, float* posq, double** ato
    // Calculate the first computed value.
-    ComputeForceTask task(*this);
+    auto task = [&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); };
    gmx_atomic_set(&counter, 0);
    threads.execute(task);
    threads.waitForThreads();

--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
-/* Portions copyright (c) 2009-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -37,16 +37,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomManyParticleForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomManyParticleForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomManyParticleForce& owner;
-};
 CpuCustomManyParticleForce::CpuCustomManyParticleForce(const CustomManyParticleForce& force, ThreadPool& threads) :
            threads(threads), useCutoff(false), usePeriodic(false), neighborList(NULL) {
    numParticles = force.getNumParticles();
@@ -141,8 +131,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, double*
    // Signal the threads to start running and wait for them to finish.
-    ComputeForceTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Combine the energies from all the threads.

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
-/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -33,16 +33,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomNonbondedForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomNonbondedForce& owner;
-};
 CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression,
            const vector<string>& parameterNames, const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions) :
            energyExpression(energyExpression), forceExpression(forceExpression), energyParamDerivExpressions(energyParamDerivExpressions) {
@@ -150,8 +140,7 @@ void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, v
    // Signal the threads to start running and wait for them to finish.
-    ComputeForceTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Combine the energies from all the threads.

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
-/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -36,16 +36,6 @@ const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
 const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
 const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
-class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuGBSAOBCForce& owner;
-};
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
@@ -110,9 +100,8 @@ void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<Align
    // Signal the threads to start running and wait for them to finish.
-    ComputeTask task(*this);
    gmx_atomic_set(&counter, 0);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
    threads.waitForThreads(); // Compute Born radii
    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();

--- a/platforms/cpu/src/CpuGayBerneForce.cpp
+++ b/platforms/cpu/src/CpuGayBerneForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Portions copyright (c) 2016-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -44,17 +44,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuGayBerneForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGayBerneForce& owner, CpuNeighborList* neighborList) : owner(owner), neighborList(neighborList) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex, neighborList);
-    }
-    CpuGayBerneForce& owner;
-    CpuNeighborList* neighborList;
-};
 CpuGayBerneForce::CpuGayBerneForce(const GayBerneForce& force) {
    // Record the force parameters.
@@ -137,8 +126,7 @@ double CpuGayBerneForce::calculateForce(const vector<Vec3>& positions, std::vect
    // Signal the threads to compute the pairwise interactions.
-    ComputeTask task(*this, data.neighborList);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex, data.neighborList); });
-    threads.execute(task);
    threads.waitForThreads();
    // Signal the threads to compute exceptions.

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -50,6 +50,7 @@
 #include "lepton/CustomFunction.h"
 #include "lepton/Operation.h"
 #include "lepton/Parser.h"
+#include <iostream>
 #include "lepton/ParsedExpression.h"
 using namespace OpenMM;
@@ -137,35 +138,27 @@ static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>&
    return 0.5*energy;
 }
-class CpuCalcForcesAndEnergyKernel::SumForceTask : public ThreadPool::Task {
+CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
-public:
+        CalcForcesAndEnergyKernel(name, platform), data(data) {
-    SumForceTask(int numParticles, vector<Vec3>& forceData, CpuPlatform::PlatformData& data) : numParticles(numParticles), forceData(forceData), data(data) {
+    // Create a Reference platform version of this kernel.
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
+    ReferenceKernelFactory referenceFactory;
-        // Sum the contributions to forces that have been calculated by different threads.
+    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
+}
-        int numThreads = threads.getNumThreads();
-        int start = threadIndex*numParticles/numThreads;
-        int end = (threadIndex+1)*numParticles/numThreads;
-        for (int i = start; i < end; i++) {
-            fvec4 f(0.0f);
-            for (int j = 0; j < numThreads; j++)
-                f += fvec4(&data.threadForce[j][4*i]);
-            forceData[i][0] += f[0];
-            forceData[i][1] += f[1];
-            forceData[i][2] += f[2];
-        }
-    }
-    int numParticles;
-    vector<Vec3>& forceData;
-    CpuPlatform::PlatformData& data;
-};
-class CpuCalcForcesAndEnergyKernel::InitForceTask : public ThreadPool::Task {
+void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
-public:
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
-    InitForceTask(int numParticles, ContextImpl& context, CpuPlatform::PlatformData& data) : numParticles(numParticles), positionsValid(true), context(context), data(data) {
+    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
-    }
+}
-    void execute(ThreadPool& threads, int threadIndex) {
+void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
+    // Convert positions to single precision and clear the forces.
+    int numParticles = context.getSystem().getNumParticles();
+    bool positionsValid = true;
+    data.threads.execute([&] (ThreadPool& threads, int threadIndex) {
        // Convert the positions to single precision and apply periodic boundary conditions
        AlignedArray<float>& posq = data.posq;
@@ -218,36 +211,9 @@ public:
        fvec4 zero(0.0f);
        for (int j = 0; j < numParticles; j++)
            zero.store(&data.threadForce[threadIndex][j*4]);
-    }
+    });
-    int numParticles;
-    bool positionsValid;
-    ContextImpl& context;
-    CpuPlatform::PlatformData& data;
-};
-CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
-        CalcForcesAndEnergyKernel(name, platform), data(data) {
-    // Create a Reference platform version of this kernel.
-    ReferenceKernelFactory referenceFactory;
-    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
-}
-void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
-    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
-}
-void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
-    // Convert positions to single precision and clear the forces.
-    int numParticles = context.getSystem().getNumParticles();
-    InitForceTask task(numParticles, context, data);
-    data.threads.execute(task);
    data.threads.waitForThreads();
-    if (!task.positionsValid)
+    if (!positionsValid)
        throw OpenMMException("Particle coordinate is nan");
    // Determine whether we need to recompute the neighbor list.
@@ -302,8 +268,23 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
 double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    // Sum the forces from all the threads.
-    SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
+    data.threads.execute([&] (ThreadPool& threads, int threadIndex) {
-    data.threads.execute(task);
+        // Sum the contributions to forces that have been calculated by different threads.
+        int numParticles = context.getSystem().getNumParticles();
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        vector<Vec3>& forceData = extractForces(context);
+        for (int i = start; i < end; i++) {
+            fvec4 f(0.0f);
+            for (int j = 0; j < numThreads; j++)
+                f += fvec4(&data.threadForce[j][4*i]);
+            forceData[i][0] += f[0];
+            forceData[i][1] += f[1];
+            forceData[i][2] += f[2];
+        }
+    });
    data.threads.waitForThreads();
    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups, valid);
 }
@@ -528,7 +509,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec4();
 CpuNonbondedForce* createCpuNonbondedForceVec8();
 CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), nonbonded(NULL) {
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
    if (isVec8Supported())
        nonbonded = createCpuNonbondedForceVec8();
    else
@@ -575,12 +556,14 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    for (int i = 0; i < num14; i++)
        bonded14ParamArray[i] = new double[3];
    particleParams.resize(numParticles);
+    C6params.resize(numParticles);
    double sumSquaredCharges = 0.0;
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
+        C6params[i] = 8.0*pow(particleParams[i].first, 3.0) * particleParams[i].second;
        sumSquaredCharges += charge*charge;
    }
@@ -616,19 +599,35 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    }
    else if (nonbondedMethod == PME) {
        double alpha;
-        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2]);
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
        ewaldAlpha = alpha;
    }
-    if (nonbondedMethod == Ewald || nonbondedMethod == PME)
+    else if (nonbondedMethod == LJPME) {
+        double alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
+        ewaldAlpha = alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], true);
+        ewaldDispersionAlpha = alpha;
+        useSwitchingFunction = false;
+    }
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
        ewaldSelfEnergy = -ONE_4PI_EPS0*ewaldAlpha*sumSquaredCharges/sqrt(M_PI);
-    else
+        if(nonbondedMethod == LJPME){
+            for (int atom = 0; atom < numParticles; atom++) {
+                // Dispersion self term
+                ewaldSelfEnergy += pow(ewaldDispersionAlpha, 6.0) * C6params[atom]*C6params[atom] / 12.0;
+            }
+        }
+    } else {
        ewaldSelfEnergy = 0.0;
+    }
    rfDielectric = force.getReactionFieldDielectric();
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
-    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
+    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME);
 }
 double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
@@ -646,6 +645,20 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
            }
        }
+        if (nonbondedMethod == LJPME) {
+            // If available, use the optimized PME implementation.
+            vector<string> kernelNames;
+            kernelNames.push_back("CalcPmeReciprocalForce");
+            useOptimizedPme = getPlatform().supportsKernels(kernelNames);
+            if (useOptimizedPme) {
+                optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
+                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
+                optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
+                optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
+                                                                                                  dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
+            }
+        }
    }
    AlignedArray<float>& posq = data.posq;
    vector<Vec3>& posData = extractPositions(context);
@@ -654,6 +667,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
+    bool ljpme = (nonbondedMethod == LJPME);
    if (nonbondedMethod != NoCutoff)
        nonbonded->setUseCutoff(nonbondedCutoff, *data.neighborList, rfDielectric);
    if (data.isPeriodic) {
@@ -669,9 +683,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded->setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
        nonbonded->setUseSwitchingFunction(switchingDistance);
+    if (ljpme){
+        nonbonded->setUsePME(ewaldAlpha, gridSize);
+        nonbonded->setUseLJPME(ewaldDispersionAlpha, dispersionGridSize);
+    }
    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
+        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
@@ -680,13 +698,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
-            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
+            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
    if (includeDirect) {
        ReferenceLJCoulomb14 nonbonded14;
        bondForce.calculateForce(posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
-        if (data.isPeriodic)
+        if (data.isPeriodic && nonbondedMethod != LJPME)
            energy += dispersionCoefficient/(boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2]);
    }
    return energy;
@@ -739,7 +757,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 }
 void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
-    if (nonbondedMethod != PME)
+    if (nonbondedMethod != PME && nonbondedMethod != LJPME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    if (useOptimizedPme)
        optimizedPme.getAs<const CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
@@ -751,6 +769,19 @@ void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
    }
 }
+/**
+ * Get the parameters being used for the dispersion (LJ) part of LJPME.
+ *
+ * When the optimized PME implementation is in use, the values are read back from the
+ * optimized dispersion kernel; otherwise the values stored by this kernel are returned.
+ *
+ * @param alpha   on exit, the dispersion separation parameter
+ * @param nx      on exit, the dispersion grid size along the first axis
+ * @param ny      on exit, the dispersion grid size along the second axis
+ * @param nz      on exit, the dispersion grid size along the third axis
+ * @throws OpenMMException if the nonbonded method is not LJPME
+ */
+void CpuCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != LJPME)
+        throw OpenMMException("getLJPMEParametersInContext: This Context is not using LJPME");
+    if (useOptimizedPme)
+        // The dispersion kernel is created as a CalcDispersionPmeReciprocalForceKernel, so it must be accessed as that type.
+        optimizedDispersionPme.getAs<const CalcDispersionPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+    else {
+        alpha = ewaldDispersionAlpha;
+        nx = dispersionGridSize[0];
+        ny = dispersionGridSize[1];
+        nz = dispersionGridSize[2];
+    }
+}
 CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) :
            CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) {
 }

--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp
-/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -29,36 +29,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
-public:
-    Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate1(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
-public:
-    Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate2(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-class CpuLangevinDynamics::Update3Task : public ThreadPool::Task {
-public:
-    Update3Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate3(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
 CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, ThreadPool& threads, CpuRandom& random) : 
           ReferenceStochasticDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
 }
@@ -79,8 +49,7 @@ void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<Vec3>& atomCoord
    // Signal the threads to start running and wait for them to finish.
-    Update1Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate1(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }
@@ -97,8 +66,7 @@ void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<Vec3>& atomCoord
    // Signal the threads to start running and wait for them to finish.
-    Update2Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate2(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }
@@ -114,8 +82,7 @@ void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<Vec3>& atomCoord
    // Signal the threads to start running and wait for them to finish.
-    Update3Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate3(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -409,16 +409,6 @@ private:
    vector<vector<vector<pair<float, int> > > > bins;
 };
-class CpuNeighborList::ThreadTask : public ThreadPool::Task {
-public:
-    ThreadTask(CpuNeighborList& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeNeighborList(threads, threadIndex);
-    }
-    CpuNeighborList& owner;
-};
 CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }
@@ -460,8 +450,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    // Sort the atoms based on a Hilbert curve.
    atomBins.resize(numAtoms);
-    ThreadTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeNeighborList(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -30,6 +30,7 @@
 #include "ReferencePME.h"
 #include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
+#include <iostream>
 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -41,23 +42,14 @@ using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
 const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
-class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
-public:
-    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeDirect(threads, threadIndex);
-    }
-    CpuNonbondedForce& owner;
-};
 /**---------------------------------------------------------------------------------------
   CpuNonbondedForce constructor
   --------------------------------------------------------------------------------------- */
-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), ljpme(false), tableIsValid(false), expTableIsValid(false),
+    cutoffDistance(0.0f), alphaDispersionEwald(0.0f), alphaEwald(0.0f) { // Every option starts disabled, both table flags start invalid, and the cutoff and both alphas start at zero.
 }
 CpuNonbondedForce::~CpuNonbondedForce() {
@@ -78,10 +70,21 @@ void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neig
        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
+    inverseRcut6 = pow(cutoffDistance, -6);
    neighborList = &neighbors;
    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
-  }
+    if(alphaDispersionEwald != 0.0f){
+        // We set this here, in case setUseCutoff is called after the dispersion alpha is set.
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+    }
+}
 /**---------------------------------------------------------------------------------------
@@ -96,7 +99,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    switchingDistance = distance;
 }
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
     Set the force to use periodic boundary conditions.  This requires that a cutoff has
     also been set, and the smallest side of the periodic box is at least twice the cutoff
@@ -106,7 +109,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setPeriodic(Vec3* periodicBoxVectors) {
+void CpuNonbondedForce::setPeriodic(Vec3* periodicBoxVectors) {
    assert(cutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
@@ -124,11 +127,11 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    periodicBoxVec4[1] = fvec4(periodicBoxVectors[1][0], periodicBoxVectors[1][1], periodicBoxVectors[1][2], 0);
    periodicBoxVec4[2] = fvec4(periodicBoxVectors[2][0], periodicBoxVectors[2][1], periodicBoxVectors[2][2], 0);
    triclinic = (periodicBoxVectors[0][1] != 0.0 || periodicBoxVectors[0][2] != 0.0 ||
-                 periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
+            periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
-                 periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
+            periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
-  }
+}
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
     Set the force to use Ewald summation.
@@ -139,18 +142,18 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
-      if (alpha != alphaEwald)
+    if (alpha != alphaEwald)
-          tableIsValid = false;
+        tableIsValid = false; // Mark the cached table stale so tabulateEwaldScaleFactor() rebuilds it below.
-      alphaEwald = alpha;
+    alphaEwald = alpha;
-      numRx = kmaxx;
+    numRx = kmaxx; // Record the kmax value for each of the three dimensions.
-      numRy = kmaxy;
+    numRy = kmaxy;
-      numRz = kmaxz;
+    numRz = kmaxz;
-      ewald = true;
+    ewald = true;
-      tabulateEwaldScaleFactor();
+    tabulateEwaldScaleFactor(); // Returns immediately if the table is still marked valid.
-  }
+}
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
     Set the force to use Particle-Mesh Ewald (PME) summation.
@@ -159,19 +162,49 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
-      if (alpha != alphaEwald)
+    if (alpha != alphaEwald)
-          tableIsValid = false;
+        tableIsValid = false; // Mark the cached table stale so tabulateEwaldScaleFactor() rebuilds it below.
-      alphaEwald = alpha;
+    alphaEwald = alpha;
-      meshDim[0] = meshSize[0];
+    meshDim[0] = meshSize[0]; // Copy the three mesh dimensions supplied by the caller.
-      meshDim[1] = meshSize[1];
+    meshDim[1] = meshSize[1];
-      meshDim[2] = meshSize[2];
+    meshDim[2] = meshSize[2];
-      pme = true;
+    pme = true;
-      tabulateEwaldScaleFactor();
+    tabulateEwaldScaleFactor(); // Returns immediately if the table is still marked valid.
-  }
+}
-  void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+/**---------------------------------------------------------------------------------------
+     Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+     @param alpha     the Ewald separation parameter for the dispersion term
+     @param meshSize  the dimensions of the dispersion mesh
+     --------------------------------------------------------------------------------------- */
+void CpuNonbondedForce::setUseLJPME(float alpha, int meshSize[3]) {
+    // A different alpha makes the cached exponential-term tables stale.
+    if (alpha != alphaDispersionEwald)
+        expTableIsValid = false;
+    alphaDispersionEwald = alpha;
+    dispersionMeshDim[0] = meshSize[0];
+    dispersionMeshDim[1] = meshSize[1];
+    dispersionMeshDim[2] = meshSize[2];
+    ljpme = true;
+    tabulateExpTerms();
+    if(cutoffDistance != 0.0f){
+        // We set this here, in case setUseLJPME is called after the cutoff is set
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+    }
+}
+void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    if (tableIsValid)
        return;
    tableIsValid = true;
@@ -187,10 +220,30 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
        ewaldScaleTable[i] = erfcTable[i] + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
 }
+/**
+ * Build the lookup tables read by exptermsApprox() and dExptermsApprox().
+ *
+ * With x = alphaDispersionEwald*r and r = i*exptermsDX, entry i holds
+ *   exptermsTable[i]  = 1 - exp(-x^2)*(1 + x^2 + x^4/2)
+ *   dExptermsTable[i] = 1 - exp(-x^2)*(1 + x^2 + x^4/2 + x^6/6)
+ *
+ * Does nothing if the tables are already marked valid.
+ */
+void CpuNonbondedForce::tabulateExpTerms() {
+    if (expTableIsValid)
+        return;
+    expTableIsValid = true;
+    exptermsDX = cutoffDistance/NUM_TABLE_POINTS;
+    exptermsDXInv = 1.0f/exptermsDX;
+    exptermsTable.resize(NUM_TABLE_POINTS+4);
+    dExptermsTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        // Sample on this table's own spacing, so the result does not depend on the
+        // Ewald scale table having been built first.
+        double r = i*exptermsDX;
+        double dalphaR = alphaDispersionEwald*r;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        exptermsTable[i]  = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+        dExptermsTable[i] = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4 + dar6/6.0));
+    }
+}
 void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<Vec3>& atomCoordinates,
-                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
+                                               const vector<pair<float, float> >& atomParameters, const vector<float> &C6params, const vector<set<int> >& exclusions,
-                                             vector<Vec3>& forces, double* totalEnergy) const {
+                                               vector<Vec3>& forces, double* totalEnergy) const {
    typedef std::complex<float> d_complex;
    static const float epsilon     =  1.0;
@@ -211,6 +264,29 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        if (totalEnergy)
            *totalEnergy += recipEnergy;
        pme_destroy(pmedata);
+        if (ljpme) {
+            // Dispersion reciprocal space terms
+            pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
+            std::vector<Vec3> dpmeforces;
+            for (int i = 0; i < numberOfAtoms; i++){
+                charges[i] = C6params[i];
+                dpmeforces.push_back(Vec3());
+            }
+            double recipDispersionEnergy = 0.0;
+            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
+            for (int i = 0; i < numberOfAtoms; i++){
+                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i][1] -= 2.0*dpmeforces[i][1];
+                forces[i][2] -= 2.0*dpmeforces[i][2];
+            }
+            if (totalEnergy)
+                *totalEnergy += recipDispersionEnergy;
+            pme_destroy(pmedata);
+        }
    }
    // Ewald method
@@ -224,7 +300,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        // setup K-vectors
-        #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
        vector<d_complex> eir(kmax*numberOfAtoms*3);
        vector<d_complex> tab_xy(numberOfAtoms);
        vector<d_complex> tab_qxyz(numberOfAtoms);
@@ -232,15 +308,15 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        for (int i = 0; (i < numberOfAtoms); i++) {
            float* pos = posq+4*i;
            for (int m = 0; (m < 3); m++)
-              EIR(0, i, m) = d_complex(1,0);
+                EIR(0, i, m) = d_complex(1,0);
            for (int m=0; (m<3); m++)
-              EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
+                EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
-                                       sin(pos[m]*recipBoxSize[m]));
+                                         sin(pos[m]*recipBoxSize[m]));
            for (int j=2; (j<kmax); j++)
-              for (int m=0; (m<3); m++)
+                for (int m=0; (m<3); m++)
-                EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
+                    EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
        }
        // calculate reciprocal space energy and forces
@@ -254,11 +330,11 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
                float ky = ry * recipBoxSize[1];
                if (ry >= 0) {
                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
+                        tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
                }
                else {
                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
+                        tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
                }
                for (int rz = lowrz; rz < numRz; rz++) {
                    if (rz >= 0) {
@@ -301,13 +377,14 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<Vec3>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                                           const vector<float>& C6params, const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
    this->posq = posq;
    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
+    this->C6params = &C6params[0];
    this->exclusions = &exclusions[0];
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
@@ -318,8 +395,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    // Signal the threads to start running and wait for them to finish.
-    ComputeDirectTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeDirect(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Signal the threads to subtract the exclusions.
@@ -350,9 +426,8 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    float* forces = &(*threadForce)[threadIndex][0];
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (ewald || pme) {
+    if (ewald || pme || ljpme) {
        // Compute the interactions from the neighbor list.
        while (true) {
            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
            if (nextBlock >= neighborList->getNumBlocks())
@@ -370,7 +445,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                break;
            int end = min(start+groupSize, numberOfAtoms);
            for (int i = start; i < end; i++) {
-               fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
+                fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
                float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
                for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
                    if (*iter > i) {
@@ -394,7 +469,18 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                                threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
                        }
                        else if (includeEnergy)
-                           threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                            threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                        if (ljpme) {
+                            float C6ij = C6params[i]*C6params[j];
+                            float inverseR2 = 1.0f/r2;
+                            float emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                            if(includeEnergy)
+                                threadEnergy[threadIndex] += emult;
+                            float dEdR = -6.0f*C6ij*inverseR2*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                            fvec4 result = deltaR*dEdR;
+                            (fvec4(forces+4*i)-result).store(forces+4*i);
+                            (fvec4(forces+4*j)+result).store(forces+4*j);
+                        }
                    }
                }
            }
@@ -444,7 +530,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    }
    float sig       = atomParameters[ii].first + atomParameters[jj].first;
    float sig2      = inverseR*sig;
-          sig2     *= sig2;
+    sig2     *= sig2;
    float sig6      = sig2*sig2*sig2;
    float eps       = atomParameters[ii].second*atomParameters[jj].second;
@@ -476,7 +562,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    fvec4 result = deltaR*dEdR;
    (fvec4(forces+4*ii)+result).store(forces+4*ii);
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
-  }
+}
 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
@@ -502,3 +588,18 @@ float CpuNonbondedForce::erfcApprox(float x) {
    return coeff1*erfcTable[index] + coeff2*erfcTable[index+1];
 }
+/**
+ * Look up a value in exptermsTable by linear interpolation between the two
+ * entries that bracket x. The lower index is capped at NUM_TABLE_POINTS.
+ *
+ * @param x  the position to evaluate, in the same units as the table spacing
+ * @return the interpolated table value
+ */
+float CpuNonbondedForce::exptermsApprox(float x) {
+    const float scaled = x*exptermsDXInv;
+    const int lower = min((int) floor(scaled), NUM_TABLE_POINTS);
+    const float upperWeight = scaled-lower;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*exptermsTable[lower] + upperWeight*exptermsTable[lower+1];
+}
+/**
+ * Look up a value in dExptermsTable by linear interpolation between the two
+ * entries that bracket x. The lower index is capped at NUM_TABLE_POINTS.
+ *
+ * @param x  the position to evaluate, in the same units as the table spacing
+ * @return the interpolated table value
+ */
+float CpuNonbondedForce::dExptermsApprox(float x) {
+    const float scaled = x*exptermsDXInv;
+    const int lower = min((int) floor(scaled), NUM_TABLE_POINTS);
+    const float upperWeight = scaled-lower;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*dExptermsTable[lower] + upperWeight*dExptermsTable[lower+1];
+}