Merged changes from main branch

cd874b2b · peastman · a783b996 · b84e22ba · cd874b2b · cd874b2b
Commit cd874b2b authored Feb 22, 2017 by peastman
20 changed files
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -25,6 +25,7 @@
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec4.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -213,7 +214,6 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,

 void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.
-    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -263,7 +263,6 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
 template <int PERIODIC_TYPE>
 void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
-    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
@@ -278,6 +277,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);

@@ -318,7 +318,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec4 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec4 eps = blockAtomEpsilon*atomEpsilon;
+            fvec4 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -328,6 +329,17 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+
+            if (ljpme) {
+                fvec4 C6ij = C6s*C6params[atom];
+                fvec4 inverseR2 = inverseR*inverseR;
+                fvec4 mysig2 = sig*sig;
+                fvec4 mysig6 = mysig2*mysig2*mysig2;
+                fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
        }
        else {
            energy = 0.0f;
@@ -420,3 +432,30 @@ fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
    transpose(t1, t2, t3, t4);
    return coeff1*t1 + coeff2*t2;
 }
+
+fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
+    fvec4 r1 = r*exptermsDXInv;
+    ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
+    fvec4 coeff2 = r1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&exptermsTable[index[0]]);
+    fvec4 t2(&exptermsTable[index[1]]);
+    fvec4 t3(&exptermsTable[index[2]]);
+    fvec4 t4(&exptermsTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
+}
+
+fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
+    fvec4 r1 = r*exptermsDXInv;
+    ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
+    fvec4 coeff2 = r1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&dExptermsTable[index[0]]);
+    fvec4 t2(&dExptermsTable[index[1]]);
+    fvec4 t3(&dExptermsTable[index[2]]);
+    fvec4 t4(&dExptermsTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
+}
+
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -27,6 +27,7 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/hardware.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -81,7 +82,6 @@ enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, Periodic

 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.    
-    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -308,6 +308,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    fvec8 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]], C6params[blockAtom[4]], C6params[blockAtom[5]], C6params[blockAtom[6]], C6params[blockAtom[7]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
@@ -348,7 +349,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec8 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec8 sig6 = sig2*sig2*sig2;
-            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec8 eps = blockAtomEpsilon*atomEpsilon;
+            fvec8 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -358,6 +360,17 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+            if (ljpme) {
+                fvec8 C6ij = C6s*C6params[atom];
+                fvec8 inverseR2 = inverseR*inverseR;
+                fvec8 mysig2 = sig*sig;
+                fvec8 mysig6 = mysig2*mysig2*mysig2;
+                fvec8 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec8 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
+
        }
        else {
            energy = 0.0f;
@@ -464,4 +477,45 @@ fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
    return coeff1*s1 + coeff2*s2;
 }
+
+fvec8 CpuNonbondedForceVec8::exptermsApprox(const fvec8& r) {
+    fvec8 r1 = r*exptermsDXInv;
+    ivec8 index = min(floor(r1), NUM_TABLE_POINTS);
+    fvec8 coeff2 = r1-index;
+    fvec8 coeff1 = 1.0f-coeff2;
+    ivec4 indexLower = index.lowerVec();
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&exptermsTable[indexLower[0]]);
+    fvec4 t2(&exptermsTable[indexLower[1]]);
+    fvec4 t3(&exptermsTable[indexLower[2]]);
+    fvec4 t4(&exptermsTable[indexLower[3]]);
+    fvec4 t5(&exptermsTable[indexUpper[0]]);
+    fvec4 t6(&exptermsTable[indexUpper[1]]);
+    fvec4 t7(&exptermsTable[indexUpper[2]]);
+    fvec4 t8(&exptermsTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
+    return coeff1*s1 + coeff2*s2;
+}
+
+fvec8 CpuNonbondedForceVec8::dExptermsApprox(const fvec8& r) {
+    fvec8 r1 = r*exptermsDXInv;
+    ivec8 index = min(floor(r1), NUM_TABLE_POINTS);
+    fvec8 coeff2 = r1-index;
+    fvec8 coeff1 = 1.0f-coeff2;
+    ivec4 indexLower = index.lowerVec();
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&dExptermsTable[indexLower[0]]);
+    fvec4 t2(&dExptermsTable[indexLower[1]]);
+    fvec4 t3(&dExptermsTable[indexLower[2]]);
+    fvec4 t4(&dExptermsTable[indexLower[3]]);
+    fvec4 t5(&dExptermsTable[indexUpper[0]]);
+    fvec4 t6(&dExptermsTable[indexUpper[1]]);
+    fvec4 t7(&dExptermsTable[indexUpper[2]]);
+    fvec4 t8(&dExptermsTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
+    return coeff1*s1 + coeff2*s2;
+}
+
 #endif
--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,52 +35,6 @@
 using namespace OpenMM;
 using namespace std;

-class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
-public:
-    ApplyToPositionsTask(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& atomCoordinatesP, vector<double>& inverseMasses,
-            double tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::Vec3>& atomCoordinates;
-    vector<OpenMM::Vec3>& atomCoordinatesP;
-    vector<double>& inverseMasses;
-    double tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-
-class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
-public:
-    ApplyToVelocitiesTask(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& velocities, vector<double>& inverseMasses,
-            double tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::Vec3>& atomCoordinates;
-    vector<OpenMM::Vec3>& velocities;
-    vector<double>& inverseMasses;
-    double tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-
 CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
    int numBlocks = 10*threads.getNumThreads();
    int numClusters = settle.getNumClusters();
@@ -107,13 +61,29 @@ CpuSETTLE::~CpuSETTLE() {
 }

 void CpuSETTLE::apply(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& atomCoordinatesP, vector<double>& inverseMasses, double tolerance) {
-    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
-    threads.execute(task);
+    gmx_atomic_t atomicCounter;
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }

 void CpuSETTLE::applyToVelocities(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& velocities, vector<double>& inverseMasses, double tolerance) {
-    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
-    threads.execute(task);
+    gmx_atomic_t atomicCounter;
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }
--- a/platforms/cpu/staticTarget/CMakeLists.txt
+++ b/platforms/cpu/staticTarget/CMakeLists.txt
@@ -16,7 +16,6 @@ ENDFOREACH(file)
 ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}_static ${PTHREADS_LIB_STATIC})
-#-DPTW32_STATIC_LIB only works for the windows pthreads.
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_STATIC_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/cpu/tests/TestCpuDispersionPME.cpp
+++ b/platforms/cpu/tests/TestCpuDispersionPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CpuTests.h"
+#include "TestDispersionPME.h"
+
+void runPlatformTests() {
+}
+
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -494,6 +494,10 @@ public:
    CudaNonbondedUtilities& getNonbondedUtilities() {
        return *nonbonded;
    }
+    /**
+     * Set the particle charges.  These are packed into the fourth element of the posq array.
+     */
+    void setCharges(const std::vector<double>& charges);
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -577,6 +581,12 @@ public:
     * and order to be revalidated.
     */
    void invalidateMolecules();
+    /**
+     * Mark that the current molecule definitions from one particular force (and hence the atom order)
+     * may be invalid.  This should be called whenever force field parameters change.  It will cause the
+     * definitions and order to be revalidated.
+     */
+    bool invalidateMolecules(CudaForceInfo* force);
 private:
    /**
     * Compute a sorted list of device indices in decreasing order of desirability
@@ -626,6 +636,7 @@ private:
    CUfunction clearFourBuffersKernel;
    CUfunction clearFiveBuffersKernel;
    CUfunction clearSixBuffersKernel;
+    CUfunction setChargesKernel;
    std::vector<CudaForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -638,6 +649,7 @@ private:
    CudaArray* energyBuffer;
    CudaArray* energyParamDerivBuffer;
    CudaArray* atomIndexDevice;
+    CudaArray* chargeBuffer;
    std::vector<std::string> energyParamDerivNames;
    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<int> atomIndex;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -198,7 +198,6 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    class GetPositionsTask;
    CudaContext& cu;
 };

@@ -292,9 +291,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -332,9 +333,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -375,9 +378,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -415,9 +420,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -458,9 +465,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -498,9 +507,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params1;
    CudaArray* params2;
@@ -539,9 +550,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    std::vector<int2> mapPositionsVec;
    CudaArray* coefficients;
@@ -582,9 +595,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -599,7 +614,8 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeDispersionBsplineModuliX(NULL), pmeDispersionBsplineModuliY(NULL),
+            pmeDispersionBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -636,6 +652,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the dispersion parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class SortTrait : public CudaSort::SortTrait {
        int getDataSize() const {return 8;}
@@ -647,12 +672,14 @@ private:
        const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class ForceInfo;
    class PmeIO;
    class PmePreComputation;
    class PmePostComputation;
    class SyncStreamPreComputation;
    class SyncStreamPostComputation;
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
    CudaArray* exceptionParams;
@@ -662,6 +689,9 @@ private:
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
+    CudaArray* pmeDispersionBsplineModuliX;
+    CudaArray* pmeDispersionBsplineModuliY;
+    CudaArray* pmeDispersionBsplineModuliZ;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaArray* pmeEnergyBuffer;
@@ -673,20 +703,29 @@ private:
    CudaFFT3D* fft;
    cufftHandle fftForward;
    cufftHandle fftBackward;
+    CudaFFT3D* dispersionFft;
+    cufftHandle dispersionFftForward;
+    cufftHandle dispersionFftBackward;
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
+    CUfunction pmeDispersionGridIndexKernel;
    CUfunction pmeSpreadChargeKernel;
+    CUfunction pmeDispersionSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
+    CUfunction pmeDispersionFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
+    CUfunction pmeEvalDispersionEnergyKernel;
    CUfunction pmeConvolutionKernel;
+    CUfunction pmeDispersionConvolutionKernel;
    CUfunction pmeInterpolateForceKernel;
-    std::map<std::string, std::string> pmeDefines;
+    CUfunction pmeInterpolateDispersionForceKernel;
    std::vector<std::pair<int, int> > exceptionAtoms;
-    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int interpolateForceThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };
@@ -724,8 +763,10 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    class ForceInfo;
    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* interactionGroupData;
@@ -775,10 +816,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
+    class ForceInfo;
    double prefactor, surfaceAreaFactor, cutoff;
    bool hasCreatedKernels;
    int maxTiles;
    CudaContext& cu;
+    ForceInfo* info;
    CudaArray* params;
    CudaArray* bornSum;
    CudaArray* bornRadii;
@@ -825,10 +868,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
+    class ForceInfo;
    double cutoff;
    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
    int maxTiles, numComputedValues;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaParameterSet* computedValues;
    CudaParameterSet* energyDerivs;
@@ -882,9 +927,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
 private:
+    class ForceInfo;
    int numParticles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -926,9 +973,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
 private:
+    class ForceInfo;
    int numDonors, numAcceptors;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* donorParams;
    CudaParameterSet* acceptorParams;
    CudaArray* globals;
@@ -978,9 +1027,11 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCentroidBondForce& force);

 private:
+    class ForceInfo;
    int numGroups, numBonds;
    bool needEnergyParamDerivs;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* groupParticles;
@@ -1031,8 +1082,10 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);

 private:
+    class ForceInfo;
    int numBonds;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    std::vector<std::string> globalParamNames;
@@ -1077,7 +1130,9 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomManyParticleForce& force);

 private:
+    class ForceInfo;
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedKernel;
    NonbondedMethod nonbondedMethod;
    int maxNeighborPairs, forceWorkgroupSize, findNeighborsWorkgroupSize;
@@ -1139,9 +1194,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
 private:
+    class ForceInfo;
    class ReorderListener;
    void sortAtoms();
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedKernels;
    int numRealParticles, numExceptions, maxNeighborBlocks;
    GayBerneForce::NonbondedMethod nonbondedMethod;

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -439,6 +439,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the dispersion parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class Task;
    CudaPlatform::PlatformData& data;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -108,7 +108,8 @@ static int executeInWindows(const string &command) {
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
-        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
+        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    // Determine what compiler to use.
    
    this->compiler = "\""+compiler+"\"";
@@ -291,6 +292,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    setChargesKernel = getKernel(utilities, "setCharges");

    // Set defines based on the requested precision.

@@ -407,6 +409,8 @@ CudaContext::~CudaContext() {
        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
+    if (chargeBuffer != NULL)
+        delete chargeBuffer;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
@@ -860,6 +864,25 @@ void CudaContext::clearAutoclearBuffers() {
    }
 }

+void CudaContext::setCharges(const vector<double>& charges) {
+    if (chargeBuffer == NULL)
+        chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
+    if (getUseDoublePrecision()) {
+        double* c = (double*) getPinnedBuffer();
+        for (int i = 0; i < charges.size(); i++)
+            c[i] = charges[i];
+        chargeBuffer->upload(c);
+    }
+    else {
+        float* c = (float*) getPinnedBuffer();
+        for (int i = 0; i < charges.size(); i++)
+            c[i] = (float) charges[i];
+        chargeBuffer->upload(c);
+    }
+    void* args[] = {&chargeBuffer->getDevicePointer(), &posq->getDevicePointer(), &atomIndexDevice->getDevicePointer(), &numAtoms};
+    executeKernel(setChargesKernel, args, numAtoms);
+}
+
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
@@ -1058,9 +1081,19 @@ void CudaContext::findMoleculeGroups() {
 }

 void CudaContext::invalidateMolecules() {
-    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
+    for (int i = 0; i < forces.size(); i++)
+        if (invalidateMolecules(forces[i]))
            return;
+}
+
+bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
+    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
+        return false;
    bool valid = true;
+    int forceIndex = -1;
+    for (int i = 0; i < forces.size(); i++)
+        if (forces[i] == force)
+            forceIndex = i;
    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
        MoleculeGroup& mol = moleculeGroups[group];
        vector<int>& instances = mol.instances;
@@ -1075,22 +1108,21 @@ void CudaContext::invalidateMolecules() {
            Molecule& m2 = molecules[instances[j]];
            int offset2 = offsets[j];
            for (int i = 0; i < (int) atoms.size() && valid; i++) {
-                for (int k = 0; k < (int) forces.size(); k++)
-                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+                if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
                    valid = false;
            }

            // See if the force groups are identical.

-            for (int i = 0; i < (int) forces.size() && valid; i++) {
-                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
-                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
+            if (valid && forceIndex > -1) {
+                for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
+                    if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
                        valid = false;
            }
        }
    }
    if (valid)
-        return;
+        return false;

    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
@@ -1158,6 +1190,7 @@ void CudaContext::invalidateMolecules() {
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
    reorderAtoms();
+    return true;
 }

 void CudaContext::reorderAtoms() {

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -628,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
 }

+void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
+}
+
 class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
 public:
    Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
        deviceName << name;
    }
+    size_t printfsize;
+    cuCtxGetLimit(&printfsize, CU_LIMIT_PRINTF_FIFO_SIZE);
+    cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 10*printfsize);
+
    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    disablePmeStream = (pmeStreamProperty == "true");
    deterministicForces = (deterministicForcesProperty == "true");

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
@@ -17,6 +17,26 @@
    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
 #endif
    real tempForce = 0.0f;
+#if HAS_LENNARD_JONES
+    // The multiplicative term to correct for the multiplicative terms that are always
+    // present in reciprocal space.  The real terms have an additive contribution
+    // added in, but for excluded terms the multiplicative term is just subtracted.
+    // These factors are needed in both clauses of the needCorrection statement, so
+    // I declare them up here.
+    #if DO_LJPME
+        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+        const real dar2 = dispersionAlphaR*dispersionAlphaR;
+        const real dar4 = dar2*dar2;
+        const real dar6 = dar4*dar2;
+        const real invR2 = invR*invR;
+        const real expDar2 = EXP(-dar2);
+        const float2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
+        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+        const real coef = invR2*invR2*invR2*c6;
+        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+        const real dprefac = eprefac + dar6/6.0f;
+    #endif
+#endif
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

@@ -29,6 +49,13 @@
            includeInteraction = false;
            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
        }
+#if HAS_LENNARD_JONES
+        #if DO_LJPME
+            // The multiplicative grid term
+            tempEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        #endif
+#endif
    }
    else {
 #if HAS_LENNARD_JONES
@@ -36,7 +63,8 @@
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
        #if USE_LJ_SWITCH
@@ -48,6 +76,22 @@
            ljEnergy *= switchValue;
        }
        #endif
+#if DO_LJPME
+        // The multiplicative grid term
+        ljEnergy += coef*(1.0f - expDar2*eprefac);
+        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        // The potential shift accounts for the step at the cutoff introduced by the
+        // transition from additive to multiplicative combintion rules and is only
+        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+        sig2 = sig*sig;
+        sig6 = sig2*sig2*sig2*INVCUT6;
+        epssig6 = eps*sig6;
+        // The additive part of the potential shift
+        ljEnergy += epssig6*(1.0f - sig6);
+        // The multiplicative part of the potential shift
+        ljEnergy += MULTSHIFT6*c6;
+#endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
 #else

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int

 extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
+#ifdef USE_LJPME
+        , const float2* __restrict__ sigmaEpsilon
+#endif
+        ) {
    real3 data[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
    
@@ -63,6 +67,12 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
        
        // Spread the charge from this atom onto each grid point.
        
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = ybase + zindex;

-                    real add = pos.w*dx*dy*data[iz].z;
+                    real add = charge*dx*dy*data[iz].z;
 #ifdef USE_DOUBLE_PRECISION
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+#ifdef USE_LJPME
+    const real recipScaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+#endif

    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = halfcomplex_pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = ERFC(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+        halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
-
        if (kx != 0 || ky != 0 || kz != 0) {
            halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
        }
+#endif
    }
 }

@@ -156,7 +185,15 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+ #ifdef USE_LJPME
+    const real recipScaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+    real bfac = M_PI / EWALD_ALPHA;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
+    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
+#else
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+#endif

    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
+#ifdef USE_LJPME
+        real denom = recipScaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+        real erfcterm = ERFC(b);
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+#else
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+#endif

        if (kz >= (GRID_SIZE_Z/2+1)) {
            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
@@ -185,11 +233,12 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
        } 
        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
-        if (kx != 0 || ky != 0 || kz != 0) {
+#ifndef USE_LJPME
+        if (kx != 0 || ky != 0 || kz != 0)
+#endif
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
-    }
-#ifdef USE_PME_STREAM
+#if defined(USE_PME_STREAM) && !defined(USE_LJPME)
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
 #else
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
 extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
+#ifdef USE_LJPME
+        , const float2* __restrict__ sigmaEpsilon
+#endif
+        ) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
                }
            }
        }
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
        real q = pos.w*EPSILON_FACTOR;
+#endif
        real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);

--- a/platforms/cuda/src/kernels/utilities.cu
+++ b/platforms/cuda/src/kernels/utilities.cu
@@ -73,4 +73,11 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
    clearSingleBuffer(buffer6, size6);
 }

+/**
+ * Record the atomic charges into the posq array.
+ */
+__global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq, int* __restrict__ atomOrder, int numAtoms) {
+    for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x)
+        posq[i].w = charges[atomOrder[i]];
+}
 }
\ No newline at end of file
--- a/platforms/cuda/staticTarget/CMakeLists.txt
+++ b/platforms/cuda/staticTarget/CMakeLists.txt
@@ -14,8 +14,7 @@ SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GEN
 ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB_STATIC})
-#-DPTW32_STATIC_LIB only works for the windows pthreads.
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDA_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDA_BUILDING_STATIC_LIBRARY")
 IF (APPLE)
    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
 ELSE (APPLE)

--- a/platforms/cuda/tests/TestCudaDispersionPME.cpp
+++ b/platforms/cuda/tests/TestCudaDispersionPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestDispersionPME.h"
+
+void runPlatformTests() {
+}
--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -609,6 +609,10 @@ public:
    OpenCLNonbondedUtilities& getNonbondedUtilities() {
        return *nonbonded;
    }
+    /**
+     * Set the particle charges.  These are packed into the fourth element of the posq array.
+     */
+    void setCharges(const std::vector<double>& charges);
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -692,6 +696,12 @@ public:
     * and order to be revalidated.
     */
    void invalidateMolecules();
+    /**
+     * Mark that the current molecule definitions from one particular force (and hence the atom order)
+     * may be invalid.  This should be called whenever force field parameters change.  It will cause the
+     * definitions and order to be revalidated.
+     */
+    bool invalidateMolecules(OpenCLForceInfo* force);
 private:
    struct Molecule;
    struct MoleculeGroup;
@@ -739,6 +749,7 @@ private:
    cl::Kernel clearSixBuffersKernel;
    cl::Kernel reduceReal4Kernel;
    cl::Kernel reduceForcesKernel;
+    cl::Kernel setChargesKernel;
    std::vector<OpenCLForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -754,6 +765,7 @@ private:
    OpenCLArray* energyBuffer;
    OpenCLArray* energyParamDerivBuffer;
    OpenCLArray* atomIndexDevice;
+    OpenCLArray* chargeBuffer;
    std::vector<std::string> energyParamDerivNames;
    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<int> atomIndex;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -176,7 +176,6 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    class GetPositionsTask;
    OpenCLContext& cl;
 };

@@ -270,9 +269,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -310,9 +311,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -353,9 +356,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -393,9 +398,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -436,9 +443,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -476,9 +485,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -516,9 +527,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    std::vector<mm_int2> mapPositionsVec;
    OpenCLArray* coefficients;
@@ -559,9 +572,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -576,8 +591,9 @@ class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform),
            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
-            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeDispersionBsplineModuliX(NULL),
+            pmeDispersionBsplineModuliY(NULL), pmeDispersionBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeAtomRange(NULL),
+            pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), dispersionFft(NULL), pmeio(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
    /**
@@ -614,6 +630,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class SortTrait : public OpenCLSort::SortTrait {
        int getDataSize() const {return 8;}
@@ -625,12 +650,14 @@ private:
        const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class ForceInfo;
    class PmeIO;
    class PmePreComputation;
    class PmePostComputation;
    class SyncQueuePreComputation;
    class SyncQueuePostComputation;
    OpenCLContext& cl;
+    ForceInfo* info;
    bool hasInitializedKernel;
    OpenCLArray* sigmaEpsilon;
    OpenCLArray* exceptionParams;
@@ -640,6 +667,9 @@ private:
    OpenCLArray* pmeBsplineModuliX;
    OpenCLArray* pmeBsplineModuliY;
    OpenCLArray* pmeBsplineModuliZ;
+    OpenCLArray* pmeDispersionBsplineModuliX;
+    OpenCLArray* pmeDispersionBsplineModuliY;
+    OpenCLArray* pmeDispersionBsplineModuliZ;
    OpenCLArray* pmeBsplineTheta;
    OpenCLArray* pmeAtomRange;
    OpenCLArray* pmeAtomGridIndex;
@@ -648,25 +678,34 @@ private:
    cl::CommandQueue pmeQueue;
    cl::Event pmeSyncEvent;
    OpenCLFFT3D* fft;
+    OpenCLFFT3D* dispersionFft;
    Kernel cpuPme;
    PmeIO* pmeio;
    SyncQueuePostComputation* syncQueue;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
-    cl::Kernel pmeGridIndexKernel;
    cl::Kernel pmeAtomRangeKernel;
+    cl::Kernel pmeDispersionAtomRangeKernel;
    cl::Kernel pmeZIndexKernel;
+    cl::Kernel pmeDispersionZIndexKernel;
    cl::Kernel pmeUpdateBsplinesKernel;
+    cl::Kernel pmeDispersionUpdateBsplinesKernel;
    cl::Kernel pmeSpreadChargeKernel;
+    cl::Kernel pmeDispersionSpreadChargeKernel;
    cl::Kernel pmeFinishSpreadChargeKernel;
+    cl::Kernel pmeDispersionFinishSpreadChargeKernel;
    cl::Kernel pmeConvolutionKernel;
+    cl::Kernel pmeDispersionConvolutionKernel;
    cl::Kernel pmeEvalEnergyKernel;
+    cl::Kernel pmeDispersionEvalEnergyKernel;
    cl::Kernel pmeInterpolateForceKernel;
+    cl::Kernel pmeDispersionInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
-    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int gridSizeX, gridSizeY, gridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeQueue;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    bool hasCoulomb, hasLJ, usePmeQueue, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };
@@ -704,8 +743,10 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    class ForceInfo;
    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
    OpenCLArray* interactionGroupData;
@@ -756,10 +797,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
+    class ForceInfo;
    double prefactor, surfaceAreaFactor, cutoff;
    bool hasCreatedKernels;
    int maxTiles;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLArray* params;
    OpenCLArray* bornSum;
    OpenCLArray* longBornSum;
@@ -807,10 +850,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
+    class ForceInfo;
    double cutoff;
    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
    int maxTiles, numComputedValues;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLParameterSet* computedValues;
    OpenCLParameterSet* energyDerivs;
@@ -864,9 +909,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
 private:
+    class ForceInfo;
    int numParticles;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -908,9 +955,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
 private:
+    class ForceInfo;
    int numDonors, numAcceptors;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* donorParams;
    OpenCLParameterSet* acceptorParams;
    OpenCLArray* globals;
@@ -961,9 +1010,11 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCentroidBondForce& force);

 private:
+    class ForceInfo;
    int numGroups, numBonds;
    bool needEnergyParamDerivs;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
    OpenCLArray* groupParticles;
@@ -1013,8 +1064,10 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);

 private:
+    class ForceInfo;
    int numBonds;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
    std::vector<std::string> globalParamNames;
@@ -1059,7 +1112,9 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomManyParticleForce& force);

 private:
+    class ForceInfo;
    OpenCLContext& cl;
+    ForceInfo* info;
    bool hasInitializedKernel;
    NonbondedMethod nonbondedMethod;
    int maxNeighborPairs, forceWorkgroupSize, findNeighborsWorkgroupSize;
@@ -1119,9 +1174,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
 private:
+    class ForceInfo;
    class ReorderListener;
    void sortAtoms();
    OpenCLContext& cl;
+    ForceInfo* info;
    bool hasInitializedKernels;
    int numRealParticles, maxNeighborBlocks;
    GayBerneForce::NonbondedMethod nonbondedMethod;

--- a/platforms/opencl/include/OpenCLParallelKernels.h
+++ b/platforms/opencl/include/OpenCLParallelKernels.h
@@ -438,6 +438,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class Task;
    OpenCLPlatform::PlatformData& data;