Merge branch 'master' of github.com:pandegroup/openmm into dpme

1945dd6c · Andy Simmonett · 203e5407 · 9963e51a · 1945dd6c · 1945dd6c
Commit 1945dd6c authored Feb 20, 2017 by Andy Simmonett
20 changed files
--- a/platforms/cpu/src/CpuCustomGBForce.cpp
+++ b/platforms/cpu/src/CpuCustomGBForce.cpp
-/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -33,16 +33,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomGBForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomGBForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomGBForce& owner;
-};
 CpuCustomGBForce::ThreadData::ThreadData(int numAtoms, int numThreads, int threadIndex,
                      const vector<Lepton::CompiledExpression>& valueExpressions,
                      const vector<vector<Lepton::CompiledExpression> >& valueDerivExpressions,
@@ -206,7 +196,7 @@ void CpuCustomGBForce::calculateIxn(int numberOfAtoms, float* posq, RealOpenMM**
    // Calculate the first computed value.
-    ComputeForceTask task(*this);
+    auto task = [&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); };
    gmx_atomic_set(&counter, 0);
    threads.execute(task);
    threads.waitForThreads();

--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
-/* Portions copyright (c) 2009-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -37,16 +37,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomManyParticleForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomManyParticleForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomManyParticleForce& owner;
-};
 CpuCustomManyParticleForce::CpuCustomManyParticleForce(const CustomManyParticleForce& force, ThreadPool& threads) :
            threads(threads), useCutoff(false), usePeriodic(false), neighborList(NULL) {
    numParticles = force.getNumParticles();
@@ -141,8 +131,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, RealOpe
    // Signal the threads to start running and wait for them to finish.
-    ComputeForceTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Combine the energies from all the threads.

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
-/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -33,16 +33,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomNonbondedForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomNonbondedForce& owner;
-};
 CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression,
            const vector<string>& parameterNames, const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions) :
            energyExpression(energyExpression), forceExpression(forceExpression), energyParamDerivExpressions(energyParamDerivExpressions) {
@@ -150,8 +140,7 @@ void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, v
    // Signal the threads to start running and wait for them to finish.
-    ComputeForceTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Combine the energies from all the threads.

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
-/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -36,16 +36,6 @@ const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
 const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
 const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
-class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuGBSAOBCForce& owner;
-};
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
@@ -110,9 +100,8 @@ void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<Align
    // Signal the threads to start running and wait for them to finish.
-    ComputeTask task(*this);
    gmx_atomic_set(&counter, 0);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
    threads.waitForThreads(); // Compute Born radii
    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();

--- a/platforms/cpu/src/CpuGayBerneForce.cpp
+++ b/platforms/cpu/src/CpuGayBerneForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Portions copyright (c) 2016-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -44,17 +44,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuGayBerneForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGayBerneForce& owner, CpuNeighborList* neighborList) : owner(owner), neighborList(neighborList) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex, neighborList);
-    }
-    CpuGayBerneForce& owner;
-    CpuNeighborList* neighborList;
-};
 CpuGayBerneForce::CpuGayBerneForce(const GayBerneForce& force) {
    // Record the force parameters.
@@ -137,8 +126,7 @@ RealOpenMM CpuGayBerneForce::calculateForce(const vector<RealVec>& positions, st
    // Signal the threads to compute the pairwise interactions.
-    ComputeTask task(*this, data.neighborList);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex, data.neighborList); });
-    threads.execute(task);
    threads.waitForThreads();
    // Signal the threads to compute exceptions.

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -138,35 +138,27 @@ static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>&
    return 0.5*energy;
 }
-class CpuCalcForcesAndEnergyKernel::SumForceTask : public ThreadPool::Task {
+CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
-public:
+        CalcForcesAndEnergyKernel(name, platform), data(data) {
-    SumForceTask(int numParticles, vector<RealVec>& forceData, CpuPlatform::PlatformData& data) : numParticles(numParticles), forceData(forceData), data(data) {
+    // Create a Reference platform version of this kernel.
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
+    ReferenceKernelFactory referenceFactory;
-        // Sum the contributions to forces that have been calculated by different threads.
+    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
+}
-        int numThreads = threads.getNumThreads();
-        int start = threadIndex*numParticles/numThreads;
-        int end = (threadIndex+1)*numParticles/numThreads;
-        for (int i = start; i < end; i++) {
-            fvec4 f(0.0f);
-            for (int j = 0; j < numThreads; j++)
-                f += fvec4(&data.threadForce[j][4*i]);
-            forceData[i][0] += f[0];
-            forceData[i][1] += f[1];
-            forceData[i][2] += f[2];
-        }
-    }
-    int numParticles;
-    vector<RealVec>& forceData;
-    CpuPlatform::PlatformData& data;
-};
-class CpuCalcForcesAndEnergyKernel::InitForceTask : public ThreadPool::Task {
+void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
-public:
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
-    InitForceTask(int numParticles, ContextImpl& context, CpuPlatform::PlatformData& data) : numParticles(numParticles), positionsValid(true), context(context), data(data) {
+    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
-    }
+}
-    void execute(ThreadPool& threads, int threadIndex) {
+void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
+    // Convert positions to single precision and clear the forces.
+    int numParticles = context.getSystem().getNumParticles();
+    bool positionsValid = true;
+    data.threads.execute([&] (ThreadPool& threads, int threadIndex) {
        // Convert the positions to single precision and apply periodic boundary conditions
        AlignedArray<float>& posq = data.posq;
@@ -219,36 +211,9 @@ public:
        fvec4 zero(0.0f);
        for (int j = 0; j < numParticles; j++)
            zero.store(&data.threadForce[threadIndex][j*4]);
-    }
+    });
-    int numParticles;
-    bool positionsValid;
-    ContextImpl& context;
-    CpuPlatform::PlatformData& data;
-};
-CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
-        CalcForcesAndEnergyKernel(name, platform), data(data) {
-    // Create a Reference platform version of this kernel.
-    ReferenceKernelFactory referenceFactory;
-    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
-}
-void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
-    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
-}
-void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
-    // Convert positions to single precision and clear the forces.
-    int numParticles = context.getSystem().getNumParticles();
-    InitForceTask task(numParticles, context, data);
-    data.threads.execute(task);
    data.threads.waitForThreads();
-    if (!task.positionsValid)
+    if (!positionsValid)
        throw OpenMMException("Particle coordinate is nan");
    // Determine whether we need to recompute the neighbor list.
@@ -303,8 +268,23 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
 double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    // Sum the forces from all the threads.
-    SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
+    data.threads.execute([&] (ThreadPool& threads, int threadIndex) {
-    data.threads.execute(task);
+        // Sum the contributions to forces that have been calculated by different threads.
+        int numParticles = context.getSystem().getNumParticles();
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        vector<RealVec>& forceData = extractForces(context);
+        for (int i = start; i < end; i++) {
+            fvec4 f(0.0f);
+            for (int j = 0; j < numThreads; j++)
+                f += fvec4(&data.threadForce[j][4*i]);
+            forceData[i][0] += f[0];
+            forceData[i][1] += f[1];
+            forceData[i][2] += f[2];
+        }
+    });
    data.threads.waitForThreads();
    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups, valid);
 }

--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp
-/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -29,36 +29,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
-public:
-    Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate1(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
-public:
-    Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate2(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-class CpuLangevinDynamics::Update3Task : public ThreadPool::Task {
-public:
-    Update3Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate3(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
 CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM friction, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
           ReferenceStochasticDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
 }
@@ -79,8 +49,7 @@ void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCo
    // Signal the threads to start running and wait for them to finish.
-    Update1Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate1(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }
@@ -97,8 +66,7 @@ void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCo
    // Signal the threads to start running and wait for them to finish.
-    Update2Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate2(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }
@@ -114,8 +82,7 @@ void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<RealVec>& atomCo
    // Signal the threads to start running and wait for them to finish.
-    Update3Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate3(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -409,16 +409,6 @@ private:
    vector<vector<vector<pair<float, int> > > > bins;
 };
-class CpuNeighborList::ThreadTask : public ThreadPool::Task {
-public:
-    ThreadTask(CpuNeighborList& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeNeighborList(threads, threadIndex);
-    }
-    CpuNeighborList& owner;
-};
 CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }
@@ -460,8 +450,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    // Sort the atoms based on a Hilbert curve.
    atomBins.resize(numAtoms);
-    ThreadTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeNeighborList(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -42,16 +42,6 @@ using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
 const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
-class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
-public:
-    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeDirect(threads, threadIndex);
-    }
-    CpuNonbondedForce& owner;
-};
 /**---------------------------------------------------------------------------------------
   CpuNonbondedForce constructor
@@ -405,8 +395,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    // Signal the threads to start running and wait for them to finish.
-    ComputeDirectTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeDirect(threads, threadIndex); });
-    threads.execute(task); // ACS calls threadcomputedirect
    threads.waitForThreads();
    // Signal the threads to subtract the exclusions.

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,52 +35,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
-public:
-    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& atomCoordinatesP;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
-public:
-    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& velocities;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
 CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
    int numBlocks = 10*threads.getNumThreads();
    int numClusters = settle.getNumClusters();
@@ -107,13 +61,29 @@ CpuSETTLE::~CpuSETTLE() {
 }
 void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
-    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
+    gmx_atomic_t atomicCounter;
-    threads.execute(task);
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }
 void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
-    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
+    gmx_atomic_t atomicCounter;
-    threads.execute(task);
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -494,6 +494,10 @@ public:
    CudaNonbondedUtilities& getNonbondedUtilities() {
        return *nonbonded;
    }
+    /**
+     * Set the particle charges.  These are packed into the fourth element of the posq array.
+     */
+    void setCharges(const std::vector<double>& charges);
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -577,6 +581,12 @@ public:
     * and order to be revalidated.
     */
    void invalidateMolecules();
+    /**
+     * Mark that the current molecule definitions from one particular force (and hence the atom order)
+     * may be invalid.  This should be called whenever force field parameters change.  It will cause the
+     * definitions and order to be revalidated.
+     */
+    bool invalidateMolecules(CudaForceInfo* force);
 private:
    /**
     * Compute a sorted list of device indices in decreasing order of desirability
@@ -626,6 +636,7 @@ private:
    CUfunction clearFourBuffersKernel;
    CUfunction clearFiveBuffersKernel;
    CUfunction clearSixBuffersKernel;
+    CUfunction setChargesKernel;
    std::vector<CudaForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -638,6 +649,7 @@ private:
    CudaArray* energyBuffer;
    CudaArray* energyParamDerivBuffer;
    CudaArray* atomIndexDevice;
+    CudaArray* chargeBuffer;
    std::vector<std::string> energyParamDerivNames;
    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<int> atomIndex;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -198,7 +198,6 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    class GetPositionsTask;
    CudaContext& cu;
 };
@@ -292,9 +291,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -332,9 +333,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -375,9 +378,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -415,9 +420,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -458,9 +465,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -498,9 +507,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params1;
    CudaArray* params2;
@@ -539,9 +550,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    std::vector<int2> mapPositionsVec;
    CudaArray* coefficients;
@@ -582,9 +595,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -657,12 +672,14 @@ private:
        const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class ForceInfo;
    class PmeIO;
    class PmePreComputation;
    class PmePostComputation;
    class SyncStreamPreComputation;
    class SyncStreamPostComputation;
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
    CudaArray* exceptionParams;
@@ -746,8 +763,10 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    class ForceInfo;
    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* interactionGroupData;
@@ -797,10 +816,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
+    class ForceInfo;
    double prefactor, surfaceAreaFactor, cutoff;
    bool hasCreatedKernels;
    int maxTiles;
    CudaContext& cu;
+    ForceInfo* info;
    CudaArray* params;
    CudaArray* bornSum;
    CudaArray* bornRadii;
@@ -847,10 +868,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
+    class ForceInfo;
    double cutoff;
    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
    int maxTiles, numComputedValues;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaParameterSet* computedValues;
    CudaParameterSet* energyDerivs;
@@ -904,9 +927,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
 private:
+    class ForceInfo;
    int numParticles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -948,9 +973,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
 private:
+    class ForceInfo;
    int numDonors, numAcceptors;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* donorParams;
    CudaParameterSet* acceptorParams;
    CudaArray* globals;
@@ -1000,9 +1027,11 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCentroidBondForce& force);
 private:
+    class ForceInfo;
    int numGroups, numBonds;
    bool needEnergyParamDerivs;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* groupParticles;
@@ -1053,8 +1082,10 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    std::vector<std::string> globalParamNames;
@@ -1099,7 +1130,9 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomManyParticleForce& force);
 private:
+    class ForceInfo;
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedKernel;
    NonbondedMethod nonbondedMethod;
    int maxNeighborPairs, forceWorkgroupSize, findNeighborsWorkgroupSize;
@@ -1161,9 +1194,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
 private:
+    class ForceInfo;
    class ReorderListener;
    void sortAtoms();
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedKernels;
    int numRealParticles, numExceptions, maxNeighborBlocks;
    GayBerneForce::NonbondedMethod nonbondedMethod;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -108,7 +108,8 @@ static int executeInWindows(const string &command) {
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
-        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
+        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    // Determine what compiler to use.
    this->compiler = "\""+compiler+"\"";
@@ -291,6 +292,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    setChargesKernel = getKernel(utilities, "setCharges");
    // Set defines based on the requested precision.
@@ -407,6 +409,8 @@ CudaContext::~CudaContext() {
        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
+    if (chargeBuffer != NULL)
+        delete chargeBuffer;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
@@ -860,6 +864,25 @@ void CudaContext::clearAutoclearBuffers() {
    }
 }
+/**
+ * Set the particle charges, which are stored in the w component of each posq entry.
+ *
+ * The values are converted to the context's precision, staged in the pinned host
+ * buffer, uploaded to chargeBuffer, and then written into posq by the setCharges
+ * kernel, which looks each charge up through atomIndexDevice.
+ *
+ * @param charges    the charge of each particle
+ */
+void CudaContext::setCharges(const vector<double>& charges) {
+    // The device-side staging array is created lazily the first time charges are set.
+    if (chargeBuffer == NULL)
+        chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
+    // NOTE(review): chargeBuffer is sized for numAtoms elements, so charges is presumably
+    // expected to hold exactly numAtoms values -- confirm against callers.
+    const int numCharges = (int) charges.size();
+    if (getUseDoublePrecision()) {
+        double* c = (double*) getPinnedBuffer();
+        for (int i = 0; i < numCharges; i++)
+            c[i] = charges[i];
+        chargeBuffer->upload(c);
+    }
+    else {
+        // Narrow to single precision on the host before uploading.
+        float* c = (float*) getPinnedBuffer();
+        for (int i = 0; i < numCharges; i++)
+            c[i] = (float) charges[i];
+        chargeBuffer->upload(c);
+    }
+    void* args[] = {&chargeBuffer->getDevicePointer(), &posq->getDevicePointer(), &atomIndexDevice->getDevicePointer(), &numAtoms};
+    executeKernel(setChargesKernel, args, numAtoms);
+}
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
@@ -1058,9 +1081,19 @@ void CudaContext::findMoleculeGroups() {
 }
 void CudaContext::invalidateMolecules() {
+    // Revalidate the molecule definitions against each registered force in turn.
+    // Stop at the first force for which the per-force check reports that the
+    // atoms were reordered (it returns true only after calling reorderAtoms()).
+    for (int i = 0; i < (int) forces.size(); i++) {
+        if (invalidateMolecules(forces[i]))
+            return;
+    }
+}
+bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
-        return;
+        return false;
    bool valid = true;
+    int forceIndex = -1;
+    for (int i = 0; i < forces.size(); i++)
+        if (forces[i] == force)
+            forceIndex = i;
    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
        MoleculeGroup& mol = moleculeGroups[group];
        vector<int>& instances = mol.instances;
@@ -1075,22 +1108,21 @@ void CudaContext::invalidateMolecules() {
            Molecule& m2 = molecules[instances[j]];
            int offset2 = offsets[j];
            for (int i = 0; i < (int) atoms.size() && valid; i++) {
-                for (int k = 0; k < (int) forces.size(); k++)
+                if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
-                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+                    valid = false;
-                        valid = false;
            }
            // See if the force groups are identical.
-            for (int i = 0; i < (int) forces.size() && valid; i++) {
+            if (valid && forceIndex > -1) {
-                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
+                for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
-                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
+                    if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
                        valid = false;
            }
        }
    }
    if (valid)
-        return;
+        return false;
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
@@ -1158,6 +1190,7 @@ void CudaContext::invalidateMolecules() {
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
    reorderAtoms();
+    return true;
 }
 void CudaContext::reorderAtoms() {

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/kernels/utilities.cu
+++ b/platforms/cuda/src/kernels/utilities.cu
@@ -73,4 +73,11 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
    clearSingleBuffer(buffer6, size6);
 }
+/**
+ * Write the per-atom charges into the w component of the posq array.  The charge
+ * stored for slot i of posq is charges[atomOrder[i]].  Each thread begins at its
+ * global index and advances by the total number of launched threads.
+ */
+__global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq, int* __restrict__ atomOrder, int numAtoms) {
+    int atom = blockDim.x*blockIdx.x+threadIdx.x;
+    while (atom < numAtoms) {
+        posq[atom].w = charges[atomOrder[atom]];
+        atom += blockDim.x*gridDim.x;
+    }
+}
 }
\ No newline at end of file
--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -609,6 +609,10 @@ public:
    OpenCLNonbondedUtilities& getNonbondedUtilities() {
        return *nonbonded;
    }
+    /**
+     * Set the particle charges.  These are packed into the fourth element of the posq array.
+     *
+     * @param charges    the charge of each particle.  Values are converted to float unless the
+     *                   context uses double precision, and are looked up through the context's
+     *                   atom index array when they are written into posq.
+     */
+    void setCharges(const std::vector<double>& charges);
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -692,6 +696,12 @@ public:
     * and order to be revalidated.
     */
    void invalidateMolecules();
+    /**
+     * Mark that the current molecule definitions from one particular force (and hence the atom order)
+     * may be invalid.  This should be called whenever force field parameters change.  It will cause the
+     * definitions and order to be revalidated.
+     *
+     * @param force    the force whose particle and group definitions should be rechecked
+     * @return true if the definitions were found to be invalid and the atoms were reordered,
+     *         false if nothing needed to change (including when there are no atoms or no cutoff is in use)
+     */
+    bool invalidateMolecules(OpenCLForceInfo* force);
 private:
    struct Molecule;
    struct MoleculeGroup;
@@ -739,6 +749,7 @@ private:
    cl::Kernel clearSixBuffersKernel;
    cl::Kernel reduceReal4Kernel;
    cl::Kernel reduceForcesKernel;
+    cl::Kernel setChargesKernel;
    std::vector<OpenCLForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -754,6 +765,7 @@ private:
    OpenCLArray* energyBuffer;
    OpenCLArray* energyParamDerivBuffer;
    OpenCLArray* atomIndexDevice;
+    OpenCLArray* chargeBuffer;
    std::vector<std::string> energyParamDerivNames;
    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<int> atomIndex;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -176,7 +176,6 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    class GetPositionsTask;
    OpenCLContext& cl;
 };
@@ -270,9 +269,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -310,9 +311,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -353,9 +356,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -393,9 +398,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -436,9 +443,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -476,9 +485,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLArray* params;
 };
@@ -516,9 +527,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    std::vector<mm_int2> mapPositionsVec;
    OpenCLArray* coefficients;
@@ -559,9 +572,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -635,12 +650,14 @@ private:
        const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class ForceInfo;
    class PmeIO;
    class PmePreComputation;
    class PmePostComputation;
    class SyncQueuePreComputation;
    class SyncQueuePostComputation;
    OpenCLContext& cl;
+    ForceInfo* info;
    bool hasInitializedKernel;
    OpenCLArray* sigmaEpsilon;
    OpenCLArray* exceptionParams;
@@ -726,8 +743,10 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    class ForceInfo;
    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
    OpenCLArray* interactionGroupData;
@@ -778,10 +797,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
+    class ForceInfo;
    double prefactor, surfaceAreaFactor, cutoff;
    bool hasCreatedKernels;
    int maxTiles;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLArray* params;
    OpenCLArray* bornSum;
    OpenCLArray* longBornSum;
@@ -829,10 +850,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
+    class ForceInfo;
    double cutoff;
    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
    int maxTiles, numComputedValues;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLParameterSet* computedValues;
    OpenCLParameterSet* energyDerivs;
@@ -886,9 +909,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
 private:
+    class ForceInfo;
    int numParticles;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    const System& system;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
@@ -930,9 +955,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
 private:
+    class ForceInfo;
    int numDonors, numAcceptors;
    bool hasInitializedKernel;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* donorParams;
    OpenCLParameterSet* acceptorParams;
    OpenCLArray* globals;
@@ -983,9 +1010,11 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCentroidBondForce& force);
 private:
+    class ForceInfo;
    int numGroups, numBonds;
    bool needEnergyParamDerivs;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
    OpenCLArray* groupParticles;
@@ -1035,8 +1064,10 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    OpenCLContext& cl;
+    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
    std::vector<std::string> globalParamNames;
@@ -1081,7 +1112,9 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomManyParticleForce& force);
 private:
+    class ForceInfo;
    OpenCLContext& cl;
+    ForceInfo* info;
    bool hasInitializedKernel;
    NonbondedMethod nonbondedMethod;
    int maxNeighborPairs, forceWorkgroupSize, findNeighborsWorkgroupSize;
@@ -1141,9 +1174,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
 private:
+    class ForceInfo;
    class ReorderListener;
    void sortAtoms();
    OpenCLContext& cl;
+    ForceInfo* info;
    bool hasInitializedKernels;
    int numRealParticles, maxNeighborBlocks;
    GayBerneForce::NonbondedMethod nonbondedMethod;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -69,8 +69,8 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
 OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL),
-        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
+        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL),
-        expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        chargeBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    if (precision == "single") {
        useDoublePrecision = false;
        useMixedPrecision = false;
@@ -309,6 +309,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
    if (supports64BitGlobalAtomics)
        reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
+    setChargesKernel = cl::Kernel(utilities, "setCharges");
    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
@@ -439,6 +440,8 @@ OpenCLContext::~OpenCLContext() {
        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
+    if (chargeBuffer != NULL)
+        delete chargeBuffer;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
@@ -747,6 +750,28 @@ void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
    executeKernel(reduceReal4Kernel, bufferSize, 128);
 }
+/**
+ * Set the particle charges, which are stored in the w component of each posq entry.
+ *
+ * The values are converted to the context's precision, staged in the pinned host
+ * buffer, uploaded to chargeBuffer, and then written into posq by the setCharges
+ * kernel, which looks each charge up through atomIndexDevice.
+ *
+ * @param charges    the charge of each particle
+ */
+void OpenCLContext::setCharges(const vector<double>& charges) {
+    // The device-side staging array is created lazily the first time charges are set.
+    if (chargeBuffer == NULL)
+        chargeBuffer = new OpenCLArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
+    // NOTE(review): chargeBuffer is sized for numAtoms elements, so charges is presumably
+    // expected to hold exactly numAtoms values -- confirm against callers.
+    const int numCharges = (int) charges.size();
+    if (getUseDoublePrecision()) {
+        double* c = (double*) getPinnedBuffer();
+        for (int i = 0; i < numCharges; i++)
+            c[i] = charges[i];
+        chargeBuffer->upload(c);
+    }
+    else {
+        // Narrow to single precision on the host before uploading.
+        float* c = (float*) getPinnedBuffer();
+        for (int i = 0; i < numCharges; i++)
+            c[i] = (float) charges[i];
+        chargeBuffer->upload(c);
+    }
+    setChargesKernel.setArg<cl::Buffer>(0, chargeBuffer->getDeviceBuffer());
+    setChargesKernel.setArg<cl::Buffer>(1, posq->getDeviceBuffer());
+    setChargesKernel.setArg<cl::Buffer>(2, atomIndexDevice->getDeviceBuffer());
+    setChargesKernel.setArg<cl_int>(3, numAtoms);
+    executeKernel(setChargesKernel, numAtoms);
+}
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
@@ -945,9 +970,19 @@ void OpenCLContext::findMoleculeGroups() {
 }
 void OpenCLContext::invalidateMolecules() {
+    // Revalidate the molecule definitions against each registered force in turn.
+    // Stop at the first force for which the per-force check reports that the
+    // atoms were reordered (it returns true only after calling reorderAtoms()).
+    for (int i = 0; i < (int) forces.size(); i++) {
+        if (invalidateMolecules(forces[i]))
+            return;
+    }
+}
+bool OpenCLContext::invalidateMolecules(OpenCLForceInfo* force) {
    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
-        return;
+        return false;
    bool valid = true;
+    int forceIndex = -1;
+    for (int i = 0; i < forces.size(); i++)
+        if (forces[i] == force)
+            forceIndex = i;
    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
        MoleculeGroup& mol = moleculeGroups[group];
        vector<int>& instances = mol.instances;
@@ -962,22 +997,21 @@ void OpenCLContext::invalidateMolecules() {
            Molecule& m2 = molecules[instances[j]];
            int offset2 = offsets[j];
            for (int i = 0; i < (int) atoms.size() && valid; i++) {
-                for (int k = 0; k < (int) forces.size(); k++)
+                if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
-                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+                    valid = false;
-                        valid = false;
            }
            // See if the force groups are identical.
-            for (int i = 0; i < (int) forces.size() && valid; i++) {
+            if (valid && forceIndex > -1) {
-                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
+                for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
-                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
+                    if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
                        valid = false;
            }
        }
    }
    if (valid)
-        return;
+        return false;
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
@@ -1045,6 +1079,7 @@ void OpenCLContext::invalidateMolecules() {
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
    reorderAtoms();
+    return true;
 }
 void OpenCLContext::reorderAtoms() {

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -107,3 +107,11 @@ __kernel void determineNativeAccuracy(__global float8* restrict values, int numV
        values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);
    }
 }
+/**
+ * Write the per-atom charges into the w component of the posq array.  The charge
+ * stored for slot i of posq is charges[atomOrder[i]].  Each work-item begins at its
+ * global id and advances by the global size.
+ */
+__kernel void setCharges(__global real* restrict charges, __global real4* restrict posq, __global int* restrict atomOrder, int numAtoms) {
+    int atom = get_global_id(0);
+    while (atom < numAtoms) {
+        posq[atom].w = charges[atomOrder[atom]];
+        atom += get_global_size(0);
+    }
+}
\ No newline at end of file