Merge remote-tracking branch 'upstream/master'

047934e2 · Rafal P. Wiewiora · ce3a5dc0 · d12c9bd1 · 047934e2 · 047934e2
Commit 047934e2 authored Mar 01, 2017 by Rafal P. Wiewiora
20 changed files
--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp

-/* Portions copyright (c) 2009-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -37,16 +37,6 @@
 using namespace OpenMM;
 using namespace std;

-class CpuCustomManyParticleForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomManyParticleForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomManyParticleForce& owner;
-};
-
 CpuCustomManyParticleForce::CpuCustomManyParticleForce(const CustomManyParticleForce& force, ThreadPool& threads) :
            threads(threads), useCutoff(false), usePeriodic(false), neighborList(NULL) {
    numParticles = force.getNumParticles();
@@ -98,7 +88,7 @@ CpuCustomManyParticleForce::~CpuCustomManyParticleForce() {
        delete threadData[i];
 }

-void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, RealOpenMM** particleParameters,
+void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, double** particleParameters,
                                                  const map<string, double>& globalParameters, vector<AlignedArray<float> >& threadForce,
                                                  bool includeForces, bool includeEnergy, double& energy) {
    // Record the parameters for the threads.
@@ -141,8 +131,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, RealOpe
    
    // Signal the threads to start running and wait for them to finish.
    
-    ComputeForceTask task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
    threads.waitForThreads();
    
    // Combine the energies from all the threads.
@@ -191,14 +180,14 @@ void CpuCustomManyParticleForce::threadComputeForce(ThreadPool& threads, int thr
    }
 }

-void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) {
+void CpuCustomManyParticleForce::setUseCutoff(double distance) {
    useCutoff = true;
    cutoffDistance = distance;
    if (neighborList == NULL)
        neighborList = new CpuNeighborList(4);
 }

-void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuCustomManyParticleForce::setPeriodic(Vec3* periodicBoxVectors) {
    assert(useCutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance);
@@ -220,7 +209,7 @@ void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) {
 }

 void CpuCustomManyParticleForce::loopOverInteractions(vector<int>& availableParticles, vector<int>& particleSet, int loopIndex, int startIndex,
-                                                          RealOpenMM** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
+                                                          double** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
    int numParticles = availableParticles.size();
    double cutoff2 = cutoffDistance*cutoffDistance;
    int checkRange = (centralParticleMode ? 1 : loopIndex);
@@ -254,7 +243,7 @@ void CpuCustomManyParticleForce::loopOverInteractions(vector<int>& availablePart
    }
 }

-void CpuCustomManyParticleForce::calculateOneIxn(vector<int>& particleSet, RealOpenMM** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
+void CpuCustomManyParticleForce::calculateOneIxn(vector<int>& particleSet, double** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Select the ordering to use for the particles.
    
    vector<int>& permutedParticles = data.permutedParticles;

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp

-/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -33,16 +33,6 @@
 using namespace OpenMM;
 using namespace std;

-class CpuCustomNonbondedForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomNonbondedForce& owner;
-};
-
 CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression,
            const vector<string>& parameterNames, const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions) :
            energyExpression(energyExpression), forceExpression(forceExpression), energyParamDerivExpressions(energyParamDerivExpressions) {
@@ -70,7 +60,7 @@ CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression
 CpuCustomNonbondedForce::CpuCustomNonbondedForce(const Lepton::CompiledExpression& energyExpression,
            const Lepton::CompiledExpression& forceExpression, const vector<string>& parameterNames, const vector<set<int> >& exclusions,
            const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions, ThreadPool& threads) :
-            cutoff(false), useSwitch(false), periodic(false), paramNames(parameterNames), exclusions(exclusions), threads(threads) {
+            cutoff(false), useSwitch(false), periodic(false), useInteractionGroups(false), paramNames(parameterNames), exclusions(exclusions), threads(threads) {
    for (int i = 0; i < threads.getNumThreads(); i++)
        threadData.push_back(new ThreadData(energyExpression, forceExpression, parameterNames, energyParamDerivExpressions));
 }
@@ -80,13 +70,14 @@ CpuCustomNonbondedForce::~CpuCustomNonbondedForce() {
        delete threadData[i];
 }

-void CpuCustomNonbondedForce::setUseCutoff(RealOpenMM distance, const CpuNeighborList& neighbors) {
+void CpuCustomNonbondedForce::setUseCutoff(double distance, const CpuNeighborList& neighbors) {
    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
  }

 void CpuCustomNonbondedForce::setInteractionGroups(const vector<pair<set<int>, set<int> > >& groups) {
+    useInteractionGroups = true;
    for (int group = 0; group < (int) groups.size(); group++) {
        const set<int>& set1 = groups[group].first;
        const set<int>& set2 = groups[group].second;
@@ -102,12 +93,12 @@ void CpuCustomNonbondedForce::setInteractionGroups(const vector<pair<set<int>, s
    }
 }

-void CpuCustomNonbondedForce::setUseSwitchingFunction(RealOpenMM distance) {
+void CpuCustomNonbondedForce::setUseSwitchingFunction(double distance) {
    useSwitch = true;
    switchingDistance = distance;
 }

-void CpuCustomNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuCustomNonbondedForce::setPeriodic(Vec3* periodicBoxVectors) {
    assert(cutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance);
@@ -129,9 +120,9 @@ void CpuCustomNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
 }


-void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, vector<RealVec>& atomCoordinates, RealOpenMM** atomParameters,
-                                             RealOpenMM* fixedParameters, const map<string, double>& globalParameters,
-                                             vector<AlignedArray<float> >& threadForce, bool includeForce, bool includeEnergy, double& totalEnergy, double* energyParamDerivs) {
+void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, vector<Vec3>& atomCoordinates, double** atomParameters,
+                                               double* fixedParameters, const map<string, double>& globalParameters,
+                                               vector<AlignedArray<float> >& threadForce, bool includeForce, bool includeEnergy, double& totalEnergy, double* energyParamDerivs) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
@@ -149,8 +140,7 @@ void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, v
    
    // Signal the threads to start running and wait for them to finish.
    
-    ComputeForceTask task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
    threads.waitForThreads();
    
    // Combine the energies from all the threads.
@@ -183,7 +173,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
        data.energyParamDerivs[i] = 0.0;
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (groupInteractions.size() > 0) {
+    if (useInteractionGroups) {
        // The user has specified interaction groups, so compute only the requested interactions.
        
        while (true) {

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
-
-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -37,16 +36,6 @@ const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
 const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
 const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;

-class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuGBSAOBCForce& owner;
-};
-
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
@@ -89,6 +78,10 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
    particleParams = params;
    bornRadii.resize(params.size()+3);
    obcChain.resize(params.size()+3);
+    for (int i = bornRadii.size()-3; i < bornRadii.size(); i++) {
+        bornRadii[i] = 0;
+        obcChain[i] = 0;
+    }
 }

 void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
@@ -107,9 +100,8 @@ void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<Align
    
    // Signal the threads to start running and wait for them to finish.
    
-    ComputeTask task(*this);
    gmx_atomic_set(&counter, 0);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
    threads.waitForThreads(); // Compute Born radii
    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();

--- a/platforms/cpu/src/CpuGayBerneForce.cpp
+++ b/platforms/cpu/src/CpuGayBerneForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Portions copyright (c) 2016-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -44,17 +44,6 @@
 using namespace OpenMM;
 using namespace std;

-class CpuGayBerneForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGayBerneForce& owner, CpuNeighborList* neighborList) : owner(owner), neighborList(neighborList) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex, neighborList);
-    }
-    CpuGayBerneForce& owner;
-    CpuNeighborList* neighborList;
-};
-
 CpuGayBerneForce::CpuGayBerneForce(const GayBerneForce& force) {
    // Record the force parameters.

@@ -111,7 +100,7 @@ const vector<set<int> >& CpuGayBerneForce::getExclusions() const {
    return particleExclusions;
 }

-RealOpenMM CpuGayBerneForce::calculateForce(const vector<RealVec>& positions, std::vector<RealVec>& forces, std::vector<AlignedArray<float> >& threadForce, RealVec* boxVectors, CpuPlatform::PlatformData& data) {
+double CpuGayBerneForce::calculateForce(const vector<Vec3>& positions, std::vector<Vec3>& forces, std::vector<AlignedArray<float> >& threadForce, Vec3* boxVectors, CpuPlatform::PlatformData& data) {
    if (nonbondedMethod == GayBerneForce::CutoffPeriodic) {
        double minAllowedSize = 1.999999*cutoffDistance;
        if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize)
@@ -137,8 +126,7 @@ RealOpenMM CpuGayBerneForce::calculateForce(const vector<RealVec>& positions, st
    
    // Signal the threads to compute the pairwise interactions.
    
-    ComputeTask task(*this, data.neighborList);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex, data.neighborList); });
    threads.waitForThreads();
    
    // Signal the threads to compute exceptions.
@@ -164,10 +152,10 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
    int numThreads = threads.getNumThreads();
    threadEnergy[threadIndex] = 0;
    float* forces = &(*threadForce)[threadIndex][0];
-    vector<RealVec>& torques = threadTorque[threadIndex];
+    vector<Vec3>& torques = threadTorque[threadIndex];
    torques.resize(numParticles);
    for (int i = 0; i < numParticles; i++)
-        torques[i] = RealVec();
+        torques[i] = Vec3();
    double energy = 0.0;

    // Compute this thread's subset of interactions.
@@ -184,8 +172,8 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
                    continue;
                if (particleExclusions[i].find(j) != particleExclusions[i].end())
                    continue; // This interaction will be handled by an exception.
-                RealOpenMM sigma = particles[i].sigmaOver2+particles[j].sigmaOver2;
-                RealOpenMM epsilon = particles[i].sqrtEpsilon*particles[j].sqrtEpsilon;
+                double sigma = particles[i].sigmaOver2+particles[j].sigmaOver2;
+                double epsilon = particles[i].sqrtEpsilon*particles[j].sqrtEpsilon;
                energy += computeOneInteraction(i, j, sigma, epsilon, positions, forces, torques, boxVectors);
            }
        }
@@ -208,8 +196,8 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
                        int second = blockAtom[k];
                        if (particles[second].sqrtEpsilon == 0.0f)
                            continue;
-                        RealOpenMM sigma = particles[first].sigmaOver2+particles[second].sigmaOver2;
-                        RealOpenMM epsilon = particles[first].sqrtEpsilon*particles[second].sqrtEpsilon;
+                        double sigma = particles[first].sigmaOver2+particles[second].sigmaOver2;
+                        double epsilon = particles[first].sqrtEpsilon*particles[second].sqrtEpsilon;
                        energy += computeOneInteraction(first, second, sigma, epsilon, positions, forces, torques, boxVectors);
                    }
                }
@@ -235,39 +223,39 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
    threadEnergy[threadIndex] = energy;
 }

-void CpuGayBerneForce::computeEllipsoidFrames(const vector<RealVec>& positions) {
+void CpuGayBerneForce::computeEllipsoidFrames(const vector<Vec3>& positions) {
    int numParticles = particles.size();
    for (int particle = 0; particle < numParticles; particle++) {
        ParticleInfo& p = particles[particle];

        // Compute the local coordinate system of the ellipsoid.

-        RealVec xdir, ydir, zdir;
+        Vec3 xdir, ydir, zdir;
        if (p.xparticle == -1) {
-            xdir = RealVec(1, 0, 0);
-            ydir = RealVec(0, 1, 0);
+            xdir = Vec3(1, 0, 0);
+            ydir = Vec3(0, 1, 0);
        }
        else {
            xdir = positions[particle]-positions[p.xparticle];
-            xdir /= SQRT(xdir.dot(xdir));
+            xdir /= sqrt(xdir.dot(xdir));
            if (p.yparticle == -1) {
                if (xdir[1] > -0.5 && xdir[1] < 0.5)
-                    ydir = RealVec(0, 1, 0);
+                    ydir = Vec3(0, 1, 0);
                else
-                    ydir = RealVec(1, 0, 0);
+                    ydir = Vec3(1, 0, 0);
            }
            else
                ydir = positions[particle]-positions[p.yparticle];
            ydir -= xdir*(xdir.dot(ydir));
-            ydir /= SQRT(ydir.dot(ydir));
+            ydir /= sqrt(ydir.dot(ydir));
        }
        zdir = xdir.cross(ydir);

        // Compute matrices we will need later.

-        RealOpenMM (&a)[3][3] = A[particle].v;
-        RealOpenMM (&b)[3][3] = B[particle].v;
-        RealOpenMM (&g)[3][3] = G[particle].v;
+        double (&a)[3][3] = A[particle].v;
+        double (&b)[3][3] = B[particle].v;
+        double (&g)[3][3] = G[particle].v;
        a[0][0] = xdir[0];
        a[0][1] = xdir[1];
        a[0][2] = xdir[2];
@@ -277,8 +265,8 @@ void CpuGayBerneForce::computeEllipsoidFrames(const vector<RealVec>& positions)
        a[2][0] = zdir[0];
        a[2][1] = zdir[1];
        a[2][2] = zdir[2];
-        RealVec r2(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz);
-        RealVec e2(1/sqrt(p.ex), 1/sqrt(p.ey), 1/sqrt(p.ez));
+        Vec3 r2(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz);
+        Vec3 e2(1/sqrt(p.ex), 1/sqrt(p.ey), 1/sqrt(p.ez));
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++) {
                b[i][j] = 0;
@@ -291,33 +279,33 @@ void CpuGayBerneForce::computeEllipsoidFrames(const vector<RealVec>& positions)
    }
 }

-void CpuGayBerneForce::applyTorques(const vector<RealVec>& positions, vector<RealVec>& forces) {
+void CpuGayBerneForce::applyTorques(const vector<Vec3>& positions, vector<Vec3>& forces) {
    int numParticles = particles.size();
    int numThreads = threadTorque.size();
    for (int particle = 0; particle < numParticles; particle++) {
        ParticleInfo& p = particles[particle];
-        RealVec pos = positions[particle];
+        Vec3 pos = positions[particle];
        if (p.xparticle != -1) {
            // Add up the torques from the individual threads.
            
-            RealVec torque;
+            Vec3 torque;
            for (int i = 0; i < numThreads; i++)
                torque += threadTorque[i][particle];
            
            // Apply a force to the x particle.
            
-            RealVec dx = positions[p.xparticle]-pos;
+            Vec3 dx = positions[p.xparticle]-pos;
            double dx2 = dx.dot(dx);
-            RealVec f = torque.cross(dx)/dx2;
+            Vec3 f = torque.cross(dx)/dx2;
            forces[p.xparticle] += f;
            forces[particle] -= f;
            if (p.yparticle != -1) {
                // Apply a force to the y particle.  This is based on the component of the torque
                // that was not already applied to the x particle.
                
-                RealVec dy = positions[p.yparticle]-pos;
+                Vec3 dy = positions[p.yparticle]-pos;
                double dy2 = dy.dot(dy);
-                RealVec torque2 = dx*(torque.dot(dx)/dx2);
+                Vec3 torque2 = dx*(torque.dot(dx)/dx2);
                f = torque2.cross(dy)/dy2;
                forces[p.yparticle] += f;
                forces[particle] -= f;
@@ -326,27 +314,27 @@ void CpuGayBerneForce::applyTorques(const vector<RealVec>& positions, vector<Rea
    }
 }

-RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2, RealOpenMM sigma, RealOpenMM epsilon, const RealVec* positions,
-        float* forces, vector<RealVec>& torques, const RealVec* boxVectors) {
+double CpuGayBerneForce::computeOneInteraction(int particle1, int particle2, double sigma, double epsilon, const Vec3* positions,
+        float* forces, vector<Vec3>& torques, const Vec3* boxVectors) {
    // Compute the displacement and check against the cutoff.

-    RealOpenMM deltaR[ReferenceForce::LastDeltaRIndex];
+    double deltaR[ReferenceForce::LastDeltaRIndex];
    if (nonbondedMethod == GayBerneForce::CutoffPeriodic)
        ReferenceForce::getDeltaRPeriodic(positions[particle2], positions[particle1], boxVectors, deltaR);
    else
        ReferenceForce::getDeltaR(positions[particle2], positions[particle1], deltaR);
-    RealOpenMM r = deltaR[ReferenceForce::RIndex];
+    double r = deltaR[ReferenceForce::RIndex];
    if (nonbondedMethod != GayBerneForce::NoCutoff && r >= cutoffDistance)
        return 0;
-    RealOpenMM rInv = 1/r;
-    RealVec dr(deltaR[ReferenceForce::XIndex], deltaR[ReferenceForce::YIndex], deltaR[ReferenceForce::ZIndex]);
-    RealVec drUnit = dr*rInv;
+    double rInv = 1/r;
+    Vec3 dr(deltaR[ReferenceForce::XIndex], deltaR[ReferenceForce::YIndex], deltaR[ReferenceForce::ZIndex]);
+    Vec3 drUnit = dr*rInv;
    
    // Compute the switching function.

-    RealOpenMM switchValue = 1, switchDeriv = 0;
+    double switchValue = 1, switchDeriv = 0;
    if (useSwitchingFunction && r > switchingDistance) {
-        RealOpenMM t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
+        double t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
        switchValue = 1+t*t*t*(-10+t*(15-t*6));
        switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
    }
@@ -354,11 +342,11 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
    // Interactions between two point particles can be computed more easily.
    
    if (particles[particle1].isPointParticle && particles[particle2].isPointParticle) {
-        RealOpenMM sig = sigma*rInv;
-        RealOpenMM sig2 = sig*sig;
-        RealOpenMM sig6 = sig2*sig2*sig2;
-        RealOpenMM energy = 4*epsilon*(sig6-1)*sig6;
-        RealVec force = drUnit*(switchValue*4*epsilon*(12*sig6 - 6)*sig6*rInv - energy*switchDeriv);
+        double sig = sigma*rInv;
+        double sig2 = sig*sig;
+        double sig6 = sig2*sig2*sig2;
+        double energy = 4*epsilon*(sig6-1)*sig6;
+        Vec3 force = drUnit*(switchValue*4*epsilon*(12*sig6 - 6)*sig6*rInv - energy*switchDeriv);
        forces[4*particle1] += force[0];
        forces[4*particle1+1] += force[1];
        forces[4*particle1+2] += force[2];
@@ -374,31 +362,31 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
    Matrix G12 = G[particle1]+G[particle2];
    Matrix B12inv = B12.inverse();
    Matrix G12inv = G12.inverse();
-    RealOpenMM detG12 = G12.determinant();
+    double detG12 = G12.determinant();

    // Estimate the distance between the ellipsoids and compute the first terms needed for the energy.

-    RealOpenMM sigma12 = 1/SQRT(0.5*drUnit.dot(G12inv*drUnit));
-    RealOpenMM h12 = r - sigma12;
-    RealOpenMM rho = sigma/(h12+sigma);
-    RealOpenMM rho2 = rho*rho;
-    RealOpenMM rho6 = rho2*rho2*rho2;
-    RealOpenMM u = 4*epsilon*(rho6*rho6-rho6);
-    RealOpenMM eta = SQRT(2*s[particle1]*s[particle2]/detG12);
-    RealOpenMM chi = 2*drUnit.dot(B12inv*drUnit);
+    double sigma12 = 1/sqrt(0.5*drUnit.dot(G12inv*drUnit));
+    double h12 = r - sigma12;
+    double rho = sigma/(h12+sigma);
+    double rho2 = rho*rho;
+    double rho6 = rho2*rho2*rho2;
+    double u = 4*epsilon*(rho6*rho6-rho6);
+    double eta = sqrt(2*s[particle1]*s[particle2]/detG12);
+    double chi = 2*drUnit.dot(B12inv*drUnit);
    chi *= chi;
-    RealOpenMM energy = u*eta*chi;
+    double energy = u*eta*chi;
    
    // Compute the terms needed for the force.

-    RealVec kappa = G12inv*dr;
-    RealVec iota = B12inv*dr;
-    RealOpenMM rInv2 = rInv*rInv;
-    RealOpenMM dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
-    RealOpenMM temp = 0.5*sigma12*sigma12*sigma12*rInv2;
-    RealVec dudr = (drUnit + (kappa-drUnit*kappa.dot(drUnit))*temp)*dUSLJdr;
-    RealVec dchidr = (iota-drUnit*iota.dot(drUnit))*(-8*rInv2*SQRT(chi));
-    RealVec force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
+    Vec3 kappa = G12inv*dr;
+    Vec3 iota = B12inv*dr;
+    double rInv2 = rInv*rInv;
+    double dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
+    double temp = 0.5*sigma12*sigma12*sigma12*rInv2;
+    Vec3 dudr = (drUnit + (kappa-drUnit*kappa.dot(drUnit))*temp)*dUSLJdr;
+    Vec3 dchidr = (iota-drUnit*iota.dot(drUnit))*(-8*rInv2*sqrt(chi));
+    Vec3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
    forces[4*particle1] += force[0];
    forces[4*particle1+1] += force[1];
    forces[4*particle1+2] += force[2];
@@ -413,13 +401,13 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
        ParticleInfo& p = particles[particle];
        if (p.isPointParticle)
            continue;
-        RealVec dudq = (kappa*G[particle]).cross(kappa*(temp*dUSLJdr));
-        RealVec dchidq = (iota*B[particle]).cross(iota)*(-4*rInv2);
-        RealOpenMM (&g12)[3][3] = G12.v;
-        RealOpenMM (&a)[3][3] = A[particle].v;
-        RealVec scale = RealVec(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz)*(-0.5*eta/detG12);
+        Vec3 dudq = (kappa*G[particle]).cross(kappa*(temp*dUSLJdr));
+        Vec3 dchidq = (iota*B[particle]).cross(iota)*(-4*rInv2);
+        double (&g12)[3][3] = G12.v;
+        double (&a)[3][3] = A[particle].v;
+        Vec3 scale = Vec3(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz)*(-0.5*eta/detG12);
        Matrix D;
-        RealOpenMM (&d)[3][3] = D.v;
+        double (&d)[3][3] = D.v;
        d[0][0] = scale[0]*(2*a[0][0]*(g12[1][1]*g12[2][2] - g12[1][2]*g12[2][1]) +
                              a[0][2]*(g12[1][2]*g12[0][1] + g12[1][0]*g12[2][1] - g12[1][1]*(g12[0][2] + g12[2][0])) +
                              a[0][1]*(g12[0][2]*g12[2][1] + g12[2][0]*g12[1][2] - g12[2][2]*(g12[0][1] + g12[1][0])));
@@ -447,10 +435,10 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
        d[2][2] = scale[2]*(  a[2][0]*(g12[0][1]*g12[1][2] + g12[2][1]*g12[1][0] - g12[1][1]*(g12[0][2] + g12[2][0])) +
                              a[2][1]*(g12[1][0]*g12[0][2] + g12[2][0]*g12[0][1] - g12[0][0]*(g12[1][2] + g12[2][1])) +
                            2*a[2][2]*(g12[1][1]*g12[0][0] - g12[1][0]*g12[0][1]));
-        RealVec detadq;
+        Vec3 detadq;
        for (int i = 0; i < 3; i++)
-            detadq += RealVec(a[i][0], a[i][1], a[i][2]).cross(RealVec(d[i][0], d[i][1], d[i][2]));
-        RealVec torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
+            detadq += Vec3(a[i][0], a[i][1], a[i][2]).cross(Vec3(d[i][0], d[i][1], d[i][2]));
+        Vec3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
        torques[particle] -= torque;
    }
    return switchValue*energy;

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -41,43 +41,44 @@
 #include "ReferenceTabulatedFunction.h"
 #include "openmm/Context.h"
 #include "openmm/OpenMMException.h"
+#include "openmm/Vec3.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/CustomNonbondedForceImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
 #include "openmm/internal/vectorize.h"
-#include "RealVec.h"
 #include "lepton/CompiledExpression.h"
 #include "lepton/CustomFunction.h"
 #include "lepton/Operation.h"
 #include "lepton/Parser.h"
+#include <iostream>
 #include "lepton/ParsedExpression.h"

 using namespace OpenMM;
 using namespace std;

-static vector<RealVec>& extractPositions(ContextImpl& context) {
+static vector<Vec3>& extractPositions(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *((vector<RealVec>*) data->positions);
+    return *((vector<Vec3>*) data->positions);
 }

-static vector<RealVec>& extractVelocities(ContextImpl& context) {
+static vector<Vec3>& extractVelocities(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *((vector<RealVec>*) data->velocities);
+    return *((vector<Vec3>*) data->velocities);
 }

-static vector<RealVec>& extractForces(ContextImpl& context) {
+static vector<Vec3>& extractForces(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *((vector<RealVec>*) data->forces);
+    return *((vector<Vec3>*) data->forces);
 }

-static RealVec& extractBoxSize(ContextImpl& context) {
+static Vec3& extractBoxSize(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *(RealVec*) data->periodicBoxSize;
+    return *(Vec3*) data->periodicBoxSize;
 }

-static RealVec* extractBoxVectors(ContextImpl& context) {
+static Vec3* extractBoxVectors(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return (RealVec*) data->periodicBoxVectors;
+    return (Vec3*) data->periodicBoxVectors;
 }

 static ReferenceConstraints& extractConstraints(ContextImpl& context) {
@@ -106,14 +107,14 @@ static void validateVariables(const Lepton::ExpressionTreeNode& node, const set<
 * for a leapfrog integrator.
 */
 static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>& masses, double timeShift) {
-    vector<RealVec>& posData = extractPositions(context);
-    vector<RealVec>& velData = extractVelocities(context);
-    vector<RealVec>& forceData = extractForces(context);
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& velData = extractVelocities(context);
+    vector<Vec3>& forceData = extractForces(context);
    int numParticles = context.getSystem().getNumParticles();
    
    // Compute the shifted velocities.
    
-    vector<RealVec> shiftedVel(numParticles);
+    vector<Vec3> shiftedVel(numParticles);
    for (int i = 0; i < numParticles; ++i) {
        if (masses[i] > 0)
            shiftedVel[i] = velData[i]+forceData[i]*(timeShift/masses[i]);
@@ -137,40 +138,32 @@ static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>&
    return 0.5*energy;
 }

-class CpuCalcForcesAndEnergyKernel::SumForceTask : public ThreadPool::Task {
-public:
-    SumForceTask(int numParticles, vector<RealVec>& forceData, CpuPlatform::PlatformData& data) : numParticles(numParticles), forceData(forceData), data(data) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        // Sum the contributions to forces that have been calculated by different threads.
-        
-        int numThreads = threads.getNumThreads();
-        int start = threadIndex*numParticles/numThreads;
-        int end = (threadIndex+1)*numParticles/numThreads;
-        for (int i = start; i < end; i++) {
-            fvec4 f(0.0f);
-            for (int j = 0; j < numThreads; j++)
-                f += fvec4(&data.threadForce[j][4*i]);
-            forceData[i][0] += f[0];
-            forceData[i][1] += f[1];
-            forceData[i][2] += f[2];
-        }
-    }
-    int numParticles;
-    vector<RealVec>& forceData;
-    CpuPlatform::PlatformData& data;
-};
+CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
+        CalcForcesAndEnergyKernel(name, platform), data(data) {
+    // This kernel wraps a Reference platform kernel of the same name: ask the
+    // Reference factory for the implementation and keep it in referenceKernel.
+
+    referenceKernel = Kernel(ReferenceKernelFactory().createKernelImpl(name, platform, context));
+}

-class CpuCalcForcesAndEnergyKernel::InitForceTask : public ThreadPool::Task {
-public:
-    InitForceTask(int numParticles, ContextImpl& context, CpuPlatform::PlatformData& data) : numParticles(numParticles), positionsValid(true), context(context), data(data) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
+void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
+    // Initialize the wrapped Reference kernel first.
+
+    ReferenceCalcForcesAndEnergyKernel& reference = referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>();
+    reference.initialize(system);
+
+    // Fill the remembered positions with a far-away placeholder, one entry per
+    // particle.  NOTE(review): presumably this makes every particle count as
+    // "moved" the first time beginComputation() compares against lastPositions.
+
+    const Vec3 farAway(1e10, 1e10, 1e10);
+    lastPositions.resize(system.getNumParticles(), farAway);
+}
+
+void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
+    
+    // Convert positions to single precision and clear the forces.
+
+    int numParticles = context.getSystem().getNumParticles();
+    bool positionsValid = true;
+    data.threads.execute([&] (ThreadPool& threads, int threadIndex) {
        // Convert the positions to single precision and apply periodic boundary conditions

        AlignedArray<float>& posq = data.posq;
-        vector<RealVec>& posData = extractPositions(context);
-        RealVec* boxVectors = extractBoxVectors(context);
+        vector<Vec3>& posData = extractPositions(context);
+        Vec3* boxVectors = extractBoxVectors(context);
        double boxSize[3] = {boxVectors[0][0], boxVectors[1][1], boxVectors[2][2]};
        double invBoxSize[3] = {1/boxVectors[0][0], 1/boxVectors[1][1], 1/boxVectors[2][2]};
        bool triclinic = (boxVectors[0][1] != 0 || boxVectors[0][2] != 0 || boxVectors[1][0] != 0 || boxVectors[1][2] != 0 || boxVectors[2][0] != 0 || boxVectors[2][1] != 0);
@@ -181,7 +174,7 @@ public:
        if (data.isPeriodic) {
            if (triclinic) {
                for (int i = start; i < end; i++) {
-                    RealVec pos = posData[i];
+                    Vec3 pos = posData[i];
                    pos -= boxVectors[2]*floor(pos[2]*invBoxSize[2]);
                    pos -= boxVectors[1]*floor(pos[1]*invBoxSize[1]);
                    pos -= boxVectors[0]*floor(pos[0]*invBoxSize[0]);
@@ -193,7 +186,7 @@ public:
            else {
                for (int i = start; i < end; i++) {
                    for (int j = 0; j < 3; j++) {
-                        RealOpenMM x = posData[i][j];
+                        double x = posData[i][j];
                        double base = floor(x*invBoxSize[j])*boxSize[j];
                        posq[4*i+j] = (float) (x-base);
                    }
@@ -218,36 +211,9 @@ public:
        fvec4 zero(0.0f);
        for (int j = 0; j < numParticles; j++)
            zero.store(&data.threadForce[threadIndex][j*4]);
-    }
-    int numParticles;
-    bool positionsValid;
-    ContextImpl& context;
-    CpuPlatform::PlatformData& data;
-};
-
-CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
-        CalcForcesAndEnergyKernel(name, platform), data(data) {
-    // Create a Reference platform version of this kernel.
-    
-    ReferenceKernelFactory referenceFactory;
-    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
-}
-
-void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
-    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
-}
-
-void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
-    
-    // Convert positions to single precision and clear the forces.
-
-    int numParticles = context.getSystem().getNumParticles();
-    InitForceTask task(numParticles, context, data);
-    data.threads.execute(task);
+    });
    data.threads.waitForThreads();
-    if (!task.positionsValid)
+    if (!positionsValid)
        throw OpenMMException("Particle coordinate is nan");

    // Determine whether we need to recompute the neighbor list.
@@ -259,9 +225,9 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
        double farCutoff2 = 0.5*padding*padding;
        int maxNumMoved = numParticles/10;
        vector<int> moved;
-        vector<RealVec>& posData = extractPositions(context);
+        vector<Vec3>& posData = extractPositions(context);
        for (int i = 0; i < numParticles; i++) {
-            RealVec delta = posData[i]-lastPositions[i];
+            Vec3 delta = posData[i]-lastPositions[i];
            double dist2 = delta.dot(delta);
            if (dist2 > closeCutoff2) {
                moved.push_back(i);
@@ -280,11 +246,11 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
            double paddedCutoff2 = data.paddedCutoff*data.paddedCutoff;
            for (int i = 1; i < numMoved && !needRecompute; i++)
                for (int j = 0; j < i; j++) {
-                    RealVec delta = posData[moved[i]]-posData[moved[j]];
+                    Vec3 delta = posData[moved[i]]-posData[moved[j]];
                    if (delta.dot(delta) < cutoff2) {
                        // These particles should interact.  See if they are in the neighbor list.
                        
-                        RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
+                        Vec3 oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
                        if (oldDelta.dot(oldDelta) > paddedCutoff2) {
                            needRecompute = true;
                            break;
@@ -302,8 +268,23 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
 double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    // Sum the forces from all the threads.
    
-    SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
-    data.threads.execute(task);
+    data.threads.execute([&] (ThreadPool& pool, int workerIndex) {
+        // Every worker reduces its own contiguous slice of particles, adding the
+        // per-thread single precision forces into the context's force array.
+
+        vector<Vec3>& totalForce = extractForces(context);
+        const int particleCount = context.getSystem().getNumParticles();
+        const int workerCount = pool.getNumThreads();
+        const int first = workerIndex*particleCount/workerCount;
+        const int last = (workerIndex+1)*particleCount/workerCount;
+        for (int atom = first; atom < last; atom++) {
+            // threadForce stores four floats per particle; only x, y, z are accumulated.
+            fvec4 sum(0.0f);
+            for (int worker = 0; worker < workerCount; worker++)
+                sum += fvec4(&data.threadForce[worker][4*atom]);
+            for (int axis = 0; axis < 3; axis++)
+                totalForce[atom][axis] += sum[axis];
+        }
+    });
    data.threads.waitForThreads();
    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups, valid);
 }
@@ -324,9 +305,9 @@ void CpuCalcHarmonicAngleForceKernel::initialize(const System& system, const Har
    angleIndexArray = new int*[numAngles];
    for (int i = 0; i < numAngles; i++)
        angleIndexArray[i] = new int[3];
-    angleParamArray = new RealOpenMM*[numAngles];
+    angleParamArray = new double*[numAngles];
    for (int i = 0; i < numAngles; i++)
-        angleParamArray[i] = new RealOpenMM[2];
+        angleParamArray[i] = new double[2];
    for (int i = 0; i < numAngles; ++i) {
        int particle1, particle2, particle3;
        double angle, k;
@@ -334,17 +315,17 @@ void CpuCalcHarmonicAngleForceKernel::initialize(const System& system, const Har
        angleIndexArray[i][0] = particle1;
        angleIndexArray[i][1] = particle2;
        angleIndexArray[i][2] = particle3;
-        angleParamArray[i][0] = (RealOpenMM) angle;
-        angleParamArray[i][1] = (RealOpenMM) k;
+        angleParamArray[i][0] = angle;
+        angleParamArray[i][1] = k;
    }
    bondForce.initialize(system.getNumParticles(), numAngles, 3, angleIndexArray, data.threads);
    usePeriodic = force.usesPeriodicBoundaryConditions();
 }

 double CpuCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    vector<RealVec>& posData = extractPositions(context);
-    vector<RealVec>& forceData = extractForces(context);
-    RealOpenMM energy = 0;
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& forceData = extractForces(context);
+    double energy = 0;
    ReferenceAngleBondIxn angleBond;
    if (usePeriodic)
        angleBond.setPeriodic(extractBoxVectors(context));
@@ -364,8 +345,8 @@ void CpuCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& conte
        force.getAngleParameters(i, particle1, particle2, particle3, angle, k);
        if (particle1 != angleIndexArray[i][0] || particle2 != angleIndexArray[i][1] || particle3 != angleIndexArray[i][2])
            throw OpenMMException("updateParametersInContext: The set of particles in an angle has changed");
-        angleParamArray[i][0] = (RealOpenMM) angle;
-        angleParamArray[i][1] = (RealOpenMM) k;
+        angleParamArray[i][0] = angle;
+        angleParamArray[i][1] = k;
    }
 }

@@ -385,9 +366,9 @@ void CpuCalcPeriodicTorsionForceKernel::initialize(const System& system, const P
    torsionIndexArray = new int*[numTorsions];
    for (int i = 0; i < numTorsions; i++)
        torsionIndexArray[i] = new int[4];
-    torsionParamArray = new RealOpenMM*[numTorsions];
+    torsionParamArray = new double*[numTorsions];
    for (int i = 0; i < numTorsions; i++)
-        torsionParamArray[i] = new RealOpenMM[3];
+        torsionParamArray[i] = new double[3];
    for (int i = 0; i < numTorsions; ++i) {
        int particle1, particle2, particle3, particle4, periodicity;
        double phase, k;
@@ -396,18 +377,18 @@ void CpuCalcPeriodicTorsionForceKernel::initialize(const System& system, const P
        torsionIndexArray[i][1] = particle2;
        torsionIndexArray[i][2] = particle3;
        torsionIndexArray[i][3] = particle4;
-        torsionParamArray[i][0] = (RealOpenMM) k;
-        torsionParamArray[i][1] = (RealOpenMM) phase;
-        torsionParamArray[i][2] = (RealOpenMM) periodicity;
+        torsionParamArray[i][0] = k;
+        torsionParamArray[i][1] = phase;
+        torsionParamArray[i][2] = periodicity;
    }
    bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
    usePeriodic = force.usesPeriodicBoundaryConditions();
 }

 double CpuCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    vector<RealVec>& posData = extractPositions(context);
-    vector<RealVec>& forceData = extractForces(context);
-    RealOpenMM energy = 0;
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& forceData = extractForces(context);
+    double energy = 0;
    ReferenceProperDihedralBond periodicTorsionBond;
    if (usePeriodic)
        periodicTorsionBond.setPeriodic(extractBoxVectors(context));
@@ -427,9 +408,9 @@ void CpuCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& con
        force.getTorsionParameters(i, particle1, particle2, particle3, particle4, periodicity, phase, k);
        if (particle1 != torsionIndexArray[i][0] || particle2 != torsionIndexArray[i][1] || particle3 != torsionIndexArray[i][2] || particle4 != torsionIndexArray[i][3])
            throw OpenMMException("updateParametersInContext: The set of particles in a torsion has changed");
-        torsionParamArray[i][0] = (RealOpenMM) k;
-        torsionParamArray[i][1] = (RealOpenMM) phase;
-        torsionParamArray[i][2] = (RealOpenMM) periodicity;
+        torsionParamArray[i][0] = k;
+        torsionParamArray[i][1] = phase;
+        torsionParamArray[i][2] = periodicity;
    }
 }

@@ -449,9 +430,9 @@ void CpuCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsi
    torsionIndexArray = new int*[numTorsions];
    for (int i = 0; i < numTorsions; i++)
        torsionIndexArray[i] = new int[4];
-    torsionParamArray = new RealOpenMM*[numTorsions];
+    torsionParamArray = new double*[numTorsions];
    for (int i = 0; i < numTorsions; i++)
-        torsionParamArray[i] = new RealOpenMM[6];
+        torsionParamArray[i] = new double[6];
    for (int i = 0; i < numTorsions; ++i) {
        int particle1, particle2, particle3, particle4;
        double c0, c1, c2, c3, c4, c5;
@@ -460,21 +441,21 @@ void CpuCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsi
        torsionIndexArray[i][1] = particle2;
        torsionIndexArray[i][2] = particle3;
        torsionIndexArray[i][3] = particle4;
-        torsionParamArray[i][0] = (RealOpenMM) c0;
-        torsionParamArray[i][1] = (RealOpenMM) c1;
-        torsionParamArray[i][2] = (RealOpenMM) c2;
-        torsionParamArray[i][3] = (RealOpenMM) c3;
-        torsionParamArray[i][4] = (RealOpenMM) c4;
-        torsionParamArray[i][5] = (RealOpenMM) c5;
+        torsionParamArray[i][0] = c0;
+        torsionParamArray[i][1] = c1;
+        torsionParamArray[i][2] = c2;
+        torsionParamArray[i][3] = c3;
+        torsionParamArray[i][4] = c4;
+        torsionParamArray[i][5] = c5;
    }
    bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
    usePeriodic = force.usesPeriodicBoundaryConditions();
 }

 double CpuCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    vector<RealVec>& posData = extractPositions(context);
-    vector<RealVec>& forceData = extractForces(context);
-    RealOpenMM energy = 0;
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& forceData = extractForces(context);
+    double energy = 0;
    ReferenceRbDihedralBond rbTorsionBond;
    if (usePeriodic)
        rbTorsionBond.setPeriodic(extractBoxVectors(context));
@@ -494,12 +475,12 @@ void CpuCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context,
        force.getTorsionParameters(i, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
        if (particle1 != torsionIndexArray[i][0] || particle2 != torsionIndexArray[i][1] || particle3 != torsionIndexArray[i][2] || particle4 != torsionIndexArray[i][3])
            throw OpenMMException("updateParametersInContext: The set of particles in a torsion has changed");
-        torsionParamArray[i][0] = (RealOpenMM) c0;
-        torsionParamArray[i][1] = (RealOpenMM) c1;
-        torsionParamArray[i][2] = (RealOpenMM) c2;
-        torsionParamArray[i][3] = (RealOpenMM) c3;
-        torsionParamArray[i][4] = (RealOpenMM) c4;
-        torsionParamArray[i][5] = (RealOpenMM) c5;
+        torsionParamArray[i][0] = c0;
+        torsionParamArray[i][1] = c1;
+        torsionParamArray[i][2] = c2;
+        torsionParamArray[i][3] = c3;
+        torsionParamArray[i][4] = c4;
+        torsionParamArray[i][5] = c5;
    }
 }

@@ -528,7 +509,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec4();
 CpuNonbondedForce* createCpuNonbondedForceVec8();

 CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), nonbonded(NULL) {
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
    if (isVec8Supported())
        nonbonded = createCpuNonbondedForceVec8();
    else
@@ -575,12 +556,14 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    for (int i = 0; i < num14; i++)
        bonded14ParamArray[i] = new double[3];
    particleParams.resize(numParticles);
+    C6params.resize(numParticles);
    double sumSquaredCharges = 0.0;
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
+        C6params[i] = 8.0*pow(particleParams[i].first, 3.0) * particleParams[i].second;
        sumSquaredCharges += charge*charge;
    }
    
@@ -592,9 +575,9 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        force.getExceptionParameters(nb14s[i], particle1, particle2, charge, radius, depth);
        bonded14IndexArray[i][0] = particle1;
        bonded14IndexArray[i][1] = particle2;
-        bonded14ParamArray[i][0] = static_cast<RealOpenMM>(radius);
-        bonded14ParamArray[i][1] = static_cast<RealOpenMM>(4.0*depth);
-        bonded14ParamArray[i][2] = static_cast<RealOpenMM>(charge);
+        bonded14ParamArray[i][0] = radius;
+        bonded14ParamArray[i][1] = 4.0*depth;
+        bonded14ParamArray[i][2] = charge;
    }
    bondForce.initialize(system.getNumParticles(), num14, 2, bonded14IndexArray, data.threads);
    
@@ -616,19 +599,35 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    }
    else if (nonbondedMethod == PME) {
        double alpha;
-        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2]);
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
        ewaldAlpha = alpha;
    }
-    if (nonbondedMethod == Ewald || nonbondedMethod == PME)
+    else if (nonbondedMethod == LJPME) {
+        double alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
+        ewaldAlpha = alpha;
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], true);
+        ewaldDispersionAlpha = alpha;
+        useSwitchingFunction = false;
+    }
+
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
        ewaldSelfEnergy = -ONE_4PI_EPS0*ewaldAlpha*sumSquaredCharges/sqrt(M_PI);
-    else
+        if(nonbondedMethod == LJPME){
+            for (int atom = 0; atom < numParticles; atom++) {
+                // Dispersion self term
+                ewaldSelfEnergy += pow(ewaldDispersionAlpha, 6.0) * C6params[atom]*C6params[atom] / 12.0;
+            }
+        }
+    } else {
        ewaldSelfEnergy = 0.0;
+    }
    rfDielectric = force.getReactionFieldDielectric();
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
-    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
+    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME);
 }

 double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
@@ -646,18 +645,33 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
            }
        }
+        if (nonbondedMethod == LJPME) {
+            // If available, use the optimized PME implementation.
+
+            vector<string> kernelNames;
+            kernelNames.push_back("CalcPmeReciprocalForce");
+            useOptimizedPme = getPlatform().supportsKernels(kernelNames);
+            if (useOptimizedPme) {
+                optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
+                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
+                optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
+                optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
+                                                                                                  dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
+            }
+        }
    }
    AlignedArray<float>& posq = data.posq;
-    vector<RealVec>& posData = extractPositions(context);
-    vector<RealVec>& forceData = extractForces(context);
-    RealVec* boxVectors = extractBoxVectors(context);
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& forceData = extractForces(context);
+    Vec3* boxVectors = extractBoxVectors(context);
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
+    bool ljpme = (nonbondedMethod == LJPME);
    if (nonbondedMethod != NoCutoff)
        nonbonded->setUseCutoff(nonbondedCutoff, *data.neighborList, rfDielectric);
    if (data.isPeriodic) {
-        RealVec* boxVectors = extractBoxVectors(context);
+        Vec3* boxVectors = extractBoxVectors(context);
        double minAllowedSize = 1.999999*nonbondedCutoff;
        if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
@@ -669,9 +683,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        nonbonded->setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
        nonbonded->setUseSwitchingFunction(switchingDistance);
+    if (ljpme){
+        nonbonded->setUsePME(ewaldAlpha, gridSize);
+        nonbonded->setUseLJPME(ewaldDispersionAlpha, dispersionGridSize);
+    }
    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
+        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
@@ -680,13 +698,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
-            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
+            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
    if (includeDirect) {
        ReferenceLJCoulomb14 nonbonded14;
        bondForce.calculateForce(posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
-        if (data.isPeriodic)
+        if (data.isPeriodic && nonbondedMethod != LJPME)
            energy += dispersionCoefficient/(boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2]);
    }
    return energy;
@@ -726,9 +744,9 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
        force.getExceptionParameters(nb14s[i], particle1, particle2, charge, radius, depth);
        bonded14IndexArray[i][0] = particle1;
        bonded14IndexArray[i][1] = particle2;
-        bonded14ParamArray[i][0] = static_cast<RealOpenMM>(radius);
-        bonded14ParamArray[i][1] = static_cast<RealOpenMM>(4.0*depth);
-        bonded14ParamArray[i][2] = static_cast<RealOpenMM>(charge);
+        bonded14ParamArray[i][0] = radius;
+        bonded14ParamArray[i][1] = 4.0*depth;
+        bonded14ParamArray[i][2] = charge;
    }
    
    // Recompute the coefficient for the dispersion correction.
@@ -739,7 +757,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 }

 void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
-    if (nonbondedMethod != PME)
+    if (nonbondedMethod != PME && nonbondedMethod != LJPME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    if (useOptimizedPme)
        optimizedPme.getAs<const CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
@@ -751,6 +769,19 @@ void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
    }
 }

+/**
+ * Get the parameters being used for the dispersion (LJ) term of LJPME.
+ *
+ * @param alpha   on exit, the separation parameter used for the dispersion term
+ * @param nx      on exit, the number of dispersion grid points along the first axis
+ * @param ny      on exit, the number of dispersion grid points along the second axis
+ * @param nz      on exit, the number of dispersion grid points along the third axis
+ * @throws OpenMMException if the nonbonded method is not LJPME
+ */
+void CpuCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != LJPME)
+        throw OpenMMException("getLJPMEParametersInContext: This Context is not using LJPME");
+    if (useOptimizedPme) {
+        // optimizedDispersionPme is created and initialized in execute() as a
+        // CalcDispersionPmeReciprocalForceKernel, so it has to be queried through
+        // that same type rather than through CalcPmeReciprocalForceKernel.
+        optimizedDispersionPme.getAs<const CalcDispersionPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+    }
+    else {
+        // No optimized kernel: report the values computed in initialize().
+        alpha = ewaldDispersionAlpha;
+        nx = dispersionGridSize[0];
+        ny = dispersionGridSize[1];
+        nz = dispersionGridSize[2];
+    }
+}
+
 CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) :
            CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) {
 }
@@ -864,9 +895,9 @@ void CpuCalcCustomNonbondedForceKernel::initialize(const System& system, const C
 }

 double CpuCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    vector<RealVec>& posData = extractPositions(context);
-    vector<RealVec>& forceData = extractForces(context);
-    RealVec* boxVectors = extractBoxVectors(context);
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& forceData = extractForces(context);
+    Vec3* boxVectors = extractBoxVectors(context);
    double energy = 0;
    bool periodic = (nonbondedMethod == CutoffPeriodic);
    if (nonbondedMethod != NoCutoff)
@@ -953,7 +984,7 @@ void CpuCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCFo

 double CpuCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    if (data.isPeriodic) {
-        RealVec& boxSize = extractBoxSize(context);
+        Vec3& boxSize = extractBoxSize(context);
        float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
        obc.setPeriodic(floatBoxSize);
    }
@@ -1024,14 +1055,14 @@ void CpuCalcCustomGBForceKernel::initialize(const System& system, const CustomGB
        vector<double> parameters;
        force.getParticleParameters(i, parameters);
        for (int j = 0; j < numPerParticleParameters; j++)
-            particleParamArray[i][j] = static_cast<RealOpenMM>(parameters[j]);
+            particleParamArray[i][j] = parameters[j];
    }
    for (int i = 0; i < numPerParticleParameters; i++)
        particleParameterNames.push_back(force.getPerParticleParameterName(i));
    for (int i = 0; i < force.getNumGlobalParameters(); i++)
        globalParameterNames.push_back(force.getGlobalParameterName(i));
    nonbondedMethod = CalcCustomGBForceKernel::NonbondedMethod(force.getNonbondedMethod());
-    nonbondedCutoff = (RealOpenMM) force.getCutoffDistance();
+    nonbondedCutoff = force.getCutoffDistance();
    if (nonbondedMethod != NoCutoff)
        data.requestNeighborList(nonbondedCutoff, 0.25*nonbondedCutoff, force.getNumExclusions() > 0, exclusions);

@@ -1133,9 +1164,9 @@ void CpuCalcCustomGBForceKernel::initialize(const System& system, const CustomGB
 }

 double CpuCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    vector<RealVec>& forceData = extractForces(context);
-    RealOpenMM energy = 0;
-    RealVec* boxVectors = extractBoxVectors(context);
+    vector<Vec3>& forceData = extractForces(context);
+    double energy = 0;
+    Vec3* boxVectors = extractBoxVectors(context);
    if (data.isPeriodic)
        ixn->setPeriodic(extractBoxSize(context));
    if (nonbondedMethod != NoCutoff) {
@@ -1165,7 +1196,7 @@ void CpuCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context, c
        vector<double> parameters;
        force.getParticleParameters(i, parameters);
        for (int j = 0; j < numParameters; j++)
-            particleParamArray[i][j] = static_cast<RealOpenMM>(parameters[j]);
+            particleParamArray[i][j] = static_cast<double>(parameters[j]);
    }
 }

@@ -1208,7 +1239,7 @@ double CpuCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
    for (int i = 0; i < (int) globalParameterNames.size(); i++)
        globalParameters[globalParameterNames[i]] = context.getParameter(globalParameterNames[i]);
    if (nonbondedMethod == CutoffPeriodic) {
-        RealVec* boxVectors = extractBoxVectors(context);
+        Vec3* boxVectors = extractBoxVectors(context);
        double minAllowedSize = 2*cutoffDistance;
        if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
@@ -1232,7 +1263,7 @@ void CpuCalcCustomManyParticleForceKernel::copyParametersToContext(ContextImpl&
        int type;
        force.getParticleParameters(i, parameters, type);
        for (int j = 0; j < numParameters; j++)
-            particleParamArray[i][j] = static_cast<RealOpenMM>(parameters[j]);
+            particleParamArray[i][j] = static_cast<double>(parameters[j]);
    }
 }

@@ -1269,7 +1300,7 @@ void CpuIntegrateLangevinStepKernel::initialize(const System& system, const Lang
    int numParticles = system.getNumParticles();
    masses.resize(numParticles);
    for (int i = 0; i < numParticles; ++i)
-        masses[i] = static_cast<RealOpenMM>(system.getParticleMass(i));
+        masses[i] = static_cast<double>(system.getParticleMass(i));
    data.random.initialize(integrator.getRandomNumberSeed(), data.threads.getNumThreads());
 }

@@ -1277,16 +1308,15 @@ void CpuIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langevi
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
    double stepSize = integrator.getStepSize();
-    vector<RealVec>& posData = extractPositions(context);
-    vector<RealVec>& velData = extractVelocities(context);
-    vector<RealVec>& forceData = extractForces(context);
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& velData = extractVelocities(context);
+    vector<Vec3>& forceData = extractForces(context);
    if (dynamics == 0 || temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
        // Recreate the computation objects with the new parameters.
        
        if (dynamics)
            delete dynamics;
-        RealOpenMM tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-        dynamics = new CpuLangevinDynamics(context.getSystem().getNumParticles(), stepSize, tau, temperature, data.threads, data.random);
+        dynamics = new CpuLangevinDynamics(context.getSystem().getNumParticles(), stepSize, friction, temperature, data.threads, data.random);
        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
        prevTemp = temperature;
        prevFriction = friction;

--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp

-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -29,45 +29,15 @@
 using namespace OpenMM;
 using namespace std;

-class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
-public:
-    Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate1(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-
-class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
-public:
-    Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate2(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-
-class CpuLangevinDynamics::Update3Task : public ThreadPool::Task {
-public:
-    Update3Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate3(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-
-CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
-           ReferenceStochasticDynamics(numberOfAtoms, deltaT, tau, temperature), threads(threads), random(random) {
+CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, ThreadPool& threads, CpuRandom& random) : 
+           ReferenceStochasticDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
 }

 CpuLangevinDynamics::~CpuLangevinDynamics() {
 }

-void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
-                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
+                                      vector<Vec3>& forces, vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
@@ -79,13 +49,12 @@ void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCo
    
    // Signal the threads to start running and wait for them to finish.
    
-    Update1Task task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate1(threadIndex); });
    threads.waitForThreads();
 }

-void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
-                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
+                                      vector<Vec3>& forces, vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
@@ -97,13 +66,12 @@ void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCo
    
    // Signal the threads to start running and wait for them to finish.
    
-    Update2Task task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate2(threadIndex); });
    threads.waitForThreads();
 }

-void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
-                                       vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
+                                       vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
@@ -114,44 +82,44 @@ void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<RealVec>& atomCo
    
    // Signal the threads to start running and wait for them to finish.
    
-    Update3Task task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate3(threadIndex); });
    threads.waitForThreads();
 }

+/**
+ * Per-thread worker launched by updatePart1(): applies the velocity update to the
+ * slice of atoms assigned to this thread.
+ *
+ * @param threadIndex  index of the calling thread; selects the atom range and is passed to the random number generator
+ */
 void CpuLangevinDynamics::threadUpdate1(int threadIndex) {
-    const RealOpenMM tau = getTau();
-    const RealOpenMM vscale = EXP(-getDeltaT()/tau);
-    const RealOpenMM fscale = (1-vscale)*tau;
-    const RealOpenMM kT = BOLTZ*getTemperature();
-    const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
+    double dt = getDeltaT();
+    double friction = getFriction();
+    // Fraction of the old velocity that is kept after one step.
+    const double vscale = exp(-dt*friction);
+    // Scale applied to force*inverseMass; (1-vscale)/friction tends to dt as friction goes to 0,
+    // so the zero-friction case is handled explicitly to avoid dividing by zero.
+    const double fscale = (friction == 0 ? dt : (1-vscale)/friction);
+    const double kT = BOLTZ*getTemperature();
+    // Amplitude of the random term before the per-atom sqrt(inverseMass) factor is applied.
+    const double noisescale = sqrt(kT*(1-vscale*vscale));
+    // This thread handles atoms in the half-open range [start, end).
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();

    for (int i = start; i < end; i++) {
+        // Atoms with zero inverse mass are left untouched.
        if (inverseMasses[i] != 0.0) {
-            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
-            RealVec noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
+            double sqrtInvMass = sqrt(inverseMasses[i]);
+            // One Gaussian random number per Cartesian component, requested with this thread's index.
+            Vec3 noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
            velocities[i]  = velocities[i]*vscale + forces[i]*(fscale*inverseMasses[i]) + noise*(noisescale*sqrtInvMass);
        }
   }
 }

 void CpuLangevinDynamics::threadUpdate2(int threadIndex) {
-    const RealOpenMM dt = getDeltaT();
+    const double dt = getDeltaT();
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();

    for (int i = start; i < end; i++) {
        if (inverseMasses[i] != 0.0) {
-            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
+            double sqrtInvMass = sqrt(inverseMasses[i]);
            xPrime[i] = atomCoordinates[i]+velocities[i]*dt;
        }
   }
 }

 void CpuLangevinDynamics::threadUpdate3(int threadIndex) {
-   const RealOpenMM invStepSize = 1.0/getDeltaT();
+   const double invStepSize = 1.0/getDeltaT();
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();


--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -59,7 +59,7 @@ public:
 */
 class CpuNeighborList::Voxels {
 public:
-    Voxels(int blockSize, float vsy, float vsz, float miny, float maxy, float minz, float maxz, const RealVec* boxVectors, bool usePeriodic) :
+    Voxels(int blockSize, float vsy, float vsz, float miny, float maxy, float minz, float maxz, const Vec3* boxVectors, bool usePeriodic) :
            blockSize(blockSize), voxelSizeY(vsy), voxelSizeZ(vsz), miny(miny), maxy(maxy), minz(minz), maxz(maxz), usePeriodic(usePeriodic) {
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++)
@@ -409,21 +409,11 @@ private:
    vector<vector<vector<pair<float, int> > > > bins;
 };

-class CpuNeighborList::ThreadTask : public ThreadPool::Task {
-public:
-    ThreadTask(CpuNeighborList& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeNeighborList(threads, threadIndex);
-    }
-    CpuNeighborList& owner;
-};
-
 CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }

 void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
-            const RealVec* periodicBoxVectors, bool usePeriodic, float maxDistance, ThreadPool& threads) {
+            const Vec3* periodicBoxVectors, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+blockSize-1)/blockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
@@ -460,8 +450,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    // Sort the atoms based on a Hilbert curve.
    
    atomBins.resize(numAtoms);
-    ThreadTask task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeNeighborList(threads, threadIndex); });
    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp

-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -30,6 +30,7 @@
 #include "ReferencePME.h"
 #include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
+#include <iostream>

 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -41,23 +42,14 @@ using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
 const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;

-class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
-public:
-    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeDirect(threads, threadIndex);
-    }
-    CpuNonbondedForce& owner;
-};
-
 /**---------------------------------------------------------------------------------------

   CpuNonbondedForce constructor

   --------------------------------------------------------------------------------------- */

-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), ljpme(false), tableIsValid(false), expTableIsValid(false),
+    cutoffDistance(0.0f), alphaDispersionEwald(0.0f), alphaEwald(0.0f) {
 }

 CpuNonbondedForce::~CpuNonbondedForce() {
@@ -78,10 +70,21 @@ void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neig
        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
+    inverseRcut6 = pow(cutoffDistance, -6);
    neighborList = &neighbors;
    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
-  }
+    if(alphaDispersionEwald != 0.0f){
+        // We set this here, in case setUseCutoff is called after the dispersion alpha is set.
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+    }
+
+}

 /**---------------------------------------------------------------------------------------

@@ -96,7 +99,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    switchingDistance = distance;
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use periodic boundary conditions.  This requires that a cutoff has
     also been set, and the smallest side of the periodic box is at least twice the cutoff
@@ -106,7 +109,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuNonbondedForce::setPeriodic(Vec3* periodicBoxVectors) {

    assert(cutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
@@ -124,11 +127,11 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    periodicBoxVec4[1] = fvec4(periodicBoxVectors[1][0], periodicBoxVectors[1][1], periodicBoxVectors[1][2], 0);
    periodicBoxVec4[2] = fvec4(periodicBoxVectors[2][0], periodicBoxVectors[2][1], periodicBoxVectors[2][2], 0);
    triclinic = (periodicBoxVectors[0][1] != 0.0 || periodicBoxVectors[0][2] != 0.0 ||
-                 periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
-                 periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
-  }
+            periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
+            periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Ewald summation.

@@ -139,18 +142,18 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
-      if (alpha != alphaEwald)
-          tableIsValid = false;
-      alphaEwald = alpha;
-      numRx = kmaxx;
-      numRy = kmaxy;
-      numRz = kmaxz;
-      ewald = true;
-      tabulateEwaldScaleFactor();
-  }
+void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+    ewald = true;
+    numRx = kmaxx;
+    numRy = kmaxy;
+    numRz = kmaxz;
+
+    // A changed alpha marks the cached table invalid so that
+    // tabulateEwaldScaleFactor() rebuilds it instead of returning early.
+    const bool alphaChanged = (alpha != alphaEwald);
+    alphaEwald = alpha;
+    if (alphaChanged)
+        tableIsValid = false;
+    tabulateEwaldScaleFactor();
+}

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

     Set the force to use Particle-Mesh Ewald (PME) summation.

@@ -159,19 +162,49 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
-      if (alpha != alphaEwald)
-          tableIsValid = false;
-      alphaEwald = alpha;
-      meshDim[0] = meshSize[0];
-      meshDim[1] = meshSize[1];
-      meshDim[2] = meshSize[2];
-      pme = true;
-      tabulateEwaldScaleFactor();
-  }
-
-  
-  void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+    pme = true;
+    for (int axis = 0; axis < 3; axis++)
+        meshDim[axis] = meshSize[axis];
+
+    // A changed alpha marks the cached table invalid so that
+    // tabulateEwaldScaleFactor() rebuilds it instead of returning early.
+    const bool alphaChanged = (alpha != alphaEwald);
+    alphaEwald = alpha;
+    if (alphaChanged)
+        tableIsValid = false;
+    tabulateEwaldScaleFactor();
+}
+
+
+/**---------------------------------------------------------------------------------------
+
+     Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+
+     @param alpha     the Ewald separation parameter used for the dispersion terms
+     @param meshSize  the dimensions of the dispersion mesh
+
+     --------------------------------------------------------------------------------------- */
+
+void CpuNonbondedForce::setUseLJPME(float alpha, int meshSize[3]) {
+    // A changed alpha marks the cached exponential tables invalid so they get rebuilt below.
+    if (alpha != alphaDispersionEwald)
+        expTableIsValid = false;
+    alphaDispersionEwald = alpha;
+    dispersionMeshDim[0] = meshSize[0];
+    dispersionMeshDim[1] = meshSize[1];
+    dispersionMeshDim[2] = meshSize[2];
+    ljpme = true;
+    // NOTE(review): the table spacing is derived from the current cutoffDistance, so this
+    // presumably expects the cutoff to have been set already -- confirm against callers.
+    tabulateExpTerms();
+    if(cutoffDistance != 0.0f){
+        // We set this here, in case setUseLJPME is called after the cutoff is set
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+    }
+}
+
+
+void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    if (tableIsValid)
        return;
    tableIsValid = true;
@@ -187,10 +220,30 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
        ewaldScaleTable[i] = erfcTable[i] + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
 }
-  
-void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
-                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
-                                             vector<RealVec>& forces, double* totalEnergy) const {
+
+/**
+ * Build the lookup tables read by exptermsApprox() and dExptermsApprox().
+ *
+ * With a = alphaDispersionEwald and r the sample distance, the tables hold
+ *   exptermsTable[i]  = 1 - exp(-(a r)^2) * (1 + (a r)^2 + (a r)^4/2)
+ *   dExptermsTable[i] = 1 - exp(-(a r)^2) * (1 + (a r)^2 + (a r)^4/2 + (a r)^6/6)
+ * sampled every exptermsDX = cutoffDistance/NUM_TABLE_POINTS. Does nothing if the
+ * tables are already marked valid.
+ */
+void CpuNonbondedForce::tabulateExpTerms() {
+    if (expTableIsValid)
+        return;
+    expTableIsValid = true;
+    exptermsDX = cutoffDistance/NUM_TABLE_POINTS;
+    exptermsDXInv = 1.0f/exptermsDX;
+    exptermsTable.resize(NUM_TABLE_POINTS+4);
+    dExptermsTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        // Sample on this table's own grid: the lookups index with exptermsDXInv, so the
+        // spacing used here must be exptermsDX rather than the Ewald table's ewaldDX.
+        double r = i*exptermsDX;
+        double dalphaR = alphaDispersionEwald*r;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        exptermsTable[i]  = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+        dExptermsTable[i] = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4 + dar6/6.0));
+    }
+}
+
+void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<Vec3>& atomCoordinates,
+                                               const vector<pair<float, float> >& atomParameters, const vector<float> &C6params, const vector<set<int> >& exclusions,
+                                               vector<Vec3>& forces, double* totalEnergy) const {
    typedef std::complex<float> d_complex;

    static const float epsilon     =  1.0;
@@ -203,14 +256,37 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
    if (pme) {
        pme_t pmedata;
        pme_init(&pmedata, alphaEwald, numberOfAtoms, meshDim, 5, 1);
-        vector<RealOpenMM> charges(numberOfAtoms);
+        vector<double> charges(numberOfAtoms);
        for (int i = 0; i < numberOfAtoms; i++)
            charges[i] = posq[4*i+3];
-        RealOpenMM recipEnergy = 0.0;
+        double recipEnergy = 0.0;
        pme_exec(pmedata, atomCoordinates, forces, charges, periodicBoxVectors, &recipEnergy);
        if (totalEnergy)
            *totalEnergy += recipEnergy;
        pme_destroy(pmedata);
+
+        if (ljpme) {
+            // Dispersion reciprocal space terms
+            pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
+
+            std::vector<Vec3> dpmeforces;
+            for (int i = 0; i < numberOfAtoms; i++){
+                charges[i] = C6params[i];
+                dpmeforces.push_back(Vec3());
+            }
+            double recipDispersionEnergy = 0.0;
+            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
+            for (int i = 0; i < numberOfAtoms; i++){
+                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i][1] -= 2.0*dpmeforces[i][1];
+                forces[i][2] -= 2.0*dpmeforces[i][2];
+            }
+            if (totalEnergy)
+                *totalEnergy += recipDispersionEnergy;
+
+            pme_destroy(pmedata);
+        }
+
    }

    // Ewald method
@@ -224,7 +300,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c

        // setup K-vectors

-        #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
        vector<d_complex> eir(kmax*numberOfAtoms*3);
        vector<d_complex> tab_xy(numberOfAtoms);
        vector<d_complex> tab_qxyz(numberOfAtoms);
@@ -232,15 +308,15 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        for (int i = 0; (i < numberOfAtoms); i++) {
            float* pos = posq+4*i;
            for (int m = 0; (m < 3); m++)
-              EIR(0, i, m) = d_complex(1,0);
+                EIR(0, i, m) = d_complex(1,0);

            for (int m=0; (m<3); m++)
-              EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
-                                       sin(pos[m]*recipBoxSize[m]));
+                EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
+                                         sin(pos[m]*recipBoxSize[m]));

            for (int j=2; (j<kmax); j++)
-              for (int m=0; (m<3); m++)
-                EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
+                for (int m=0; (m<3); m++)
+                    EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
        }

        // calculate reciprocal space energy and forces
@@ -254,11 +330,11 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
                float ky = ry * recipBoxSize[1];
                if (ry >= 0) {
                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
+                        tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
                }
                else {
                    for (int n = 0; n < numberOfAtoms; n++)
-                      tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
+                        tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
                }
                for (int rz = lowrz; rz < numRz; rz++) {
                    if (rz >= 0) {
@@ -300,14 +376,15 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
 }


-void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<Vec3>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
+                                           const vector<float>& C6params, const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
    this->posq = posq;
    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
+    this->C6params = &C6params[0];
    this->exclusions = &exclusions[0];
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
@@ -318,8 +395,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    
    // Signal the threads to start running and wait for them to finish.
    
-    ComputeDirectTask task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeDirect(threads, threadIndex); });
    threads.waitForThreads();
    
    // Signal the threads to subtract the exclusions.
@@ -350,9 +426,8 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    float* forces = &(*threadForce)[threadIndex][0];
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (ewald || pme) {
+    if (ewald || pme || ljpme) {
        // Compute the interactions from the neighbor list.
-
        while (true) {
            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
            if (nextBlock >= neighborList->getNumBlocks())
@@ -370,7 +445,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                break;
            int end = min(start+groupSize, numberOfAtoms);
            for (int i = start; i < end; i++) {
-               fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
+                fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
                float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
                for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
                    if (*iter > i) {
@@ -394,7 +469,18 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                                threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
                        }
                        else if (includeEnergy)
-                           threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                            threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                        if (ljpme) {
+                            float C6ij = C6params[i]*C6params[j];
+                            float inverseR2 = 1.0f/r2;
+                            float emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                            if(includeEnergy)
+                                threadEnergy[threadIndex] += emult;
+                            float dEdR = -6.0f*C6ij*inverseR2*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                            fvec4 result = deltaR*dEdR;
+                            (fvec4(forces+4*i)-result).store(forces+4*i);
+                            (fvec4(forces+4*j)+result).store(forces+4*j);
+                        }
                    }
                }
            }
@@ -444,7 +530,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    }
    float sig       = atomParameters[ii].first + atomParameters[jj].first;
    float sig2      = inverseR*sig;
-          sig2     *= sig2;
+    sig2     *= sig2;
    float sig6      = sig2*sig2*sig2;

    float eps       = atomParameters[ii].second*atomParameters[jj].second;
@@ -476,7 +562,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    fvec4 result = deltaR*dEdR;
    (fvec4(forces+4*ii)+result).store(forces+4*ii);
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
-  }
+}

 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
@@ -502,3 +588,18 @@ float CpuNonbondedForce::erfcApprox(float x) {
    return coeff1*erfcTable[index] + coeff2*erfcTable[index+1];
 }

+float CpuNonbondedForce::exptermsApprox(float x) {
+    // Express the argument in table units and clamp the lower sample index to the table range.
+    const float scaled = x*exptermsDXInv;
+    const int lower = min((int) floor(scaled), NUM_TABLE_POINTS);
+    // Linear interpolation between the two neighbouring table entries.
+    const float upperWeight = scaled-lower;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*exptermsTable[lower] + upperWeight*exptermsTable[lower+1];
+}
+
+float CpuNonbondedForce::dExptermsApprox(float x) {
+    // Express the argument in table units and clamp the lower sample index to the table range.
+    const float scaled = x*exptermsDXInv;
+    const int lower = min((int) floor(scaled), NUM_TABLE_POINTS);
+    // Linear interpolation between the two neighbouring table entries.
+    const float upperWeight = scaled-lower;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*dExptermsTable[lower] + upperWeight*dExptermsTable[lower+1];
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -25,6 +25,7 @@
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec4.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -213,7 +214,6 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,

 void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.
-    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -263,7 +263,6 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
 template <int PERIODIC_TYPE>
 void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
-    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
@@ -278,9 +277,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    
+
    // Loop over neighbors for this block.
    
    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
@@ -318,7 +318,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec4 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec4 eps = blockAtomEpsilon*atomEpsilon;
+            fvec4 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -328,6 +329,17 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+
+            if (ljpme) {
+                fvec4 C6ij = C6s*C6params[atom];
+                fvec4 inverseR2 = inverseR*inverseR;
+                fvec4 mysig2 = sig*sig;
+                fvec4 mysig6 = mysig2*mysig2*mysig2;
+                fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
        }
        else {
            energy = 0.0f;
@@ -362,7 +374,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    }
    
    // Record the forces on the block atoms.
-    
+
    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
    transpose(f[0], f[1], f[2], f[3]);
    for (int j = 0; j < 4; j++)
@@ -420,3 +432,30 @@ fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
    transpose(t1, t2, t3, t4);
    return coeff1*t1 + coeff2*t2;
 }
+
+fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) { // Four-lane counterpart of the scalar table lookup: interpolates exptermsTable at each element of r.
+    fvec4 r1 = r*exptermsDXInv; // fractional table index for each lane
+    ivec4 index = min(floor(r1), NUM_TABLE_POINTS); // per-lane index, capped at NUM_TABLE_POINTS
+    fvec4 coeff2 = r1-index; // weight given to the upper table entry
+    fvec4 coeff1 = 1.0f-coeff2; // weight given to the lower table entry
+    fvec4 t1(&exptermsTable[index[0]]); // presumably loads consecutive table entries starting at lane 0's index -- confirm against fvec4's pointer constructor
+    fvec4 t2(&exptermsTable[index[1]]); // same, starting at lane 1's index
+    fvec4 t3(&exptermsTable[index[2]]); // same, starting at lane 2's index
+    fvec4 t4(&exptermsTable[index[3]]); // same, starting at lane 3's index
+    transpose(t1, t2, t3, t4); // NOTE(review): assumed to leave t1 = table[index] and t2 = table[index+1] across all lanes
+    return coeff1*t1 + coeff2*t2; // only t1 and t2 are used after the transpose
+}
+
+fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) { // Four-lane counterpart of the scalar table lookup: interpolates dExptermsTable at each element of r.
+    fvec4 r1 = r*exptermsDXInv; // fractional table index for each lane
+    ivec4 index = min(floor(r1), NUM_TABLE_POINTS); // per-lane index, capped at NUM_TABLE_POINTS
+    fvec4 coeff2 = r1-index; // weight given to the upper table entry
+    fvec4 coeff1 = 1.0f-coeff2; // weight given to the lower table entry
+    fvec4 t1(&dExptermsTable[index[0]]); // presumably loads consecutive table entries starting at lane 0's index -- confirm against fvec4's pointer constructor
+    fvec4 t2(&dExptermsTable[index[1]]); // same, starting at lane 1's index
+    fvec4 t3(&dExptermsTable[index[2]]); // same, starting at lane 2's index
+    fvec4 t4(&dExptermsTable[index[3]]); // same, starting at lane 3's index
+    transpose(t1, t2, t3, t4); // NOTE(review): assumed to leave t1 = table[index] and t2 = table[index+1] across all lanes
+    return coeff1*t1 + coeff2*t2; // only t1 and t2 are used after the transpose
+}
+
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -27,6 +27,7 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/hardware.h"
 #include <algorithm>
+#include <iostream>

 using namespace std;
 using namespace OpenMM;
@@ -80,8 +81,7 @@ CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
 enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};

 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Determine whether we need to apply periodic boundary conditions.
-    
+    // Determine whether we need to apply periodic boundary conditions.    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -308,6 +308,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    fvec8 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]], C6params[blockAtom[4]], C6params[blockAtom[5]], C6params[blockAtom[6]], C6params[blockAtom[7]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
@@ -348,7 +349,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec8 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec8 sig6 = sig2*sig2*sig2;
-            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec8 eps = blockAtomEpsilon*atomEpsilon;
+            fvec8 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -358,6 +360,17 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+            if (ljpme) {
+                fvec8 C6ij = C6s*C6params[atom];
+                fvec8 inverseR2 = inverseR*inverseR;
+                fvec8 mysig2 = sig*sig;
+                fvec8 mysig6 = mysig2*mysig2*mysig2;
+                fvec8 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec8 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
+
        }
        else {
            energy = 0.0f;
@@ -464,4 +477,45 @@ fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
    return coeff1*s1 + coeff2*s2;
 }
+
+fvec8 CpuNonbondedForceVec8::exptermsApprox(const fvec8& r) { // Eight-lane table lookup: interpolates exptermsTable at each element of r.
+    fvec8 r1 = r*exptermsDXInv; // fractional table index for each lane
+    ivec8 index = min(floor(r1), NUM_TABLE_POINTS); // per-lane index, capped at NUM_TABLE_POINTS
+    fvec8 coeff2 = r1-index; // weight given to the upper table entry
+    fvec8 coeff1 = 1.0f-coeff2; // weight given to the lower table entry
+    ivec4 indexLower = index.lowerVec(); // split the eight indices into two four-lane halves
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&exptermsTable[indexLower[0]]); // presumably each load reads consecutive table entries starting at that lane's index -- confirm against fvec4's pointer constructor
+    fvec4 t2(&exptermsTable[indexLower[1]]);
+    fvec4 t3(&exptermsTable[indexLower[2]]);
+    fvec4 t4(&exptermsTable[indexLower[3]]);
+    fvec4 t5(&exptermsTable[indexUpper[0]]);
+    fvec4 t6(&exptermsTable[indexUpper[1]]);
+    fvec4 t7(&exptermsTable[indexUpper[2]]);
+    fvec4 t8(&exptermsTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4; // outputs of the transpose below
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4); // NOTE(review): assumed to leave s1 = table[index] and s2 = table[index+1] across all lanes
+    return coeff1*s1 + coeff2*s2; // s3 and s4 are not used
+}
+
+fvec8 CpuNonbondedForceVec8::dExptermsApprox(const fvec8& r) { // Eight-lane table lookup: interpolates dExptermsTable at each element of r.
+    fvec8 r1 = r*exptermsDXInv; // fractional table index for each lane
+    ivec8 index = min(floor(r1), NUM_TABLE_POINTS); // per-lane index, capped at NUM_TABLE_POINTS
+    fvec8 coeff2 = r1-index; // weight given to the upper table entry
+    fvec8 coeff1 = 1.0f-coeff2; // weight given to the lower table entry
+    ivec4 indexLower = index.lowerVec(); // split the eight indices into two four-lane halves
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&dExptermsTable[indexLower[0]]); // presumably each load reads consecutive table entries starting at that lane's index -- confirm against fvec4's pointer constructor
+    fvec4 t2(&dExptermsTable[indexLower[1]]);
+    fvec4 t3(&dExptermsTable[indexLower[2]]);
+    fvec4 t4(&dExptermsTable[indexLower[3]]);
+    fvec4 t5(&dExptermsTable[indexUpper[0]]);
+    fvec4 t6(&dExptermsTable[indexUpper[1]]);
+    fvec4 t7(&dExptermsTable[indexUpper[2]]);
+    fvec4 t8(&dExptermsTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4; // outputs of the transpose below
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4); // NOTE(review): assumed to leave s1 = table[index] and s2 = table[index+1] across all lanes
+    return coeff1*s1 + coeff2*s2; // s3 and s4 are not used
+}
+
 #endif
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -127,6 +127,8 @@ void CpuPlatform::contextDestroyed(ContextImpl& context) const {
    PlatformData* data = contextData[&context];
    delete data;
    contextData.erase(&context);
+    ReferencePlatform::PlatformData* refPlatformData = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    delete refPlatformData;
 }

 CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,56 +35,10 @@
 using namespace OpenMM;
 using namespace std;

-class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
-public:
-    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& atomCoordinatesP;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-
-class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
-public:
-    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& velocities;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-
 CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
    int numBlocks = 10*threads.getNumThreads();
    int numClusters = settle.getNumClusters();
-    vector<RealOpenMM> mass(system.getNumParticles());
+    vector<double> mass(system.getNumParticles());
    for (int i = 0; i < system.getNumParticles(); i++)
        mass[i] = system.getParticleMass(i);
    for (int i = 0; i < numBlocks; i++) {
@@ -93,7 +47,7 @@ CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settl
        if (start != end) {
            int numThreadClusters = end-start;
            vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
-            vector<RealOpenMM> distance1(numThreadClusters), distance2(numThreadClusters);
+            vector<double> distance1(numThreadClusters), distance2(numThreadClusters);
            for (int j = 0; j < numThreadClusters; j++)
                settle.getClusterParameters(start+j, atom1[j], atom2[j], atom3[j], distance1[j], distance2[j]);
            threadSettle.push_back(new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, mass));
@@ -106,14 +60,30 @@ CpuSETTLE::~CpuSETTLE() {
        delete threadSettle[i];
 }

-void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
-    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
-    threads.execute(task);
+void CpuSETTLE::apply(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& atomCoordinatesP, vector<double>& inverseMasses, double tolerance) {
+    // Shared counter from which each worker claims the next entry of threadSettle.
+    gmx_atomic_t nextBlock;
+    gmx_atomic_set(&nextBlock, 0);
+    threads.execute([&] (ThreadPool& pool, int threadIndex) {
+        // Keep claiming indices until the counter runs past the end of threadSettle.
+        int block;
+        while ((block = gmx_atomic_fetch_add(&nextBlock, 1)) < threadSettle.size())
+            threadSettle[block]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+    });
+    // The lambda captures locals by reference, so wait for the workers before returning.
     threads.waitForThreads();
 }

-void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
-    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
-    threads.execute(task);
+void CpuSETTLE::applyToVelocities(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& velocities, vector<double>& inverseMasses, double tolerance) {
+    // Shared counter from which each worker claims the next entry of threadSettle.
+    gmx_atomic_t nextBlock;
+    gmx_atomic_set(&nextBlock, 0);
+    threads.execute([&] (ThreadPool& pool, int threadIndex) {
+        // Keep claiming indices until the counter runs past the end of threadSettle.
+        int block;
+        while ((block = gmx_atomic_fetch_add(&nextBlock, 1)) < threadSettle.size())
+            threadSettle[block]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+    });
+    // The lambda captures locals by reference, so wait for the workers before returning.
     threads.waitForThreads();
 }
--- a/platforms/cpu/staticTarget/CMakeLists.txt
+++ b/platforms/cpu/staticTarget/CMakeLists.txt
@@ -16,7 +16,6 @@ ENDFOREACH(file)
 ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}_static ${PTHREADS_LIB_STATIC})
-#-DPTW32_STATIC_LIB only works for the windows pthreads.
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_STATIC_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/cpu/tests/TestCpuDispersionPME.cpp
+++ b/platforms/cpu/tests/TestCpuDispersionPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CpuTests.h"
+#include "TestDispersionPME.h"
+
+void runPlatformTests() { // No platform-specific tests are defined here; presumably the shared tests come from TestDispersionPME.h -- confirm.
+}
+
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -51,16 +51,16 @@ using namespace std;
 void testNeighborList(bool periodic, bool triclinic) {
    const int numParticles = 500;
    const float cutoff = 2.0f;
-    RealVec boxVectors[3];
+    Vec3 boxVectors[3];
    if (triclinic) {
-        boxVectors[0] = RealVec(10, 0, 0);
-        boxVectors[1] = RealVec(4, 9, 0);
-        boxVectors[2] = RealVec(-3, -3.5, 11);
+        boxVectors[0] = Vec3(10, 0, 0);
+        boxVectors[1] = Vec3(4, 9, 0);
+        boxVectors[2] = Vec3(-3, -3.5, 11);
    }
    else {
-        boxVectors[0] = RealVec(10, 0, 0);
-        boxVectors[1] = RealVec(0, 9, 0);
-        boxVectors[2] = RealVec(0, 0, 11);
+        boxVectors[0] = Vec3(10, 0, 0);
+        boxVectors[1] = Vec3(0, 9, 0);
+        boxVectors[2] = Vec3(0, 0, 11);
    }
    const float boxSize[3] = {(float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2]};
    const int blockSize = 8;

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -494,6 +494,10 @@ public:
    CudaNonbondedUtilities& getNonbondedUtilities() {
        return *nonbonded;
    }
+    /**
+     * Set the particle charges.  These are packed into the fourth element of the posq array.
+     */
+    void setCharges(const std::vector<double>& charges);
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -577,6 +581,12 @@ public:
     * and order to be revalidated.
     */
    void invalidateMolecules();
+    /**
+     * Mark that the current molecule definitions from one particular force (and hence the atom order)
+     * may be invalid.  This should be called whenever force field parameters change.  It will cause the
+     * definitions and order to be revalidated.
+     */
+    bool invalidateMolecules(CudaForceInfo* force);
 private:
    /**
     * Compute a sorted list of device indices in decreasing order of desirability
@@ -626,6 +636,7 @@ private:
    CUfunction clearFourBuffersKernel;
    CUfunction clearFiveBuffersKernel;
    CUfunction clearSixBuffersKernel;
+    CUfunction setChargesKernel;
    std::vector<CudaForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -638,6 +649,7 @@ private:
    CudaArray* energyBuffer;
    CudaArray* energyParamDerivBuffer;
    CudaArray* atomIndexDevice;
+    CudaArray* chargeBuffer;
    std::vector<std::string> energyParamDerivNames;
    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<int> atomIndex;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -198,7 +198,6 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    class GetPositionsTask;
    CudaContext& cu;
 };

@@ -292,9 +291,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -332,9 +333,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
 private:
+    class ForceInfo;
    int numBonds;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -375,9 +378,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -415,9 +420,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
 private:
+    class ForceInfo;
    int numAngles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -458,9 +465,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params;
 };
@@ -498,9 +507,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaArray* params1;
    CudaArray* params2;
@@ -539,9 +550,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    std::vector<int2> mapPositionsVec;
    CudaArray* coefficients;
@@ -582,9 +595,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
 private:
+    class ForceInfo;
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -599,7 +614,8 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeDispersionBsplineModuliX(NULL), pmeDispersionBsplineModuliY(NULL),
+            pmeDispersionBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -636,6 +652,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class SortTrait : public CudaSort::SortTrait {
        int getDataSize() const {return 8;}
@@ -647,12 +672,14 @@ private:
        const char* getMaxValue() const {return "make_int2(2147483647, 2147483647)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class ForceInfo;
    class PmeIO;
    class PmePreComputation;
    class PmePostComputation;
    class SyncStreamPreComputation;
    class SyncStreamPostComputation;
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
    CudaArray* exceptionParams;
@@ -662,6 +689,9 @@ private:
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
+    CudaArray* pmeDispersionBsplineModuliX;
+    CudaArray* pmeDispersionBsplineModuliY;
+    CudaArray* pmeDispersionBsplineModuliZ;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaArray* pmeEnergyBuffer;
@@ -673,20 +703,29 @@ private:
    CudaFFT3D* fft;
    cufftHandle fftForward;
    cufftHandle fftBackward;
+    CudaFFT3D* dispersionFft;
+    cufftHandle dispersionFftForward;
+    cufftHandle dispersionFftBackward;
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
+    CUfunction pmeDispersionGridIndexKernel;
    CUfunction pmeSpreadChargeKernel;
+    CUfunction pmeDispersionSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
+    CUfunction pmeDispersionFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
+    CUfunction pmeEvalDispersionEnergyKernel;
    CUfunction pmeConvolutionKernel;
+    CUfunction pmeDispersionConvolutionKernel;
    CUfunction pmeInterpolateForceKernel;
-    std::map<std::string, std::string> pmeDefines;
+    CUfunction pmeInterpolateDispersionForceKernel;
    std::vector<std::pair<int, int> > exceptionAtoms;
-    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int interpolateForceThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };
@@ -724,8 +763,10 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    class ForceInfo;
    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* interactionGroupData;
@@ -775,10 +816,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
+    class ForceInfo;
    double prefactor, surfaceAreaFactor, cutoff;
    bool hasCreatedKernels;
    int maxTiles;
    CudaContext& cu;
+    ForceInfo* info;
    CudaArray* params;
    CudaArray* bornSum;
    CudaArray* bornRadii;
@@ -825,10 +868,12 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
+    class ForceInfo;
    double cutoff;
    bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
    int maxTiles, numComputedValues;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaParameterSet* computedValues;
    CudaParameterSet* energyDerivs;
@@ -882,9 +927,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
 private:
+    class ForceInfo;
    int numParticles;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    const System& system;
    CudaParameterSet* params;
    CudaArray* globals;
@@ -926,9 +973,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
 private:
+    class ForceInfo;
    int numDonors, numAcceptors;
    bool hasInitializedKernel;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* donorParams;
    CudaParameterSet* acceptorParams;
    CudaArray* globals;
@@ -978,9 +1027,11 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCentroidBondForce& force);

 private:
+    class ForceInfo;
    int numGroups, numBonds;
    bool needEnergyParamDerivs;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* groupParticles;
@@ -1031,8 +1082,10 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);

 private:
+    class ForceInfo;
    int numBonds;
    CudaContext& cu;
+    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray* globals;
    std::vector<std::string> globalParamNames;
@@ -1077,7 +1130,9 @@ public:
    void copyParametersToContext(ContextImpl& context, const CustomManyParticleForce& force);

 private:
+    class ForceInfo;
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedKernel;
    NonbondedMethod nonbondedMethod;
    int maxNeighborPairs, forceWorkgroupSize, findNeighborsWorkgroupSize;
@@ -1139,9 +1194,11 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
 private:
+    class ForceInfo;
    class ReorderListener;
    void sortAtoms();
    CudaContext& cu;
+    ForceInfo* info;
    bool hasInitializedKernels;
    int numRealParticles, numExceptions, maxNeighborBlocks;
    GayBerneForce::NonbondedMethod nonbondedMethod;
@@ -1432,7 +1489,7 @@ private:
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
-    void recordGlobalValue(double value, GlobalTarget target);
+    void recordGlobalValue(double value, GlobalTarget target, CustomIntegrator& integrator);
    void recordChangedParameters(ContextImpl& context);
    bool evaluateCondition(int step);
    CudaContext& cu;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -71,15 +71,16 @@ public:
    /**
     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
     *
-     * @param usesCutoff     specifies whether a cutoff should be applied to this interaction
-     * @param usesPeriodic   specifies whether periodic boundary conditions should be applied to this interaction
-     * @param usesExclusions specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
-     * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
-     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
-     * @param kernel         the code to evaluate the interaction
-     * @param forceGroup     the force group in which the interaction should be calculated
+     * @param usesCutoff       specifies whether a cutoff should be applied to this interaction
+     * @param usesPeriodic     specifies whether periodic boundary conditions should be applied to this interaction
+     * @param usesExclusions   specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
+     * @param cutoffDistance   the cutoff distance for this interaction (ignored if usesCutoff is false)
+     * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
+     * @param kernel           the code to evaluate the interaction
+     * @param forceGroup       the force group in which the interaction should be calculated
+     * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
     */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList=false);
    /**
     * Add a per-atom parameter that the default interaction kernel may depend on.
     */
@@ -189,6 +190,12 @@ public:
    CudaArray& getInteractingAtoms() {
        return *interactingAtoms;
    }
+    /**
+     * Get the array containing single pairs in the neighbor list.
+     */
+    CudaArray& getSinglePairs() {
+        return *singlePairs; // NOTE(review): dereferences singlePairs unchecked - presumably only called after the neighbor list arrays have been created; confirm
+    }
    /**
     * Get the array containing exclusion flags.
     */
@@ -270,6 +277,8 @@ private:
    CudaArray* interactingTiles;
    CudaArray* interactingAtoms;
    CudaArray* interactionCount;
+    CudaArray* singlePairs;
+    CudaArray* singlePairCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
    CudaArray* sortedBlocks;
@@ -288,8 +297,8 @@ private:
    std::map<int, double> groupCutoff;
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
-    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
-    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
 };

 /**

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -83,7 +83,7 @@ private:
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
    std::vector<double> contextNonbondedFractions;
-    int* tileCounts;
+    int2* interactionCounts;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;
@@ -439,6 +439,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the PME parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class Task;
    CudaPlatform::PlatformData& data;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -52,6 +52,7 @@
 #include <set>
 #include <sstream>
 #include <typeinfo>
+#include <sys/stat.h>
 #include <cudaProfiler.h>
 #ifndef WIN32
  #include <unistd.h>
@@ -107,7 +108,8 @@ static int executeInWindows(const string &command) {
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
-        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
+        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    // Determine what compiler to use.
    
    this->compiler = "\""+compiler+"\"";
@@ -127,9 +129,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    string testCompilerCommand = this->compiler+" --version > /dev/null 2> /dev/null";
    int res = std::system(testCompilerCommand.c_str());
 #endif
-    isNvccAvailable = (res == 0);
+    struct stat info;
+    isNvccAvailable = (res == 0 && stat(tempDir.c_str(), &info) == 0);
+    int cudaDriverVersion;
+    cuDriverGetVersion(&cudaDriverVersion);
    static bool hasShownNvccWarning = false;
-    if (hasCompilerKernel && !isNvccAvailable && !hasShownNvccWarning) {
+    if (hasCompilerKernel && !isNvccAvailable && !hasShownNvccWarning && cudaDriverVersion < 8000) {
        hasShownNvccWarning = true;
        printf("Could not find nvcc.  Using runtime compiler, which may produce slower performance.  ");
 #ifdef WIN32
@@ -205,14 +210,15 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking

    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-#if __CUDA_API_VERSION < 7000
+    int numThreadBlocksPerComputeUnit = (major >= 6 ? 4 : 6);
+    if (cudaDriverVersion < 7000) {
        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
        // its compute capability as 5.2, but the compiler doesn't support
        // anything beyond 5.0.
        if (major == 5)
            minor = 0;
-#endif
-#if __CUDA_API_VERSION < 8000
+    }
+    if (cudaDriverVersion < 8000) {
        // This is a workaround to support Pascal with CUDA 7.5.  It reports
        // its compute capability as 6.x, but the compiler doesn't support
        // anything beyond 5.3.
@@ -220,7 +226,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            major = 5;
            minor = 3;
        }
-#endif
+    }
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;

@@ -241,7 +247,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
-    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
    if (useDoublePrecision) {
        posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
@@ -287,6 +292,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    setChargesKernel = getKernel(utilities, "setCharges");

    // Set defines based on the requested precision.

@@ -403,6 +409,8 @@ CudaContext::~CudaContext() {
        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
+    if (chargeBuffer != NULL)
+        delete chargeBuffer;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
@@ -856,6 +864,25 @@ void CudaContext::clearAutoclearBuffers() {
    }
 }

+void CudaContext::setCharges(const vector<double>& charges) { // Stage the given charges in the pinned host buffer, upload them to chargeBuffer, then run the setCharges kernel.
+    if (chargeBuffer == NULL) // allocated on the first call and kept for later calls; ~CudaContext deletes it
+        chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); // numAtoms elements whose size matches the active precision
+    if (getUseDoublePrecision()) {
+        double* c = (double*) getPinnedBuffer();
+        for (int i = 0; i < charges.size(); i++) // NOTE(review): copies charges.size() values while the device buffer holds numAtoms elements - presumably callers always pass exactly numAtoms charges; confirm
+            c[i] = charges[i];
+        chargeBuffer->upload(c);
+    }
+    else {
+        float* c = (float*) getPinnedBuffer(); // single precision: each charge is narrowed to float while staging
+        for (int i = 0; i < charges.size(); i++)
+            c[i] = (float) charges[i];
+        chargeBuffer->upload(c);
+    }
+    void* args[] = {&chargeBuffer->getDevicePointer(), &posq->getDevicePointer(), &atomIndexDevice->getDevicePointer(), &numAtoms}; // kernel inputs: the uploaded charges, posq, the atom index array and the atom count
+    executeKernel(setChargesKernel, args, numAtoms); // NOTE(review): presumably the kernel stores each charge into posq, using atomIndexDevice to account for atom reordering - confirm against the kernel source
+}
+
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
@@ -1054,9 +1081,19 @@ void CudaContext::findMoleculeGroups() {
 }

 void CudaContext::invalidateMolecules() { // Run the per-force molecule validity check for every entry in forces.
+    for (int i = 0; i < forces.size(); i++)
+        if (invalidateMolecules(forces[i])) // the per-force overload returns true only after it has rebuilt the molecule data and reordered atoms
+            return; // one rebuild is enough, so the remaining forces are not checked
+}
+
+bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
-        return;
+        return false;
    bool valid = true;
+    int forceIndex = -1;
+    for (int i = 0; i < forces.size(); i++)
+        if (forces[i] == force)
+            forceIndex = i;
    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
        MoleculeGroup& mol = moleculeGroups[group];
        vector<int>& instances = mol.instances;
@@ -1071,22 +1108,21 @@ void CudaContext::invalidateMolecules() {
            Molecule& m2 = molecules[instances[j]];
            int offset2 = offsets[j];
            for (int i = 0; i < (int) atoms.size() && valid; i++) {
-                for (int k = 0; k < (int) forces.size(); k++)
-                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
-                        valid = false;
+                if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+                    valid = false;
            }

            // See if the force groups are identical.

-            for (int i = 0; i < (int) forces.size() && valid; i++) {
-                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
-                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
+            if (valid && forceIndex > -1) {
+                for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
+                    if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
                        valid = false;
            }
        }
    }
    if (valid)
-        return;
+        return false;

    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
@@ -1154,6 +1190,7 @@ void CudaContext::invalidateMolecules() {
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
    reorderAtoms();
+    return true;
 }

 void CudaContext::reorderAtoms() {