Merge remote-tracking branch 'upstream/master'

047934e2 · Rafal P. Wiewiora · ce3a5dc0 · d12c9bd1 · 047934e2 · 047934e2
Commit 047934e2 authored Mar 01, 2017 by Rafal P. Wiewiora
20 changed files
--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
-/* Portions copyright (c) 2009-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -37,16 +37,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomManyParticleForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomManyParticleForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomManyParticleForce& owner;
-};
 CpuCustomManyParticleForce::CpuCustomManyParticleForce(const CustomManyParticleForce& force, ThreadPool& threads) :
            threads(threads), useCutoff(false), usePeriodic(false), neighborList(NULL) {
    numParticles = force.getNumParticles();
@@ -98,7 +88,7 @@ CpuCustomManyParticleForce::~CpuCustomManyParticleForce() {
        delete threadData[i];
 }
-void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, RealOpenMM** particleParameters,
+void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, double** particleParameters,
                                                  const map<string, double>& globalParameters, vector<AlignedArray<float> >& threadForce,
                                                  bool includeForces, bool includeEnergy, double& energy) {
    // Record the parameters for the threads.
@@ -141,8 +131,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, RealOpe
    // Signal the threads to start running and wait for them to finish.
-    ComputeForceTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Combine the energies from all the threads.
@@ -191,14 +180,14 @@ void CpuCustomManyParticleForce::threadComputeForce(ThreadPool& threads, int thr
    }
 }
-void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) {
+void CpuCustomManyParticleForce::setUseCutoff(double distance) {
    useCutoff = true;
    cutoffDistance = distance;
    if (neighborList == NULL)
        neighborList = new CpuNeighborList(4);
 }
-void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuCustomManyParticleForce::setPeriodic(Vec3* periodicBoxVectors) {
    assert(useCutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance);
@@ -220,7 +209,7 @@ void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) {
 }
 void CpuCustomManyParticleForce::loopOverInteractions(vector<int>& availableParticles, vector<int>& particleSet, int loopIndex, int startIndex,
-                                                          RealOpenMM** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
+                                                          double** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
    int numParticles = availableParticles.size();
    double cutoff2 = cutoffDistance*cutoffDistance;
    int checkRange = (centralParticleMode ? 1 : loopIndex);
@@ -254,7 +243,7 @@ void CpuCustomManyParticleForce::loopOverInteractions(vector<int>& availablePart
    }
 }
-void CpuCustomManyParticleForce::calculateOneIxn(vector<int>& particleSet, RealOpenMM** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
+void CpuCustomManyParticleForce::calculateOneIxn(vector<int>& particleSet, double** particleParameters, float* forces, ThreadData& data, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Select the ordering to use for the particles.
    vector<int>& permutedParticles = data.permutedParticles;

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
-/* Portions copyright (c) 2009-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -33,16 +33,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuCustomNonbondedForce::ComputeForceTask : public ThreadPool::Task {
-public:
-    ComputeForceTask(CpuCustomNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuCustomNonbondedForce& owner;
-};
 CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression& energyExpression, const Lepton::CompiledExpression& forceExpression,
            const vector<string>& parameterNames, const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions) :
            energyExpression(energyExpression), forceExpression(forceExpression), energyParamDerivExpressions(energyParamDerivExpressions) {
@@ -70,7 +60,7 @@ CpuCustomNonbondedForce::ThreadData::ThreadData(const Lepton::CompiledExpression
 CpuCustomNonbondedForce::CpuCustomNonbondedForce(const Lepton::CompiledExpression& energyExpression,
            const Lepton::CompiledExpression& forceExpression, const vector<string>& parameterNames, const vector<set<int> >& exclusions,
            const std::vector<Lepton::CompiledExpression> energyParamDerivExpressions, ThreadPool& threads) :
-            cutoff(false), useSwitch(false), periodic(false), paramNames(parameterNames), exclusions(exclusions), threads(threads) {
+            cutoff(false), useSwitch(false), periodic(false), useInteractionGroups(false), paramNames(parameterNames), exclusions(exclusions), threads(threads) {
    for (int i = 0; i < threads.getNumThreads(); i++)
        threadData.push_back(new ThreadData(energyExpression, forceExpression, parameterNames, energyParamDerivExpressions));
 }
@@ -80,13 +70,14 @@ CpuCustomNonbondedForce::~CpuCustomNonbondedForce() {
        delete threadData[i];
 }
-void CpuCustomNonbondedForce::setUseCutoff(RealOpenMM distance, const CpuNeighborList& neighbors) {
+void CpuCustomNonbondedForce::setUseCutoff(double distance, const CpuNeighborList& neighbors) {
    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
  }
 void CpuCustomNonbondedForce::setInteractionGroups(const vector<pair<set<int>, set<int> > >& groups) {
+    useInteractionGroups = true;
    for (int group = 0; group < (int) groups.size(); group++) {
        const set<int>& set1 = groups[group].first;
        const set<int>& set2 = groups[group].second;
@@ -102,12 +93,12 @@ void CpuCustomNonbondedForce::setInteractionGroups(const vector<pair<set<int>, s
    }
 }
-void CpuCustomNonbondedForce::setUseSwitchingFunction(RealOpenMM distance) {
+void CpuCustomNonbondedForce::setUseSwitchingFunction(double distance) {
    useSwitch = true;
    switchingDistance = distance;
 }
-void CpuCustomNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuCustomNonbondedForce::setPeriodic(Vec3* periodicBoxVectors) {
    assert(cutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance);
@@ -129,8 +120,8 @@ void CpuCustomNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
 }
-void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, vector<RealVec>& atomCoordinates, RealOpenMM** atomParameters,
+void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, vector<Vec3>& atomCoordinates, double** atomParameters,
-                                             RealOpenMM* fixedParameters, const map<string, double>& globalParameters,
+                                               double* fixedParameters, const map<string, double>& globalParameters,
                                               vector<AlignedArray<float> >& threadForce, bool includeForce, bool includeEnergy, double& totalEnergy, double* energyParamDerivs) {
    // Record the parameters for the threads.
@@ -149,8 +140,7 @@ void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, v
    // Signal the threads to start running and wait for them to finish.
-    ComputeForceTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Combine the energies from all the threads.
@@ -183,7 +173,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
        data.energyParamDerivs[i] = 0.0;
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (groupInteractions.size() > 0) {
+    if (useInteractionGroups) {
        // The user has specified interaction groups, so compute only the requested interactions.
        while (true) {

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -37,16 +36,6 @@ const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
 const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
 const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
-class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex);
-    }
-    CpuGBSAOBCForce& owner;
-};
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
@@ -89,6 +78,10 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
    particleParams = params;
    bornRadii.resize(params.size()+3);
    obcChain.resize(params.size()+3);
+    for (int i = bornRadii.size()-3; i < bornRadii.size(); i++) {
+        bornRadii[i] = 0;
+        obcChain[i] = 0;
+    }
 }
 void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
@@ -107,9 +100,8 @@ void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<Align
    // Signal the threads to start running and wait for them to finish.
-    ComputeTask task(*this);
    gmx_atomic_set(&counter, 0);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
    threads.waitForThreads(); // Compute Born radii
    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();

--- a/platforms/cpu/src/CpuGayBerneForce.cpp
+++ b/platforms/cpu/src/CpuGayBerneForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Portions copyright (c) 2016-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -44,17 +44,6 @@
 using namespace OpenMM;
 using namespace std;
-class CpuGayBerneForce::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuGayBerneForce& owner, CpuNeighborList* neighborList) : owner(owner), neighborList(neighborList) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeForce(threads, threadIndex, neighborList);
-    }
-    CpuGayBerneForce& owner;
-    CpuNeighborList* neighborList;
-};
 CpuGayBerneForce::CpuGayBerneForce(const GayBerneForce& force) {
    // Record the force parameters.
@@ -111,7 +100,7 @@ const vector<set<int> >& CpuGayBerneForce::getExclusions() const {
    return particleExclusions;
 }
-RealOpenMM CpuGayBerneForce::calculateForce(const vector<RealVec>& positions, std::vector<RealVec>& forces, std::vector<AlignedArray<float> >& threadForce, RealVec* boxVectors, CpuPlatform::PlatformData& data) {
+double CpuGayBerneForce::calculateForce(const vector<Vec3>& positions, std::vector<Vec3>& forces, std::vector<AlignedArray<float> >& threadForce, Vec3* boxVectors, CpuPlatform::PlatformData& data) {
    if (nonbondedMethod == GayBerneForce::CutoffPeriodic) {
        double minAllowedSize = 1.999999*cutoffDistance;
        if (boxVectors[0][0] < minAllowedSize || boxVectors[1][1] < minAllowedSize || boxVectors[2][2] < minAllowedSize)
@@ -137,8 +126,7 @@ RealOpenMM CpuGayBerneForce::calculateForce(const vector<RealVec>& positions, st
    // Signal the threads to compute the pairwise interactions.
-    ComputeTask task(*this, data.neighborList);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex, data.neighborList); });
-    threads.execute(task);
    threads.waitForThreads();
    // Signal the threads to compute exceptions.
@@ -164,10 +152,10 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
    int numThreads = threads.getNumThreads();
    threadEnergy[threadIndex] = 0;
    float* forces = &(*threadForce)[threadIndex][0];
-    vector<RealVec>& torques = threadTorque[threadIndex];
+    vector<Vec3>& torques = threadTorque[threadIndex];
    torques.resize(numParticles);
    for (int i = 0; i < numParticles; i++)
-        torques[i] = RealVec();
+        torques[i] = Vec3();
    double energy = 0.0;
    // Compute this thread's subset of interactions.
@@ -184,8 +172,8 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
                    continue;
                if (particleExclusions[i].find(j) != particleExclusions[i].end())
                    continue; // This interaction will be handled by an exception.
-                RealOpenMM sigma = particles[i].sigmaOver2+particles[j].sigmaOver2;
+                double sigma = particles[i].sigmaOver2+particles[j].sigmaOver2;
-                RealOpenMM epsilon = particles[i].sqrtEpsilon*particles[j].sqrtEpsilon;
+                double epsilon = particles[i].sqrtEpsilon*particles[j].sqrtEpsilon;
                energy += computeOneInteraction(i, j, sigma, epsilon, positions, forces, torques, boxVectors);
            }
        }
@@ -208,8 +196,8 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
                        int second = blockAtom[k];
                        if (particles[second].sqrtEpsilon == 0.0f)
                            continue;
-                        RealOpenMM sigma = particles[first].sigmaOver2+particles[second].sigmaOver2;
+                        double sigma = particles[first].sigmaOver2+particles[second].sigmaOver2;
-                        RealOpenMM epsilon = particles[first].sqrtEpsilon*particles[second].sqrtEpsilon;
+                        double epsilon = particles[first].sqrtEpsilon*particles[second].sqrtEpsilon;
                        energy += computeOneInteraction(first, second, sigma, epsilon, positions, forces, torques, boxVectors);
                    }
                }
@@ -235,39 +223,39 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
    threadEnergy[threadIndex] = energy;
 }
-void CpuGayBerneForce::computeEllipsoidFrames(const vector<RealVec>& positions) {
+void CpuGayBerneForce::computeEllipsoidFrames(const vector<Vec3>& positions) {
    int numParticles = particles.size();
    for (int particle = 0; particle < numParticles; particle++) {
        ParticleInfo& p = particles[particle];
        // Compute the local coordinate system of the ellipsoid;
-        RealVec xdir, ydir, zdir;
+        Vec3 xdir, ydir, zdir;
        if (p.xparticle == -1) {
-            xdir = RealVec(1, 0, 0);
+            xdir = Vec3(1, 0, 0);
-            ydir = RealVec(0, 1, 0);
+            ydir = Vec3(0, 1, 0);
        }
        else {
            xdir = positions[particle]-positions[p.xparticle];
-            xdir /= SQRT(xdir.dot(xdir));
+            xdir /= sqrt(xdir.dot(xdir));
            if (p.yparticle == -1) {
                if (xdir[1] > -0.5 && xdir[1] < 0.5)
-                    ydir = RealVec(0, 1, 0);
+                    ydir = Vec3(0, 1, 0);
                else
-                    ydir = RealVec(1, 0, 0);
+                    ydir = Vec3(1, 0, 0);
            }
            else
                ydir = positions[particle]-positions[p.yparticle];
            ydir -= xdir*(xdir.dot(ydir));
-            ydir /= SQRT(ydir.dot(ydir));
+            ydir /= sqrt(ydir.dot(ydir));
        }
        zdir = xdir.cross(ydir);
        // Compute matrices we will need later.
-        RealOpenMM (&a)[3][3] = A[particle].v;
+        double (&a)[3][3] = A[particle].v;
-        RealOpenMM (&b)[3][3] = B[particle].v;
+        double (&b)[3][3] = B[particle].v;
-        RealOpenMM (&g)[3][3] = G[particle].v;
+        double (&g)[3][3] = G[particle].v;
        a[0][0] = xdir[0];
        a[0][1] = xdir[1];
        a[0][2] = xdir[2];
@@ -277,8 +265,8 @@ void CpuGayBerneForce::computeEllipsoidFrames(const vector<RealVec>& positions)
        a[2][0] = zdir[0];
        a[2][1] = zdir[1];
        a[2][2] = zdir[2];
-        RealVec r2(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz);
+        Vec3 r2(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz);
-        RealVec e2(1/sqrt(p.ex), 1/sqrt(p.ey), 1/sqrt(p.ez));
+        Vec3 e2(1/sqrt(p.ex), 1/sqrt(p.ey), 1/sqrt(p.ez));
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++) {
                b[i][j] = 0;
@@ -291,33 +279,33 @@ void CpuGayBerneForce::computeEllipsoidFrames(const vector<RealVec>& positions)
    }
 }
-void CpuGayBerneForce::applyTorques(const vector<RealVec>& positions, vector<RealVec>& forces) {
+void CpuGayBerneForce::applyTorques(const vector<Vec3>& positions, vector<Vec3>& forces) {
    int numParticles = particles.size();
    int numThreads = threadTorque.size();
    for (int particle = 0; particle < numParticles; particle++) {
        ParticleInfo& p = particles[particle];
-        RealVec pos = positions[particle];
+        Vec3 pos = positions[particle];
        if (p.xparticle != -1) {
            // Add up the torques from the individual threads.
-            RealVec torque;
+            Vec3 torque;
            for (int i = 0; i < numThreads; i++)
                torque += threadTorque[i][particle];
            // Apply a force to the x particle.
-            RealVec dx = positions[p.xparticle]-pos;
+            Vec3 dx = positions[p.xparticle]-pos;
            double dx2 = dx.dot(dx);
-            RealVec f = torque.cross(dx)/dx2;
+            Vec3 f = torque.cross(dx)/dx2;
            forces[p.xparticle] += f;
            forces[particle] -= f;
            if (p.yparticle != -1) {
                // Apply a force to the y particle.  This is based on the component of the torque
                // that was not already applied to the x particle.
-                RealVec dy = positions[p.yparticle]-pos;
+                Vec3 dy = positions[p.yparticle]-pos;
                double dy2 = dy.dot(dy);
-                RealVec torque2 = dx*(torque.dot(dx)/dx2);
+                Vec3 torque2 = dx*(torque.dot(dx)/dx2);
                f = torque2.cross(dy)/dy2;
                forces[p.yparticle] += f;
                forces[particle] -= f;
@@ -326,27 +314,27 @@ void CpuGayBerneForce::applyTorques(const vector<RealVec>& positions, vector<Rea
    }
 }
-RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2, RealOpenMM sigma, RealOpenMM epsilon, const RealVec* positions,
+double CpuGayBerneForce::computeOneInteraction(int particle1, int particle2, double sigma, double epsilon, const Vec3* positions,
-        float* forces, vector<RealVec>& torques, const RealVec* boxVectors) {
+        float* forces, vector<Vec3>& torques, const Vec3* boxVectors) {
    // Compute the displacement and check against the cutoff.
-    RealOpenMM deltaR[ReferenceForce::LastDeltaRIndex];
+    double deltaR[ReferenceForce::LastDeltaRIndex];
    if (nonbondedMethod == GayBerneForce::CutoffPeriodic)
        ReferenceForce::getDeltaRPeriodic(positions[particle2], positions[particle1], boxVectors, deltaR);
    else
        ReferenceForce::getDeltaR(positions[particle2], positions[particle1], deltaR);
-    RealOpenMM r = deltaR[ReferenceForce::RIndex];
+    double r = deltaR[ReferenceForce::RIndex];
    if (nonbondedMethod != GayBerneForce::NoCutoff && r >= cutoffDistance)
        return 0;
-    RealOpenMM rInv = 1/r;
+    double rInv = 1/r;
-    RealVec dr(deltaR[ReferenceForce::XIndex], deltaR[ReferenceForce::YIndex], deltaR[ReferenceForce::ZIndex]);
+    Vec3 dr(deltaR[ReferenceForce::XIndex], deltaR[ReferenceForce::YIndex], deltaR[ReferenceForce::ZIndex]);
-    RealVec drUnit = dr*rInv;
+    Vec3 drUnit = dr*rInv;
    // Compute the switching function.
-    RealOpenMM switchValue = 1, switchDeriv = 0;
+    double switchValue = 1, switchDeriv = 0;
    if (useSwitchingFunction && r > switchingDistance) {
-        RealOpenMM t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
+        double t = (r-switchingDistance)/(cutoffDistance-switchingDistance);
        switchValue = 1+t*t*t*(-10+t*(15-t*6));
        switchDeriv = t*t*(-30+t*(60-t*30))/(cutoffDistance-switchingDistance);
    }
@@ -354,11 +342,11 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
    // Interactions between two point particles can be computed more easily.
    if (particles[particle1].isPointParticle && particles[particle2].isPointParticle) {
-        RealOpenMM sig = sigma*rInv;
+        double sig = sigma*rInv;
-        RealOpenMM sig2 = sig*sig;
+        double sig2 = sig*sig;
-        RealOpenMM sig6 = sig2*sig2*sig2;
+        double sig6 = sig2*sig2*sig2;
-        RealOpenMM energy = 4*epsilon*(sig6-1)*sig6;
+        double energy = 4*epsilon*(sig6-1)*sig6;
-        RealVec force = drUnit*(switchValue*4*epsilon*(12*sig6 - 6)*sig6*rInv - energy*switchDeriv);
+        Vec3 force = drUnit*(switchValue*4*epsilon*(12*sig6 - 6)*sig6*rInv - energy*switchDeriv);
        forces[4*particle1] += force[0];
        forces[4*particle1+1] += force[1];
        forces[4*particle1+2] += force[2];
@@ -374,31 +362,31 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
    Matrix G12 = G[particle1]+G[particle2];
    Matrix B12inv = B12.inverse();
    Matrix G12inv = G12.inverse();
-    RealOpenMM detG12 = G12.determinant();
+    double detG12 = G12.determinant();
    // Estimate the distance between the ellipsoids and compute the first terms needed for the energy.
-    RealOpenMM sigma12 = 1/SQRT(0.5*drUnit.dot(G12inv*drUnit));
+    double sigma12 = 1/sqrt(0.5*drUnit.dot(G12inv*drUnit));
-    RealOpenMM h12 = r - sigma12;
+    double h12 = r - sigma12;
-    RealOpenMM rho = sigma/(h12+sigma);
+    double rho = sigma/(h12+sigma);
-    RealOpenMM rho2 = rho*rho;
+    double rho2 = rho*rho;
-    RealOpenMM rho6 = rho2*rho2*rho2;
+    double rho6 = rho2*rho2*rho2;
-    RealOpenMM u = 4*epsilon*(rho6*rho6-rho6);
+    double u = 4*epsilon*(rho6*rho6-rho6);
-    RealOpenMM eta = SQRT(2*s[particle1]*s[particle2]/detG12);
+    double eta = sqrt(2*s[particle1]*s[particle2]/detG12);
-    RealOpenMM chi = 2*drUnit.dot(B12inv*drUnit);
+    double chi = 2*drUnit.dot(B12inv*drUnit);
    chi *= chi;
-    RealOpenMM energy = u*eta*chi;
+    double energy = u*eta*chi;
    // Compute the terms needed for the force.
-    RealVec kappa = G12inv*dr;
+    Vec3 kappa = G12inv*dr;
-    RealVec iota = B12inv*dr;
+    Vec3 iota = B12inv*dr;
-    RealOpenMM rInv2 = rInv*rInv;
+    double rInv2 = rInv*rInv;
-    RealOpenMM dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
+    double dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
-    RealOpenMM temp = 0.5*sigma12*sigma12*sigma12*rInv2;
+    double temp = 0.5*sigma12*sigma12*sigma12*rInv2;
-    RealVec dudr = (drUnit + (kappa-drUnit*kappa.dot(drUnit))*temp)*dUSLJdr;
+    Vec3 dudr = (drUnit + (kappa-drUnit*kappa.dot(drUnit))*temp)*dUSLJdr;
-    RealVec dchidr = (iota-drUnit*iota.dot(drUnit))*(-8*rInv2*SQRT(chi));
+    Vec3 dchidr = (iota-drUnit*iota.dot(drUnit))*(-8*rInv2*sqrt(chi));
-    RealVec force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
+    Vec3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
    forces[4*particle1] += force[0];
    forces[4*particle1+1] += force[1];
    forces[4*particle1+2] += force[2];
@@ -413,13 +401,13 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
        ParticleInfo& p = particles[particle];
        if (p.isPointParticle)
            continue;
-        RealVec dudq = (kappa*G[particle]).cross(kappa*(temp*dUSLJdr));
+        Vec3 dudq = (kappa*G[particle]).cross(kappa*(temp*dUSLJdr));
-        RealVec dchidq = (iota*B[particle]).cross(iota)*(-4*rInv2);
+        Vec3 dchidq = (iota*B[particle]).cross(iota)*(-4*rInv2);
-        RealOpenMM (&g12)[3][3] = G12.v;
+        double (&g12)[3][3] = G12.v;
-        RealOpenMM (&a)[3][3] = A[particle].v;
+        double (&a)[3][3] = A[particle].v;
-        RealVec scale = RealVec(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz)*(-0.5*eta/detG12);
+        Vec3 scale = Vec3(p.rx*p.rx, p.ry*p.ry, p.rz*p.rz)*(-0.5*eta/detG12);
        Matrix D;
-        RealOpenMM (&d)[3][3] = D.v;
+        double (&d)[3][3] = D.v;
        d[0][0] = scale[0]*(2*a[0][0]*(g12[1][1]*g12[2][2] - g12[1][2]*g12[2][1]) +
                              a[0][2]*(g12[1][2]*g12[0][1] + g12[1][0]*g12[2][1] - g12[1][1]*(g12[0][2] + g12[2][0])) +
                              a[0][1]*(g12[0][2]*g12[2][1] + g12[2][0]*g12[1][2] - g12[2][2]*(g12[0][1] + g12[1][0])));
@@ -447,10 +435,10 @@ RealOpenMM CpuGayBerneForce::computeOneInteraction(int particle1, int particle2,
        d[2][2] = scale[2]*(  a[2][0]*(g12[0][1]*g12[1][2] + g12[2][1]*g12[1][0] - g12[1][1]*(g12[0][2] + g12[2][0])) +
                              a[2][1]*(g12[1][0]*g12[0][2] + g12[2][0]*g12[0][1] - g12[0][0]*(g12[1][2] + g12[2][1])) +
                            2*a[2][2]*(g12[1][1]*g12[0][0] - g12[1][0]*g12[0][1]));
-        RealVec detadq;
+        Vec3 detadq;
        for (int i = 0; i < 3; i++)
-            detadq += RealVec(a[i][0], a[i][1], a[i][2]).cross(RealVec(d[i][0], d[i][1], d[i][2]));
+            detadq += Vec3(a[i][0], a[i][1], a[i][2]).cross(Vec3(d[i][0], d[i][1], d[i][2]));
-        RealVec torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
+        Vec3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
        torques[particle] -= torque;
    }
    return switchValue*energy;

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -29,45 +29,15 @@
 using namespace OpenMM;
 using namespace std;
-class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
+CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, ThreadPool& threads, CpuRandom& random) : 
-public:
+           ReferenceStochasticDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
-    Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate1(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
-public:
-    Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate2(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-class CpuLangevinDynamics::Update3Task : public ThreadPool::Task {
-public:
-    Update3Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate3(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
-           ReferenceStochasticDynamics(numberOfAtoms, deltaT, tau, temperature), threads(threads), random(random) {
 }
 CpuLangevinDynamics::~CpuLangevinDynamics() {
 }
-void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
+void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
-                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+                                      vector<Vec3>& forces, vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
@@ -79,13 +49,12 @@ void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCo
    // Signal the threads to start running and wait for them to finish.
-    Update1Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate1(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }
-void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
+void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
-                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+                                      vector<Vec3>& forces, vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
@@ -97,13 +66,12 @@ void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCo
    // Signal the threads to start running and wait for them to finish.
-    Update2Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate2(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }
-void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
+void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<Vec3>& atomCoordinates, vector<Vec3>& velocities,
-                                       vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+                                       vector<double>& inverseMasses, vector<Vec3>& xPrime) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
@@ -114,44 +82,44 @@ void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<RealVec>& atomCo
    // Signal the threads to start running and wait for them to finish.
-    Update3Task task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate3(threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
 }
 void CpuLangevinDynamics::threadUpdate1(int threadIndex) {
-    const RealOpenMM tau = getTau();
-    const RealOpenMM vscale = EXP(-getDeltaT()/tau);
-    const RealOpenMM fscale = (1-vscale)*tau;
-    const RealOpenMM kT = BOLTZ*getTemperature();
-    const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
+    // Velocity update for the slice of atoms assigned to this thread:
+    //   v <- v*vscale + f*(fscale/m) + noise*(noisescale/sqrt(m))
+    // Atoms whose inverse mass is zero are left untouched.
+    const double dt = getDeltaT();
+    const double friction = getFriction();
+    const double vscale = exp(-dt*friction);
+    // With zero friction (1-vscale)/friction would be 0/0, so use dt directly.
+    const double fscale = (friction == 0 ? dt : (1-vscale)/friction);
+    const double kT = BOLTZ*getTemperature();
+    const double noisescale = sqrt(kT*(1-vscale*vscale));
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
    for (int i = start; i < end; i++) {
        if (inverseMasses[i] != 0.0) {
-            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
-            RealVec noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
+            const double sqrtInvMass = sqrt(inverseMasses[i]);
+            // Each thread draws from its own random stream, selected by threadIndex.
+            Vec3 noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
            velocities[i]  = velocities[i]*vscale + forces[i]*(fscale*inverseMasses[i]) + noise*(noisescale*sqrtInvMass);
        }
   }
 }
 void CpuLangevinDynamics::threadUpdate2(int threadIndex) {
-    const RealOpenMM dt = getDeltaT();
+    // Position update for the slice of atoms assigned to this thread:
+    // xPrime = x + v*dt. Atoms whose inverse mass is zero are skipped.
+    const double dt = getDeltaT();
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
    for (int i = start; i < end; i++) {
        if (inverseMasses[i] != 0.0) {
-            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
            xPrime[i] = atomCoordinates[i]+velocities[i]*dt;
        }
   }
 }
 void CpuLangevinDynamics::threadUpdate3(int threadIndex) {
-   const RealOpenMM invStepSize = 1.0/getDeltaT();
+   const double invStepSize = 1.0/getDeltaT();
    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -59,7 +59,7 @@ public:
 */
 class CpuNeighborList::Voxels {
 public:
-    Voxels(int blockSize, float vsy, float vsz, float miny, float maxy, float minz, float maxz, const RealVec* boxVectors, bool usePeriodic) :
+    Voxels(int blockSize, float vsy, float vsz, float miny, float maxy, float minz, float maxz, const Vec3* boxVectors, bool usePeriodic) :
            blockSize(blockSize), voxelSizeY(vsy), voxelSizeZ(vsz), miny(miny), maxy(maxy), minz(minz), maxz(maxz), usePeriodic(usePeriodic) {
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++)
@@ -409,21 +409,11 @@ private:
    vector<vector<vector<pair<float, int> > > > bins;
 };
-class CpuNeighborList::ThreadTask : public ThreadPool::Task {
-public:
-    ThreadTask(CpuNeighborList& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeNeighborList(threads, threadIndex);
-    }
-    CpuNeighborList& owner;
-};
 CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }
 void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
-            const RealVec* periodicBoxVectors, bool usePeriodic, float maxDistance, ThreadPool& threads) {
+            const Vec3* periodicBoxVectors, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+blockSize-1)/blockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
@@ -460,8 +450,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    // Sort the atoms based on a Hilbert curve.
    atomBins.resize(numAtoms);
-    ThreadTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeNeighborList(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -30,6 +30,7 @@
 #include "ReferencePME.h"
 #include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
+#include <iostream>
 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -41,23 +42,14 @@ using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
 const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
-class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
-public:
-    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeDirect(threads, threadIndex);
-    }
-    CpuNonbondedForce& owner;
-};
 /**---------------------------------------------------------------------------------------
   CpuNonbondedForce constructor
   --------------------------------------------------------------------------------------- */
-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), ljpme(false), tableIsValid(false), expTableIsValid(false),
+    cutoffDistance(0.0f), alphaDispersionEwald(0.0f), alphaEwald(0.0f) {
 }
 CpuNonbondedForce::~CpuNonbondedForce() {
@@ -78,11 +70,22 @@ void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neig
        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
+    inverseRcut6 = pow(cutoffDistance, -6);
    neighborList = &neighbors;
    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
    crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
+    if(alphaDispersionEwald != 0.0f){
+        // We set this here, in case setUseCutoff is called after the dispersion alpha is set.
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = EXP(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
    }
+}
 /**---------------------------------------------------------------------------------------
   Set the force to use a switching function on the Lennard-Jones interaction.
@@ -96,7 +99,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    switchingDistance = distance;
 }
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
     Set the force to use periodic boundary conditions.  This requires that a cutoff has
     also been set, and the smallest side of the periodic box is at least twice the cutoff
@@ -106,7 +109,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
+void CpuNonbondedForce::setPeriodic(Vec3* periodicBoxVectors) {
    assert(cutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
@@ -126,9 +129,9 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    triclinic = (periodicBoxVectors[0][1] != 0.0 || periodicBoxVectors[0][2] != 0.0 ||
            periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
            periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
-  }
+}
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
     Set the force to use Ewald summation.
@@ -139,7 +142,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
    if (alpha != alphaEwald)
        tableIsValid = false;
    alphaEwald = alpha;
@@ -148,9 +151,9 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    numRz = kmaxz;
    ewald = true;
    tabulateEwaldScaleFactor();
-  }
+}
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
     Set the force to use Particle-Mesh Ewald (PME) summation.
@@ -159,7 +162,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
    if (alpha != alphaEwald)
        tableIsValid = false;
    alphaEwald = alpha;
@@ -168,10 +171,40 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    meshDim[2] = meshSize[2];
    pme = true;
    tabulateEwaldScaleFactor();
+}
+/**---------------------------------------------------------------------------------------
+     Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+     Records the dispersion separation parameter and mesh size, enables LJPME, and
+     (re)builds the dispersion lookup tables via tabulateExpTerms().
+     @param alpha     the Ewald separation parameter used for the dispersion term
+     @param meshSize  the dimensions of the mesh
+     --------------------------------------------------------------------------------------- */
+void CpuNonbondedForce::setUseLJPME(float alpha, int meshSize[3]) {
+    // The tabulated terms depend on alpha, so a new value invalidates the cached tables.
+    if (alpha != alphaDispersionEwald)
+        expTableIsValid = false;
+    alphaDispersionEwald = alpha;
+    dispersionMeshDim[0] = meshSize[0];
+    dispersionMeshDim[1] = meshSize[1];
+    dispersionMeshDim[2] = meshSize[2];
+    ljpme = true;
+    // NOTE(review): tabulateExpTerms() derives its table spacing from cutoffDistance, so this
+    // presumably expects setUseCutoff() to have been called first -- confirm against callers.
+    tabulateExpTerms();
+    if(cutoffDistance != 0.0f){
+        // We set this here, in case setUseLJPME is called after the cutoff is set
+        double dalphaR = alphaDispersionEwald*cutoffDistance;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double expterm = exp(-dar2);
+        inverseRcut6Expterm  = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
    }
+}
-  void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    if (tableIsValid)
        return;
    tableIsValid = true;
@@ -188,9 +221,29 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
    }
 }
-void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
+/**---------------------------------------------------------------------------------------
+     Build the lookup tables read by exptermsApprox() and dExptermsApprox().
+     With a = alphaDispersionEwald and r = i*exptermsDX, entry i holds
+         exptermsTable[i]  = 1 - exp(-(a r)^2) * (1 + (a r)^2 + (a r)^4/2)
+         dExptermsTable[i] = 1 - exp(-(a r)^2) * (1 + (a r)^2 + (a r)^4/2 + (a r)^6/6)
+     Does nothing if the tables are already marked valid.
+     --------------------------------------------------------------------------------------- */
+void CpuNonbondedForce::tabulateExpTerms() {
-                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
+    if (expTableIsValid)
-                                             vector<RealVec>& forces, double* totalEnergy) const {
+        return;
+    expTableIsValid = true;
+    exptermsDX = cutoffDistance/NUM_TABLE_POINTS;
+    exptermsDXInv = 1.0f/exptermsDX;
+    // Four extra entries past NUM_TABLE_POINTS give the interpolation routines room to read
+    // beyond the clamped index.
+    exptermsTable.resize(NUM_TABLE_POINTS+4);
+    dExptermsTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        // Sample on this table's own spacing: lookups index with exptermsDXInv, so the
+        // Ewald table's ewaldDX must not be used here.
+        double r = i*exptermsDX;
+        double dalphaR = alphaDispersionEwald*r;
+        double dar2 = dalphaR * dalphaR;
+        double dar4 = dar2*dar2;
+        double dar6 = dar4*dar2;
+        double expterm = exp(-dar2);
+        exptermsTable[i]  = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
+        dExptermsTable[i] = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4 + dar6/6.0));
+    }
+}
+void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<Vec3>& atomCoordinates,
+                                               const vector<pair<float, float> >& atomParameters, const vector<float> &C6params, const vector<set<int> >& exclusions,
+                                               vector<Vec3>& forces, double* totalEnergy) const {
    typedef std::complex<float> d_complex;
    static const float epsilon     =  1.0;
@@ -203,14 +256,37 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
    if (pme) {
        pme_t pmedata;
        pme_init(&pmedata, alphaEwald, numberOfAtoms, meshDim, 5, 1);
-        vector<RealOpenMM> charges(numberOfAtoms);
+        vector<double> charges(numberOfAtoms);
        for (int i = 0; i < numberOfAtoms; i++)
            charges[i] = posq[4*i+3];
-        RealOpenMM recipEnergy = 0.0;
+        double recipEnergy = 0.0;
        pme_exec(pmedata, atomCoordinates, forces, charges, periodicBoxVectors, &recipEnergy);
        if (totalEnergy)
            *totalEnergy += recipEnergy;
        pme_destroy(pmedata);
+        if (ljpme) {
+            // Dispersion reciprocal space terms
+            pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
+            std::vector<Vec3> dpmeforces;
+            for (int i = 0; i < numberOfAtoms; i++){
+                charges[i] = C6params[i];
+                dpmeforces.push_back(Vec3());
+            }
+            double recipDispersionEnergy = 0.0;
+            pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
+            for (int i = 0; i < numberOfAtoms; i++){
+                forces[i][0] -= 2.0*dpmeforces[i][0];
+                forces[i][1] -= 2.0*dpmeforces[i][1];
+                forces[i][2] -= 2.0*dpmeforces[i][2];
+            }
+            if (totalEnergy)
+                *totalEnergy += recipDispersionEnergy;
+            pme_destroy(pmedata);
+        }
    }
    // Ewald method
@@ -224,7 +300,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
        // setup K-vectors
-        #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
        vector<d_complex> eir(kmax*numberOfAtoms*3);
        vector<d_complex> tab_xy(numberOfAtoms);
        vector<d_complex> tab_qxyz(numberOfAtoms);
@@ -300,14 +376,15 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
 }
-void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
+void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<Vec3>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                                           const vector<float>& C6params, const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
    this->posq = posq;
    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
+    this->C6params = &C6params[0];
    this->exclusions = &exclusions[0];
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
@@ -318,8 +395,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    // Signal the threads to start running and wait for them to finish.
-    ComputeDirectTask task(*this);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeDirect(threads, threadIndex); });
-    threads.execute(task);
    threads.waitForThreads();
    // Signal the threads to subtract the exclusions.
@@ -350,9 +426,8 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    float* forces = &(*threadForce)[threadIndex][0];
    fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
-    if (ewald || pme) {
+    if (ewald || pme || ljpme) {
        // Compute the interactions from the neighbor list.
        while (true) {
            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
            if (nextBlock >= neighborList->getNumBlocks())
@@ -395,6 +470,17 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                        }
                        else if (includeEnergy)
                            threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
+                        if (ljpme) {
+                            float C6ij = C6params[i]*C6params[j];
+                            float inverseR2 = 1.0f/r2;
+                            float emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                            if(includeEnergy)
+                                threadEnergy[threadIndex] += emult;
+                            float dEdR = -6.0f*C6ij*inverseR2*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                            fvec4 result = deltaR*dEdR;
+                            (fvec4(forces+4*i)-result).store(forces+4*i);
+                            (fvec4(forces+4*j)+result).store(forces+4*j);
+                        }
                    }
                }
            }
@@ -476,7 +562,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    fvec4 result = deltaR*dEdR;
    (fvec4(forces+4*ii)+result).store(forces+4*ii);
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
-  }
+}
 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
@@ -502,3 +588,18 @@ float CpuNonbondedForce::erfcApprox(float x) {
    return coeff1*erfcTable[index] + coeff2*erfcTable[index+1];
 }
+float CpuNonbondedForce::exptermsApprox(float x) {
+    // Linear interpolation in exptermsTable: find the table interval containing x
+    // (clamped to NUM_TABLE_POINTS) and blend its two end points.
+    const float tablePos = x*exptermsDXInv;
+    const int lower = min((int) floor(tablePos), NUM_TABLE_POINTS);
+    const float upperWeight = tablePos-lower;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*exptermsTable[lower] + upperWeight*exptermsTable[lower+1];
+}
+float CpuNonbondedForce::dExptermsApprox(float x) {
+    // Linear interpolation in dExptermsTable: find the table interval containing x
+    // (clamped to NUM_TABLE_POINTS) and blend its two end points.
+    const float tablePos = x*exptermsDXInv;
+    const int lower = min((int) floor(tablePos), NUM_TABLE_POINTS);
+    const float upperWeight = tablePos-lower;
+    const float lowerWeight = 1.0f-upperWeight;
+    return lowerWeight*dExptermsTable[lower] + upperWeight*dExptermsTable[lower+1];
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -25,6 +25,7 @@
 #include "SimTKOpenMMUtilities.h"
 #include "CpuNonbondedForceVec4.h"
 #include <algorithm>
+#include <iostream>
 using namespace std;
 using namespace OpenMM;
@@ -213,7 +214,6 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
 void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -263,7 +263,6 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
 template <int PERIODIC_TYPE>
 void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
@@ -278,6 +277,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
@@ -318,7 +318,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec4 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec4 eps = blockAtomEpsilon*atomEpsilon;
+            fvec4 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -328,6 +329,17 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+            if (ljpme) {
+                fvec4 C6ij = C6s*C6params[atom];
+                fvec4 inverseR2 = inverseR*inverseR;
+                fvec4 mysig2 = sig*sig;
+                fvec4 mysig6 = mysig2*mysig2*mysig2;
+                fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
        }
        else {
            energy = 0.0f;
@@ -420,3 +432,30 @@ fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
    transpose(t1, t2, t3, t4);
    return coeff1*t1 + coeff2*t2;
 }
+fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
+    // Four-wide linear interpolation in exptermsTable, one distance per lane.
+    fvec4 tablePos = r*exptermsDXInv;
+    ivec4 lower = min(floor(tablePos), NUM_TABLE_POINTS);
+    fvec4 upperWeight = tablePos-lower;
+    fvec4 lowerWeight = 1.0f-upperWeight;
+    // Load a run of table entries starting at each lane's index. After the transpose the
+    // first two vectors presumably hold table[index] and table[index+1] per lane.
+    fvec4 run0(&exptermsTable[lower[0]]);
+    fvec4 run1(&exptermsTable[lower[1]]);
+    fvec4 run2(&exptermsTable[lower[2]]);
+    fvec4 run3(&exptermsTable[lower[3]]);
+    transpose(run0, run1, run2, run3);
+    return lowerWeight*run0 + upperWeight*run1;
+}
+fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
+    // Four-wide linear interpolation in dExptermsTable, one distance per lane.
+    fvec4 tablePos = r*exptermsDXInv;
+    ivec4 lower = min(floor(tablePos), NUM_TABLE_POINTS);
+    fvec4 upperWeight = tablePos-lower;
+    fvec4 lowerWeight = 1.0f-upperWeight;
+    // Load a run of table entries starting at each lane's index. After the transpose the
+    // first two vectors presumably hold table[index] and table[index+1] per lane.
+    fvec4 run0(&dExptermsTable[lower[0]]);
+    fvec4 run1(&dExptermsTable[lower[1]]);
+    fvec4 run2(&dExptermsTable[lower[2]]);
+    fvec4 run3(&dExptermsTable[lower[3]]);
+    transpose(run0, run1, run2, run3);
+    return lowerWeight*run0 + upperWeight*run1;
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -27,6 +27,7 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/hardware.h"
 #include <algorithm>
+#include <iostream>
 using namespace std;
 using namespace OpenMM;
@@ -81,7 +82,6 @@ enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, Periodic
 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
    // Determine whether we need to apply periodic boundary conditions.    
    PeriodicType periodicType;
    fvec4 blockCenter;
    if (!periodic) {
@@ -308,6 +308,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    fvec8 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]], C6params[blockAtom[4]], C6params[blockAtom[5]], C6params[blockAtom[6]], C6params[blockAtom[7]]);
    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
@@ -348,7 +349,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
            fvec8 sig2 = inverseR*sig;
            sig2 *= sig2;
            fvec8 sig6 = sig2*sig2*sig2;
-            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            fvec8 eps = blockAtomEpsilon*atomEpsilon;
+            fvec8 epsSig6 = eps*sig6;
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
@@ -358,6 +360,17 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
                energy *= switchValue;
            }
+            if (ljpme) {
+                fvec8 C6ij = C6s*C6params[atom];
+                fvec8 inverseR2 = inverseR*inverseR;
+                fvec8 mysig2 = sig*sig;
+                fvec8 mysig6 = mysig2*mysig2*mysig2;
+                fvec8 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
+                fvec8 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
+                dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
+                energy += emult + potentialShift;
+            }
        }
        else {
            energy = 0.0f;
@@ -464,4 +477,45 @@ fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
    return coeff1*s1 + coeff2*s2;
 }
+fvec8 CpuNonbondedForceVec8::exptermsApprox(const fvec8& r) {
+    // Eight-wide linear interpolation in exptermsTable, one distance per lane.
+    fvec8 tablePos = r*exptermsDXInv;
+    ivec8 lower = min(floor(tablePos), NUM_TABLE_POINTS);
+    fvec8 upperWeight = tablePos-lower;
+    fvec8 lowerWeight = 1.0f-upperWeight;
+    // Split the indices into two halves and load a run of table entries per lane.
+    ivec4 lowHalf = lower.lowerVec();
+    ivec4 highHalf = lower.upperVec();
+    fvec4 run0(&exptermsTable[lowHalf[0]]);
+    fvec4 run1(&exptermsTable[lowHalf[1]]);
+    fvec4 run2(&exptermsTable[lowHalf[2]]);
+    fvec4 run3(&exptermsTable[lowHalf[3]]);
+    fvec4 run4(&exptermsTable[highHalf[0]]);
+    fvec4 run5(&exptermsTable[highHalf[1]]);
+    fvec4 run6(&exptermsTable[highHalf[2]]);
+    fvec4 run7(&exptermsTable[highHalf[3]]);
+    // After the transpose the first two outputs presumably hold table[index] and
+    // table[index+1] per lane; the remaining two are not needed.
+    fvec8 col0, col1, col2, col3;
+    transpose(run0, run1, run2, run3, run4, run5, run6, run7, col0, col1, col2, col3);
+    return lowerWeight*col0 + upperWeight*col1;
+}
+fvec8 CpuNonbondedForceVec8::dExptermsApprox(const fvec8& r) {
+    // Eight-wide linear interpolation in dExptermsTable, one distance per lane.
+    fvec8 tablePos = r*exptermsDXInv;
+    ivec8 lower = min(floor(tablePos), NUM_TABLE_POINTS);
+    fvec8 upperWeight = tablePos-lower;
+    fvec8 lowerWeight = 1.0f-upperWeight;
+    // Split the indices into two halves and load a run of table entries per lane.
+    ivec4 lowHalf = lower.lowerVec();
+    ivec4 highHalf = lower.upperVec();
+    fvec4 run0(&dExptermsTable[lowHalf[0]]);
+    fvec4 run1(&dExptermsTable[lowHalf[1]]);
+    fvec4 run2(&dExptermsTable[lowHalf[2]]);
+    fvec4 run3(&dExptermsTable[lowHalf[3]]);
+    fvec4 run4(&dExptermsTable[highHalf[0]]);
+    fvec4 run5(&dExptermsTable[highHalf[1]]);
+    fvec4 run6(&dExptermsTable[highHalf[2]]);
+    fvec4 run7(&dExptermsTable[highHalf[3]]);
+    // After the transpose the first two outputs presumably hold table[index] and
+    // table[index+1] per lane; the remaining two are not needed.
+    fvec8 col0, col1, col2, col3;
+    transpose(run0, run1, run2, run3, run4, run5, run6, run7, col0, col1, col2, col3);
+    return lowerWeight*col0 + upperWeight*col1;
+}
 #endif
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -127,6 +127,8 @@ void CpuPlatform::contextDestroyed(ContextImpl& context) const {
    PlatformData* data = contextData[&context];
    delete data;
    contextData.erase(&context);
+    ReferencePlatform::PlatformData* refPlatformData = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    delete refPlatformData;
 }
 CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,56 +35,10 @@
 using namespace OpenMM;
 using namespace std;
-class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
-public:
-    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& atomCoordinatesP;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
-public:
-    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& velocities;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
 CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
    int numBlocks = 10*threads.getNumThreads();
    int numClusters = settle.getNumClusters();
-    vector<RealOpenMM> mass(system.getNumParticles());
+    vector<double> mass(system.getNumParticles());
    for (int i = 0; i < system.getNumParticles(); i++)
        mass[i] = system.getParticleMass(i);
    for (int i = 0; i < numBlocks; i++) {
@@ -93,7 +47,7 @@ CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settl
        if (start != end) {
            int numThreadClusters = end-start;
            vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
-            vector<RealOpenMM> distance1(numThreadClusters), distance2(numThreadClusters);
+            vector<double> distance1(numThreadClusters), distance2(numThreadClusters);
            for (int j = 0; j < numThreadClusters; j++)
                settle.getClusterParameters(start+j, atom1[j], atom2[j], atom3[j], distance1[j], distance2[j]);
            threadSettle.push_back(new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, mass));
@@ -106,14 +60,30 @@ CpuSETTLE::~CpuSETTLE() {
        delete threadSettle[i];
 }
-void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+void CpuSETTLE::apply(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& atomCoordinatesP, vector<double>& inverseMasses, double tolerance) {
-    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
+    gmx_atomic_t atomicCounter;
-    threads.execute(task);
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }
-void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+void CpuSETTLE::applyToVelocities(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& velocities, vector<double>& inverseMasses, double tolerance) {
-    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
+    gmx_atomic_t atomicCounter;
-    threads.execute(task);
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }
--- a/platforms/cpu/staticTarget/CMakeLists.txt
+++ b/platforms/cpu/staticTarget/CMakeLists.txt
@@ -16,7 +16,6 @@ ENDFOREACH(file)
 ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}_static ${PTHREADS_LIB_STATIC})
-#-DPTW32_STATIC_LIB only works for the windows pthreads.
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_STATIC_LIBRARY")
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/cpu/tests/TestCpuDispersionPME.cpp
+++ b/platforms/cpu/tests/TestCpuDispersionPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CpuTests.h"
+#include "TestDispersionPME.h"
+void runPlatformTests() {
+}
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -51,16 +51,16 @@ using namespace std;
 void testNeighborList(bool periodic, bool triclinic) {
    const int numParticles = 500;
    const float cutoff = 2.0f;
-    RealVec boxVectors[3];
+    Vec3 boxVectors[3];
    if (triclinic) {
-        boxVectors[0] = RealVec(10, 0, 0);
+        boxVectors[0] = Vec3(10, 0, 0);
-        boxVectors[1] = RealVec(4, 9, 0);
+        boxVectors[1] = Vec3(4, 9, 0);
-        boxVectors[2] = RealVec(-3, -3.5, 11);
+        boxVectors[2] = Vec3(-3, -3.5, 11);
    }
    else {
-        boxVectors[0] = RealVec(10, 0, 0);
+        boxVectors[0] = Vec3(10, 0, 0);
-        boxVectors[1] = RealVec(0, 9, 0);
+        boxVectors[1] = Vec3(0, 9, 0);
-        boxVectors[2] = RealVec(0, 0, 11);
+        boxVectors[2] = Vec3(0, 0, 11);
    }
    const float boxSize[3] = {(float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2]};
    const int blockSize = 8;

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp