Merge github.com:SimTk/openmm

19d2885a · Lee-Ping · 99ef4344 · 57a6768e · 19d2885a · 19d2885a
Commit 19d2885a authored Jan 23, 2014 by Lee-Ping
20 changed files
--- a/platforms/cpu/src/CpuBondForce.cpp
+++ b/platforms/cpu/src/CpuBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2014 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CpuBondForce.h"
+#include "openmm/OpenMMException.h"
+
+using namespace OpenMM;
+using namespace std;
+
+class CpuBondForce::ComputeForceTask : public ThreadPool::Task {
+public:
+    ComputeForceTask(CpuBondForce& owner, vector<RealVec>& atomCoordinates, RealOpenMM** parameters, vector<RealVec>& forces, 
+        vector<RealOpenMM>& threadEnergy, RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn) : owner(owner), atomCoordinates(atomCoordinates),
+        parameters(parameters), forces(forces), threadEnergy(threadEnergy), totalEnergy(totalEnergy), referenceBondIxn(referenceBondIxn) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        RealOpenMM* energy = (totalEnergy == NULL ? NULL : &threadEnergy[threadIndex]);
+        owner.threadComputeForce(threads, threadIndex, atomCoordinates, parameters, forces, energy, referenceBondIxn);
+    }
+    CpuBondForce& owner;
+    vector<RealVec>& atomCoordinates;
+    RealOpenMM** parameters;
+    vector<RealVec>& forces;
+    vector<RealOpenMM>& threadEnergy;
+    RealOpenMM* totalEnergy;
+    ReferenceBondIxn& referenceBondIxn;
+};
+
+CpuBondForce::CpuBondForce() {
+}
+
+void CpuBondForce::initialize(int numAtoms, int numBonds, int numAtomsPerBond, int** bondAtoms, ThreadPool& threads) {
+    this->numBonds = numBonds;
+    this->numAtomsPerBond = numAtomsPerBond;
+    this->bondAtoms = bondAtoms;
+    this->threads = &threads;
+    int numThreads = threads.getNumThreads();
+    int targetBondsPerThread = numBonds/numThreads;
+    
+    // Record the bonds that include each atom.
+    
+    vector<set<int> > atomBonds(numAtoms);
+    for (int bond = 0; bond < numBonds; bond++) {
+        for (int i = 0; i < numAtomsPerBond; i++)
+            atomBonds[bondAtoms[bond][i]].insert(bond);
+    }
+    
+    // Divide bonds into groups.
+    
+    vector<int> atomThread(numAtoms, -1);
+    vector<int> bondThread(numBonds, -1);
+    threadBonds.resize(numThreads);
+    int numProcessed = 0;
+    int thread = 0;
+    list<int> candidateBonds;
+    while (thread < numThreads) {
+        // Find the next unassigned bond.
+        
+        while (numProcessed < numBonds && bondThread[numProcessed] != -1)
+            numProcessed++;
+        if (numProcessed == numBonds)
+            break; // We've gone through the whole list of bonds.
+        
+        // See if this bond can be assigned to this thread.
+        
+        if (!canAssignBond(numProcessed, thread, atomThread)) {
+            numProcessed++;
+            continue;
+        }
+        
+        // Assign this bond to the thread.
+        
+        assignBond(numProcessed++, thread, atomThread, bondThread, atomBonds, candidateBonds);
+        
+        // Assign additional bonds that have been identified as involving atoms assigned to this thread.
+        
+        while (!candidateBonds.empty() && threadBonds[thread].size() < targetBondsPerThread) {
+            int bond = *candidateBonds.begin();
+            if (bondThread[bond] == -1 && canAssignBond(bond, thread, atomThread))
+                assignBond(bond, thread, atomThread, bondThread, atomBonds, candidateBonds);
+            candidateBonds.pop_front();
+        }
+        
+        // If we have assigned enough bonds to this thread, move on to the next one.
+        
+        if (threadBonds[thread].size() >= targetBondsPerThread) {
+            candidateBonds.clear();
+            thread++;
+        }
+    }
+    
+    // Look through the remaining bonds and see whether any of them can be assigned.
+    
+    candidateBonds.clear();
+    for (int bond = 0; bond < numBonds; bond++) {
+        if (bondThread[bond] == -1) {
+            // See whether this bond can be assigned to a thread.
+            
+            bool canAssign = true;
+            int assignment = -1;
+            for (int i = 0; i < numAtomsPerBond; i++) {
+                int thread = atomThread[bondAtoms[bond][i]];
+                if (thread == -1 || thread == assignment)
+                    continue;
+                if (assignment == -1)
+                    assignment = thread;
+                else {
+                    canAssign = false;
+                    break;
+                }
+            }
+            if (canAssign) {
+                // Assign this bond to a thread.
+                
+                if (assignment == -1)
+                    assignment = numThreads-1;
+                assignBond(bond, assignment, atomThread, bondThread, atomBonds, candidateBonds);
+            }
+            else {
+                // Add it to the list of "extra" bonds.
+                
+                extraBonds.push_back(bond);
+            }
+        }
+    }
+}
+
+bool CpuBondForce::canAssignBond(int bond, int thread, vector<int>& atomThread) {
+    for (int i = 0; i < numAtomsPerBond; i++) {
+        int atom = bondAtoms[bond][i];
+        if (atomThread[atom] != -1 && atomThread[atom] != thread)
+            return false;
+    }
+    return true;
+}
+
+void CpuBondForce::assignBond(int bond, int thread, vector<int>& atomThread, vector<int>& bondThread, vector<set<int> >& atomBonds, list<int>& candidateBonds) {
+    // Assign the bond to a thread.
+    
+    bondThread[bond] = thread;
+    threadBonds[thread].push_back(bond);
+    
+    // Mark every atom in this bond as also belonging to the thread, and add all of their
+    // bonds to the list of candidates.
+    
+    for (int i = 0; i < numAtomsPerBond; i++) {
+        int& atom = atomThread[bondAtoms[bond][i]];
+        if (atom == thread)
+            continue;
+        if (atom != -1)
+            throw OpenMMException("CpuBondForce: Internal error: atoms assigned to threads incorrectly");
+        atom = thread;
+        for (set<int>::const_iterator iter = atomBonds[atom].begin(); iter != atomBonds[atom].end(); ++iter)
+            candidateBonds.push_back(*iter);
+    }
+}
+
+void CpuBondForce::calculateForce(vector<RealVec>& atomCoordinates, RealOpenMM** parameters, vector<RealVec>& forces, 
+        RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn) {
+    // Have the worker threads compute their forces.
+    
+    vector<RealOpenMM> threadEnergy(threads->getNumThreads(), 0);
+    ComputeForceTask task(*this, atomCoordinates, parameters, forces, threadEnergy, totalEnergy, referenceBondIxn);
+    threads->execute(task);
+    threads->waitForThreads();
+    
+    // Compute any "extra" bonds.
+    
+    for (int i = 0; i < extraBonds.size(); i++) {
+        int bond = extraBonds[i];
+        referenceBondIxn.calculateBondIxn(bondAtoms[bond], atomCoordinates, parameters[bond], forces, totalEnergy);
+    }
+
+    // Compute the total energy.
+    
+    if (totalEnergy != NULL)
+        for (int i = 0; i < threads->getNumThreads(); i++)
+            *totalEnergy += threadEnergy[i];
+}
+
+void CpuBondForce::threadComputeForce(ThreadPool& threads, int threadIndex, vector<RealVec>& atomCoordinates, RealOpenMM** parameters, vector<RealVec>& forces, 
+            RealOpenMM* totalEnergy, ReferenceBondIxn& referenceBondIxn) {
+    vector<int>& bonds = threadBonds[threadIndex];
+    int numBonds = bonds.size();
+    for (int i = 0; i < numBonds; i++) {
+        int bond = bonds[i];
+        referenceBondIxn.calculateBondIxn(bondAtoms[bond], atomCoordinates, parameters[bond], forces, totalEnergy);
+    }
+}
\ No newline at end of file
--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuGBSAOBCForce.h"
+#include "SimTKOpenMMRealType.h"
+#include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+
+using namespace std;
+using namespace OpenMM;
+
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
+const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
+const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
+
+class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
+public:
+    ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeForce(threads, threadIndex);
+    }
+    CpuGBSAOBCForce& owner;
+};
+
+CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
+    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
+    logDXInv = 1.0f/logDX;
+    logTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        double x = TABLE_MIN+i*logDX;
+        logTable[i] = log(x);
+    }
+}
+
+void CpuGBSAOBCForce::setUseCutoff(float distance) {
+    cutoff = true;
+    cutoffDistance = distance;
+}
+
+void CpuGBSAOBCForce::setPeriodic(float* periodicBoxSize) {
+    periodic = true;
+    this->periodicBoxSize[0] = periodicBoxSize[0];
+    this->periodicBoxSize[1] = periodicBoxSize[1];
+    this->periodicBoxSize[2] = periodicBoxSize[2];
+}
+
+void CpuGBSAOBCForce::setSoluteDielectric(float dielectric) {
+    soluteDielectric = dielectric;
+}
+
+void CpuGBSAOBCForce::setSolventDielectric(float dielectric) {
+    solventDielectric = dielectric;
+}
+
+const std::vector<std::pair<float, float> >& CpuGBSAOBCForce::getParticleParameters() const {
+    return particleParams;
+}
+
+void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, float> >& params) {
+    particleParams = params;
+    bornRadii.resize(params.size()+3);
+    obcChain.resize(params.size()+3);
+}
+
+void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+    // Record the parameters for the threads.
+    
+    this->posq = &posq[0];
+    this->threadForce = &threadForce;
+    includeEnergy = (totalEnergy != NULL);
+    int numThreads = threads.getNumThreads();
+    threadEnergy.resize(numThreads);
+    threadBornForces.resize(numThreads);
+    for (int i = 0; i < numThreads; i++)
+        threadBornForces[i].resize(particleParams.size()+3);
+    gmx_atomic_t counter;
+    this->atomicCounter = &counter;
+    
+    // Signal the threads to start running and wait for them to finish.
+    
+    ComputeTask task(*this);
+    gmx_atomic_set(&counter, 0);
+    threads.execute(task);
+    threads.waitForThreads(); // Compute Born radii
+    gmx_atomic_set(&counter, 0);
+    threads.resumeThreads();
+    threads.waitForThreads(); // Compute surface area term
+    gmx_atomic_set(&counter, 0);
+    threads.resumeThreads();
+    threads.waitForThreads(); // First loop
+    gmx_atomic_set(&counter, 0);
+    threads.resumeThreads();
+    threads.waitForThreads(); // Second loop
+    
+    // Combine the energies from all the threads.
+    
+    if (totalEnergy != NULL) {
+        double energy = 0;
+        for (int i = 0; i < numThreads; i++)
+            energy += threadEnergy[i];
+        *totalEnergy += (float) energy;
+    }
+}
+
+void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
+    int numParticles = particleParams.size();
+    int numThreads = threads.getNumThreads();
+    const float dielectricOffset = 0.009;
+    const float alphaObc = 1.0f;
+    const float betaObc = 0.8f;
+    const float gammaObc = 4.85f;
+    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
+    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
+    int start = (threadIndex*numParticles)/numThreads;
+    int end = ((threadIndex+1)*numParticles)/numThreads;
+
+    // Calculate Born radii
+
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
+        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
+        float atomRadius[4], atomx[4], atomy[4], atomz[4];
+        int blockMask[4] = {0, 0, 0, 0};
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            atomRadius[i] = particleParams[atomIndex].first;
+            atomx[i] = posq[4*atomIndex];
+            atomy[i] = posq[4*atomIndex+1];
+            atomz[i] = posq[4*atomIndex+2];
+            blockMask[i] = 0xFFFFFFFF;
+        }
+        fvec4 offsetRadiusI(atomRadius);
+        fvec4 radiusIInverse = 1.0f/offsetRadiusI;
+        fvec4 x(atomx);
+        fvec4 y(atomy);
+        fvec4 z(atomz);
+        ivec4 mask(blockMask);
+        float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        for (int atomJ = 0; atomJ < numParticles; atomJ++) {
+            fvec4 posJ(posq+4*atomJ);
+            fvec4 dx, dy, dz, r2;
+            getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
+            ivec4 include = mask & (blockAtomIndex != ivec4(atomJ));
+            if (cutoff)
+                include = include & (r2 < cutoffDistance*cutoffDistance);
+            if (!any(include))
+                continue;
+            fvec4 r = sqrt(r2);
+            float scaledRadiusJ = particleParams[atomJ].second;
+            float scaledRadiusJ2 = scaledRadiusJ*scaledRadiusJ;
+            fvec4 rScaledRadiusJ = r + scaledRadiusJ;
+            include = include & (offsetRadiusI < rScaledRadiusJ);
+            fvec4 l_ij = 1.0f/max(offsetRadiusI, abs(r-scaledRadiusJ));
+            fvec4 u_ij = 1.0f/rScaledRadiusJ;
+            fvec4 l_ij2 = l_ij*l_ij;
+            fvec4 u_ij2 = u_ij*u_ij;
+            fvec4 rInverse = 1.0f/r;
+            fvec4 r2Inverse = rInverse*rInverse;
+            fvec4 logRatio = fastLog(u_ij/l_ij);
+            fvec4 term = l_ij - u_ij + 0.25f*r*(u_ij2 - l_ij2) + (0.5f*rInverse*logRatio) + (0.25f*scaledRadiusJ*scaledRadiusJ*rInverse)*(l_ij2 - u_ij2);
+            for (int j = 0; j < 4; j++) {
+                if (include[j]) {
+                    sum[j] += term[j];
+                    if (offsetRadiusI[j] < scaledRadiusJ-r[j])
+                        sum[j] += 2.0f*(radiusIInverse[j]-l_ij[j]);
+                }
+            }
+        }
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            sum[i] *= 0.5f*atomRadius[i];
+            float sum2 = sum[i]*sum[i];
+            float sum3 = sum[i]*sum2;
+            float tanhSum = tanh(alphaObc*sum[i] - betaObc*sum2 + gammaObc*sum3);
+            float radiusI = atomRadius[i] + dielectricOffset;
+            bornRadii[atomIndex] = 1.0f/(1.0f/atomRadius[i] - tanhSum/radiusI);
+            obcChain[atomIndex] = atomRadius[i]*(alphaObc - 2.0f*betaObc*sum[i] + 3.0f*gammaObc*sum2);
+            obcChain[atomIndex] = (1.0f - tanhSum*tanhSum)*obcChain[atomIndex]/radiusI;
+        }
+    }
+    threads.syncThreads();
+
+    // Calculate ACE surface area term.
+
+    const float probeRadius = 0.14f;
+    const float surfaceAreaFactor = 28.3919551;
+    double energy = 0.0;
+    vector<float>& bornForces = threadBornForces[threadIndex];
+    for (int i = 0; i < numParticles; i++)
+        bornForces[i] = 0.0f;
+    while (true) {
+        int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+        if (atomI >= numParticles)
+            break;
+        if (bornRadii[atomI] > 0) {
+            float radiusI = particleParams[atomI].first + dielectricOffset;
+            float r = radiusI + probeRadius;
+            float ratio6 = powf(radiusI/bornRadii[atomI], 6.0f);
+            float saTerm = surfaceAreaFactor*r*r*ratio6;
+            energy += saTerm;
+            bornForces[atomI] = -6.0f*saTerm/bornRadii[atomI]; 
+        }
+        else
+            bornForces[atomI] = 0.0f;
+    }
+    threads.syncThreads();
+ 
+    // First loop of Born energy computation.
+
+    float* forces = &(*threadForce)[threadIndex][0];
+    float preFactor;
+    if (soluteDielectric != 0.0f && solventDielectric != 0.0f)
+        preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
+    else
+        preFactor = 0.0f;
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
+        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
+        float atomCharge[4], atomx[4], atomy[4], atomz[4];
+        int blockMask[4] = {0, 0, 0, 0};
+        fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f), blockAtomBornForce(0.0f);
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            atomx[i] = posq[4*atomIndex];
+            atomy[i] = posq[4*atomIndex+1];
+            atomz[i] = posq[4*atomIndex+2];
+            atomCharge[i] = preFactor*posq[4*atomIndex+3];
+            blockMask[i] = 0xFFFFFFFF;
+        }
+        fvec4 radii(&bornRadii[blockStart]);
+        fvec4 x(atomx);
+        fvec4 y(atomy);
+        fvec4 z(atomz);
+        fvec4 partialChargeI(atomCharge);
+        ivec4 mask(blockMask);
+        for (int atomJ = blockStart; atomJ < numParticles; atomJ++) {
+            fvec4 posJ(posq+4*atomJ);
+            fvec4 dx, dy, dz, r2;
+            getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
+            ivec4 include = mask & (blockAtomIndex <= ivec4(atomJ));
+            if (cutoff)
+                include = include & (r2 < cutoffDistance*cutoffDistance);
+            if (!any(include))
+                continue;
+            fvec4 r = sqrt(r2);
+            fvec4 alpha2_ij = radii*bornRadii[atomJ];
+            fvec4 D_ij = r2/(4.0f*alpha2_ij);
+            fvec4 expTerm(expf(-D_ij[0]), expf(-D_ij[1]), expf(-D_ij[2]), expf(-D_ij[3]));
+            fvec4 denominator2 = r2 + alpha2_ij*expTerm; 
+            fvec4 denominator = sqrt(denominator2);
+            fvec4 Gpol = (partialChargeI*posJ[3])/denominator; 
+            fvec4 dGpol_dr = -Gpol*(1.0f - 0.25f*expTerm)/denominator2;  
+            fvec4 dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f + D_ij)/denominator2;
+            dGpol_dr = blend(0.0f, dGpol_dr, include);
+            dGpol_dalpha2_ij = blend(0.0f, dGpol_dalpha2_ij, include);
+            fvec4 fx = dx*dGpol_dr;
+            fvec4 fy = dy*dGpol_dr;
+            fvec4 fz = dz*dGpol_dr;
+            blockAtomForceX -= fx;
+            blockAtomForceY -= fy;
+            blockAtomForceZ -= fz;
+            blockAtomBornForce += dGpol_dalpha2_ij*bornRadii[atomJ];
+            float* atomForce = forces+4*atomJ;
+            fvec4 one(1.0f);
+            atomForce[0] += dot4(fx, one);
+            atomForce[1] += dot4(fy, one);
+            atomForce[2] += dot4(fz, one);
+            ivec4 atomJMask = include & (blockAtomIndex != ivec4(atomJ));
+            fvec4 termEnergy = blend(0.0f, Gpol, include);
+            if (cutoff)
+                termEnergy -= blend(0.0f, partialChargeI*posJ[3]/cutoffDistance, atomJMask);
+            termEnergy *= blend(0.5f, 1.0f, atomJMask);
+            energy += dot4(termEnergy, one);
+            bornForces[atomJ] += dot4(blend(0.0f, dGpol_dalpha2_ij, atomJMask), radii);
+        }
+        fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+        transpose(f[0], f[1], f[2], f[3]);
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            (fvec4(forces+4*atomIndex)+f[i]).store(forces+4*atomIndex);
+            bornForces[atomIndex] += blockAtomBornForce[i];
+        }
+    }
+    threads.syncThreads();
+
+    // Second loop of Born energy computation.
+
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        fvec4 bornForce(0.0f);
+        for (int i = 0; i < numThreads; i++)
+            bornForce += fvec4(&threadBornForces[i][blockStart]);
+        fvec4 radii(&bornRadii[blockStart]);
+        bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
+        int numInBlock = min(4, numParticles-blockStart);
+        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
+        float atomRadius[4], atomx[4], atomy[4], atomz[4];
+        int blockMask[4] = {0, 0, 0, 0};
+        fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            atomRadius[i] = particleParams[atomIndex].first;
+            atomx[i] = posq[4*atomIndex];
+            atomy[i] = posq[4*atomIndex+1];
+            atomz[i] = posq[4*atomIndex+2];
+            blockMask[i] = 0xFFFFFFFF;
+        }
+        fvec4 offsetRadiusI(atomRadius);
+        fvec4 x(atomx);
+        fvec4 y(atomy);
+        fvec4 z(atomz);
+        ivec4 mask(blockMask);
+        for (int atomJ = 0; atomJ < numParticles; atomJ++) {
+            fvec4 posJ(posq+4*atomJ);
+            fvec4 dx, dy, dz, r2;
+            getDeltaR(posJ, x, y, z, dx, dy, dz, r2, periodic, boxSize, invBoxSize);
+            ivec4 include = mask & (blockAtomIndex != ivec4(atomJ));
+            if (cutoff)
+                include = include & (r2 < cutoffDistance*cutoffDistance);
+            if (!any(include))
+                continue;
+            fvec4 r = sqrt(r2);
+            float scaledRadiusJ = particleParams[atomJ].second;
+            float scaledRadiusJ2 = scaledRadiusJ*scaledRadiusJ;
+            fvec4 rScaledRadiusJ = r + scaledRadiusJ;
+            include = include & (offsetRadiusI < rScaledRadiusJ);
+            fvec4 l_ij = 1.0f/max(offsetRadiusI, abs(r-scaledRadiusJ));
+            fvec4 u_ij = 1.0f/rScaledRadiusJ;
+            fvec4 l_ij2 = l_ij*l_ij;
+            fvec4 u_ij2 = u_ij*u_ij;
+            fvec4 rInverse = 1.0f/r;
+            fvec4 r2Inverse = rInverse*rInverse;
+            fvec4 logRatio = fastLog(u_ij/l_ij);
+            fvec4 t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
+            fvec4 de = bornForce*t3*rInverse;
+            de = blend(0.0f, de, include);
+            fvec4 fx = dx*de;
+            fvec4 fy = dy*de;
+            fvec4 fz = dz*de;
+            blockAtomForceX += fx;
+            blockAtomForceY += fy;
+            blockAtomForceZ += fz;
+            float* atomForce = forces+4*atomJ;
+            fvec4 one(1.0f);
+            atomForce[0] -= dot4(fx, one);
+            atomForce[1] -= dot4(fy, one);
+            atomForce[2] -= dot4(fz, one);
+        }
+        fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+        transpose(f[0], f[1], f[2], f[3]);
+        for (int i = 0; i < numInBlock; i++) {
+            int atomIndex = blockStart+i;
+            (fvec4(forces+4*atomIndex)+f[i]).store(forces+4*atomIndex);
+        }
+    }
+    threadEnergy[threadIndex] = energy;
+}
+
+void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
+fvec4 CpuGBSAOBCForce::fastLog(const fvec4& x) {
+    // Evaluate log(x) using a lookup table for speed.
+
+    if (any((x < TABLE_MIN) | (x >= TABLE_MAX)))
+        return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
+    fvec4 x1 = (x-TABLE_MIN)*logDXInv;
+    ivec4 index = floor(x1);
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&logTable[index[0]]);
+    fvec4 t2(&logTable[index[1]]);
+    fvec4 t3(&logTable[index[2]]);
+    fvec4 t4(&logTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
+}
--- a/platforms/cpu/src/CpuKernelFactory.cpp
+++ b/platforms/cpu/src/CpuKernelFactory.cpp
@@ -38,8 +38,18 @@
 using namespace OpenMM;

 KernelImpl* CpuKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
-    ReferencePlatform::PlatformData& data = *static_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    CpuPlatform::PlatformData& data = CpuPlatform::getPlatformData(context);
+    if (name == CalcForcesAndEnergyKernel::Name())
+        return new CpuCalcForcesAndEnergyKernel(name, platform, data, context);
+    if (name == CalcPeriodicTorsionForceKernel::Name())
+        return new CpuCalcPeriodicTorsionForceKernel(name, platform, data);
+    if (name == CalcRBTorsionForceKernel::Name())
+        return new CpuCalcRBTorsionForceKernel(name, platform, data);
    if (name == CalcNonbondedForceKernel::Name())
-        return new CpuCalcNonbondedForceKernel(name, platform);
+        return new CpuCalcNonbondedForceKernel(name, platform, data);
+    if (name == CalcGBSAOBCForceKernel::Name())
+        return new CpuCalcGBSAOBCForceKernel(name, platform, data);
+    if (name == IntegrateLangevinStepKernel::Name())
+        return new CpuIntegrateLangevinStepKernel(name, platform, data);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str());
 }
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -31,11 +31,17 @@

 #include "CpuKernels.h"
 #include "ReferenceBondForce.h"
+#include "ReferenceConstraints.h"
+#include "ReferenceKernelFactory.h"
+#include "ReferenceKernels.h"
 #include "ReferenceLJCoulomb14.h"
+#include "ReferenceProperDihedralBond.h"
+#include "ReferenceRbDihedralBond.h"
 #include "openmm/Context.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
+#include "openmm/internal/vectorize.h"
 #include "RealVec.h"

 using namespace OpenMM;
@@ -61,6 +67,270 @@ static RealVec& extractBoxSize(ContextImpl& context) {
    return *(RealVec*) data->periodicBoxSize;
 }

+static ReferenceConstraints& extractConstraints(ContextImpl& context) {
+    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    return *(ReferenceConstraints*) data->constraints;
+}
+
+/**
+ * Compute the kinetic energy of the system, possibly shifting the velocities in time to account
+ * for a leapfrog integrator.
+ */
+static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>& masses, double timeShift) {
+    vector<RealVec>& posData = extractPositions(context);
+    vector<RealVec>& velData = extractVelocities(context);
+    vector<RealVec>& forceData = extractForces(context);
+    int numParticles = context.getSystem().getNumParticles();
+    
+    // Compute the shifted velocities.
+    
+    vector<RealVec> shiftedVel(numParticles);
+    for (int i = 0; i < numParticles; ++i) {
+        if (masses[i] > 0)
+            shiftedVel[i] = velData[i]+forceData[i]*(timeShift/masses[i]);
+        else
+            shiftedVel[i] = velData[i];
+    }
+    
+    // Apply constraints to them.
+    
+    vector<double> inverseMasses(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        inverseMasses[i] = (masses[i] == 0 ? 0 : 1/masses[i]);
+    extractConstraints(context).applyToVelocities(posData, shiftedVel, inverseMasses, 1e-4);
+    
+    // Compute the kinetic energy.
+    
+    double energy = 0.0;
+    for (int i = 0; i < numParticles; ++i)
+        if (masses[i] > 0)
+            energy += masses[i]*(shiftedVel[i].dot(shiftedVel[i]));
+    return 0.5*energy;
+}
+
+class CpuCalcForcesAndEnergyKernel::SumForceTask : public ThreadPool::Task {
+public:
+    SumForceTask(int numParticles, vector<RealVec>& forceData, CpuPlatform::PlatformData& data) : numParticles(numParticles), forceData(forceData), data(data) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // Sum the contributions to forces that have been calculated by different threads.
+        
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        for (int i = start; i < end; i++) {
+            fvec4 f(0.0f);
+            for (int j = 0; j < numThreads; j++)
+                f += fvec4(&data.threadForce[j][4*i]);
+            forceData[i][0] += f[0];
+            forceData[i][1] += f[1];
+            forceData[i][2] += f[2];
+        }
+    }
+    int numParticles;
+    vector<RealVec>& forceData;
+    CpuPlatform::PlatformData& data;
+};
+
+class CpuCalcForcesAndEnergyKernel::InitForceTask : public ThreadPool::Task {
+public:
+    InitForceTask(int numParticles, ContextImpl& context, CpuPlatform::PlatformData& data) : numParticles(numParticles), context(context), data(data) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // Convert the positions to single precision and apply periodic boundary conditions
+
+        AlignedArray<float>& posq = data.posq;
+        vector<RealVec>& posData = extractPositions(context);
+        RealVec boxSize = extractBoxSize(context);
+        double invBoxSize[3] = {1/boxSize[0], 1/boxSize[1], 1/boxSize[2]};
+        int numParticles = context.getSystem().getNumParticles();
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        if (data.isPeriodic)
+            for (int i = start; i < end; i++)
+                for (int j = 0; j < 3; j++) {
+                    RealOpenMM x = posData[i][j];
+                    double base = floor(x*invBoxSize[j])*boxSize[j];
+                    posq[4*i+j] = (float) (x-base);
+                }
+        else
+            for (int i = start; i < end; i++) {
+                posq[4*i] = (float) posData[i][0];
+                posq[4*i+1] = (float) posData[i][1];
+                posq[4*i+2] = (float) posData[i][2];
+            }
+
+        // Clear the forces.
+
+        fvec4 zero(0.0f);
+        for (int j = 0; j < numParticles; j++)
+            zero.store(&data.threadForce[threadIndex][j*4]);
+    }
+    int numParticles;
+    ContextImpl& context;
+    CpuPlatform::PlatformData& data;
+};
+
+CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
+        CalcForcesAndEnergyKernel(name, platform), data(data) {
+    // Create a Reference platform version of this kernel.
+    
+    ReferenceKernelFactory referenceFactory;
+    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
+}
+
+void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
+}
+
+void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
+    
+    // Convert positions to single precision and clear the forces.
+    
+    InitForceTask task(context.getSystem().getNumParticles(), context, data);
+    data.threads.execute(task);
+    data.threads.waitForThreads();
+}
+
+double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    // Sum the forces from all the threads.
+    
+    SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
+    data.threads.execute(task);
+    data.threads.waitForThreads();
+    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups);
+}
+
+CpuCalcPeriodicTorsionForceKernel::~CpuCalcPeriodicTorsionForceKernel() {
+    if (torsionIndexArray != NULL) {
+        for (int i = 0; i < numTorsions; i++) {
+            delete[] torsionIndexArray[i];
+            delete[] torsionParamArray[i];
+        }
+        delete[] torsionIndexArray;
+        delete[] torsionParamArray;
+    }
+}
+
+void CpuCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
+    numTorsions = force.getNumTorsions();
+    torsionIndexArray = new int*[numTorsions];
+    for (int i = 0; i < numTorsions; i++)
+        torsionIndexArray[i] = new int[4];
+    torsionParamArray = new RealOpenMM*[numTorsions];
+    for (int i = 0; i < numTorsions; i++)
+        torsionParamArray[i] = new RealOpenMM[3];
+    for (int i = 0; i < numTorsions; ++i) {
+        int particle1, particle2, particle3, particle4, periodicity;
+        double phase, k;
+        force.getTorsionParameters(i, particle1, particle2, particle3, particle4, periodicity, phase, k);
+        torsionIndexArray[i][0] = particle1;
+        torsionIndexArray[i][1] = particle2;
+        torsionIndexArray[i][2] = particle3;
+        torsionIndexArray[i][3] = particle4;
+        torsionParamArray[i][0] = (RealOpenMM) k;
+        torsionParamArray[i][1] = (RealOpenMM) phase;
+        torsionParamArray[i][2] = (RealOpenMM) periodicity;
+    }
+    bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
+}
+
+double CpuCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    vector<RealVec>& posData = extractPositions(context);
+    vector<RealVec>& forceData = extractForces(context);
+    RealOpenMM energy = 0;
+    ReferenceProperDihedralBond periodicTorsionBond;
+    bondForce.calculateForce(posData, torsionParamArray, forceData, includeEnergy ? &energy : NULL, periodicTorsionBond);
+    return energy;
+}
+
+void CpuCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
+    if (numTorsions != force.getNumTorsions())
+        throw OpenMMException("updateParametersInContext: The number of torsions has changed");
+
+    // Record the values.
+
+    for (int i = 0; i < numTorsions; ++i) {
+        int particle1, particle2, particle3, particle4, periodicity;
+        double phase, k;
+        force.getTorsionParameters(i, particle1, particle2, particle3, particle4, periodicity, phase, k);
+        if (particle1 != torsionIndexArray[i][0] || particle2 != torsionIndexArray[i][1] || particle3 != torsionIndexArray[i][2] || particle4 != torsionIndexArray[i][3])
+            throw OpenMMException("updateParametersInContext: The set of particles in a torsion has changed");
+        torsionParamArray[i][0] = (RealOpenMM) k;
+        torsionParamArray[i][1] = (RealOpenMM) phase;
+        torsionParamArray[i][2] = (RealOpenMM) periodicity;
+    }
+}
+
+CpuCalcRBTorsionForceKernel::~CpuCalcRBTorsionForceKernel() {
+    if (torsionIndexArray != NULL) {
+        for (int i = 0; i < numTorsions; i++) {
+            delete[] torsionIndexArray[i];
+            delete[] torsionParamArray[i];
+        }
+        delete[] torsionIndexArray;
+        delete[] torsionParamArray;
+    }
+}
+
+void CpuCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
+    numTorsions = force.getNumTorsions();
+    torsionIndexArray = new int*[numTorsions];
+    for (int i = 0; i < numTorsions; i++)
+        torsionIndexArray[i] = new int[4];
+    torsionParamArray = new RealOpenMM*[numTorsions];
+    for (int i = 0; i < numTorsions; i++)
+        torsionParamArray[i] = new RealOpenMM[6];
+    for (int i = 0; i < numTorsions; ++i) {
+        int particle1, particle2, particle3, particle4;
+        double c0, c1, c2, c3, c4, c5;
+        force.getTorsionParameters(i, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
+        torsionIndexArray[i][0] = particle1;
+        torsionIndexArray[i][1] = particle2;
+        torsionIndexArray[i][2] = particle3;
+        torsionIndexArray[i][3] = particle4;
+        torsionParamArray[i][0] = (RealOpenMM) c0;
+        torsionParamArray[i][1] = (RealOpenMM) c1;
+        torsionParamArray[i][2] = (RealOpenMM) c2;
+        torsionParamArray[i][3] = (RealOpenMM) c3;
+        torsionParamArray[i][4] = (RealOpenMM) c4;
+        torsionParamArray[i][5] = (RealOpenMM) c5;
+    }
+    bondForce.initialize(system.getNumParticles(), numTorsions, 4, torsionIndexArray, data.threads);
+}
+
+double CpuCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    vector<RealVec>& posData = extractPositions(context);
+    vector<RealVec>& forceData = extractForces(context);
+    RealOpenMM energy = 0;
+    ReferenceRbDihedralBond rbTorsionBond;
+    bondForce.calculateForce(posData, torsionParamArray, forceData, includeEnergy ? &energy : NULL, rbTorsionBond);
+    return energy;
+}
+
+void CpuCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
+    if (numTorsions != force.getNumTorsions())
+        throw OpenMMException("updateParametersInContext: The number of torsions has changed");
+
+    // Record the values.
+
+    for (int i = 0; i < numTorsions; ++i) {
+        int particle1, particle2, particle3, particle4;
+        double c0, c1, c2, c3, c4, c5;
+        force.getTorsionParameters(i, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
+        if (particle1 != torsionIndexArray[i][0] || particle2 != torsionIndexArray[i][1] || particle3 != torsionIndexArray[i][2] || particle4 != torsionIndexArray[i][3])
+            throw OpenMMException("updateParametersInContext: The set of particles in a torsion has changed");
+        torsionParamArray[i][0] = (RealOpenMM) c0;
+        torsionParamArray[i][1] = (RealOpenMM) c1;
+        torsionParamArray[i][2] = (RealOpenMM) c2;
+        torsionParamArray[i][3] = (RealOpenMM) c3;
+        torsionParamArray[i][4] = (RealOpenMM) c4;
+        torsionParamArray[i][5] = (RealOpenMM) c5;
+    }
+}
+
 class CpuCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
 public:
    PmeIO(float* posq, float* force, int numParticles) : posq(posq), force(force), numParticles(numParticles) {
@@ -81,15 +351,35 @@ private:
    int numParticles;
 };

+bool isVec8Supported();
+CpuNonbondedForce* createCpuNonbondedForceVec4();
+CpuNonbondedForce* createCpuNonbondedForceVec8();
+
+CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), neighborList(NULL), nonbonded(NULL) {
+    if (isVec8Supported()) {
+        neighborList = new CpuNeighborList(8);
+        nonbonded = createCpuNonbondedForceVec8();
+    }
+    else {
+        neighborList = new CpuNeighborList(4);
+        nonbonded = createCpuNonbondedForceVec4();
+    }
+}
+
 CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
    if (bonded14ParamArray != NULL) {
        for (int i = 0; i < num14; i++) {
            delete[] bonded14IndexArray[i];
            delete[] bonded14ParamArray[i];
        }
-        delete bonded14IndexArray;
-        delete bonded14ParamArray;
+        delete[] bonded14IndexArray;
+        delete[] bonded14ParamArray;
    }
+    if (nonbonded != NULL)
+        delete nonbonded;
+    if (neighborList != NULL)
+        delete neighborList;
 }

 void CpuCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
@@ -97,8 +387,6 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    // Identify which exceptions are 1-4 interactions.

    numParticles = force.getNumParticles();
-    posq.resize(4*numParticles, 0);
-    forces.resize(4*numParticles, 0);
    exclusions.resize(numParticles);
    vector<int> nb14s;
    for (int i = 0; i < force.getNumExceptions(); i++) {
@@ -125,7 +413,7 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
-        posq[4*i+3] = (float) charge;
+        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
        sumSquaredCharges += charge*charge;
    }
@@ -173,6 +461,7 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    else
        dispersionCoefficient = 0.0;
    lastPositions.resize(numParticles, Vec3(1e10, 1e10, 1e10));
+    data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
 }

 double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
@@ -192,32 +481,14 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            }
        }
    }
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    vector<RealVec>& forceData = extractForces(context);
    RealVec boxSize = extractBoxSize(context);
    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
-    double energy = ewaldSelfEnergy;
-    bool periodic = (nonbondedMethod == CutoffPeriodic);
+    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    bool ewald  = (nonbondedMethod == Ewald);
    bool pme  = (nonbondedMethod == PME);
-    
-    // Convert the positions to single precision.
-    
-    if (periodic)
-        for (int i = 0; i < numParticles; i++)
-            for (int j = 0; j < 3; j++) {
-                RealOpenMM x = posData[i][j];
-                double base = floor(x/boxSize[j]+0.5)*boxSize[j];
-                posq[4*i+j] = (float) (x-base);
-            }
-    else
-        for (int i = 0; i < numParticles; i++) {
-            posq[4*i] = (float) posData[i][0];
-            posq[4*i+1] = (float) posData[i][1];
-            posq[4*i+2] = (float) posData[i][2];
-        }
-    for (int i = 0; i < 4*numParticles; i++)
-        forces[i] = 0.0f;
    if (nonbondedMethod != NoCutoff) {
        // Determine whether we need to recompute the neighbor list.
        
@@ -244,6 +515,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo

            int numMoved = moved.size();
            double cutoff2 = nonbondedCutoff*nonbondedCutoff;
+            double paddedCutoff2 = (nonbondedCutoff+padding)*(nonbondedCutoff+padding);
            for (int i = 1; i < numMoved && !needRecompute; i++)
                for (int j = 0; j < i; j++) {
                    RealVec delta = posData[moved[i]]-posData[moved[j]];
@@ -251,7 +523,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                        // These particles should interact.  See if they are in the neighbor list.
                        
                        RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
-                        if (oldDelta.dot(oldDelta) > cutoff2) {
+                        if (oldDelta.dot(oldDelta) > paddedCutoff2) {
                            needRecompute = true;
                            break;
                        }
@@ -259,47 +531,42 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                }
        }
        if (needRecompute) {
-            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
+            neighborList->computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
            lastPositions = posData;
        }
-        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
+        nonbonded->setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
    }
-    if (periodic || ewald || pme) {
+    if (data.isPeriodic) {
        double minAllowedSize = 1.999999*nonbondedCutoff;
        if (boxSize[0] < minAllowedSize || boxSize[1] < minAllowedSize || boxSize[2] < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
-        nonbonded.setPeriodic(floatBoxSize);
+        nonbonded->setPeriodic(floatBoxSize);
    }
    if (ewald)
-        nonbonded.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
+        nonbonded->setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
    if (pme)
-        nonbonded.setUsePME(ewaldAlpha, gridSize);
+        nonbonded->setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
-        nonbonded.setUseSwitchingFunction(switchingDistance);
-    float nonbondedEnergy = 0;
+        nonbonded->setUseSwitchingFunction(switchingDistance);
+    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded.calculateDirectIxn(numParticles, &posq[0], particleParams, exclusions, &forces[0], includeEnergy ? &nonbondedEnergy : NULL);
+        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
-            PmeIO io(&posq[0], &forces[0], numParticles);
+            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
            Vec3 periodicBoxSize(boxSize[0], boxSize[1], boxSize[2]);
            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, periodicBoxSize, includeEnergy);
-            optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
-            nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
+            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
-    for (int i = 0; i < numParticles; i++) {
-        forceData[i][0] += forces[4*i];
-        forceData[i][1] += forces[4*i+1];
-        forceData[i][2] += forces[4*i+2];
-    }
    if (includeDirect) {
        ReferenceBondForce refBondForce;
        ReferenceLJCoulomb14 nonbonded14;
        refBondForce.calculateForce(num14, bonded14IndexArray, posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
-        if (periodic || ewald || pme)
+        if (data.isPeriodic)
            energy += dispersionCoefficient/(boxSize[0]*boxSize[1]*boxSize[2]);
    }
    return energy;
@@ -325,7 +592,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
-        posq[4*i+3] = (float) charge;
+        data.posq[4*i+3] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
        sumSquaredCharges += charge*charge;
    }
@@ -350,3 +617,94 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    if (force.getUseDispersionCorrection() && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
 }
+
+CpuCalcGBSAOBCForceKernel::~CpuCalcGBSAOBCForceKernel() {
+}
+
+void CpuCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
+    int numParticles = system.getNumParticles();
+    particleParams.resize(numParticles);
+    for (int i = 0; i < numParticles; ++i) {
+        double charge, radius, scalingFactor;
+        force.getParticleParameters(i, charge, radius, scalingFactor);
+        data.posq[4*i+3] = (float) charge;
+        radius -= 0.009;
+        particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
+    }
+    obc.setParticleParameters(particleParams);
+    obc.setSolventDielectric((float) force.getSolventDielectric());
+    obc.setSoluteDielectric((float) force.getSoluteDielectric());
+    if (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff)
+        obc.setUseCutoff((float) force.getCutoffDistance());
+    data.isPeriodic = (force.getNonbondedMethod() == GBSAOBCForce::CutoffPeriodic);
+}
+
+double CpuCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (data.isPeriodic) {
+        RealVec& boxSize = extractBoxSize(context);
+        float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
+        obc.setPeriodic(floatBoxSize);
+    }
+    double energy = 0.0;
+    obc.computeForce(data.posq, data.threadForce, includeEnergy ? &energy : NULL, data.threads);
+    return energy;
+}
+
+void CpuCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
+    int numParticles = force.getNumParticles();
+    if (numParticles != obc.getParticleParameters().size())
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+
+    // Record the values.
+
+    for (int i = 0; i < numParticles; ++i) {
+        double charge, radius, scalingFactor;
+        force.getParticleParameters(i, charge, radius, scalingFactor);
+        data.posq[4*i+3] = (float) charge;
+        radius -= 0.009;
+        particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
+    }
+    obc.setParticleParameters(particleParams);
+}
+
+CpuIntegrateLangevinStepKernel::~CpuIntegrateLangevinStepKernel() {
+    if (dynamics)
+        delete dynamics;
+}
+
+void CpuIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
+    int numParticles = system.getNumParticles();
+    masses.resize(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        masses[i] = static_cast<RealOpenMM>(system.getParticleMass(i));
+    data.random.initialize(integrator.getRandomNumberSeed(), data.threads.getNumThreads());
+}
+
+void CpuIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
+    double temperature = integrator.getTemperature();
+    double friction = integrator.getFriction();
+    double stepSize = integrator.getStepSize();
+    vector<RealVec>& posData = extractPositions(context);
+    vector<RealVec>& velData = extractVelocities(context);
+    vector<RealVec>& forceData = extractForces(context);
+    if (dynamics == 0 || temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+        // Recreate the computation objects with the new parameters.
+        
+        if (dynamics)
+            delete dynamics;
+        RealOpenMM tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+        dynamics = new CpuLangevinDynamics(context.getSystem().getNumParticles(), stepSize, tau, temperature, data.threads, data.random);
+        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
+        prevTemp = temperature;
+        prevFriction = friction;
+        prevStepSize = stepSize;
+    }
+    dynamics->update(context.getSystem(), posData, velData, forceData, masses, integrator.getConstraintTolerance());
+    ReferencePlatform::PlatformData* refData = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    refData->time += stepSize;
+    refData->stepCount++;
+}
+
+double CpuIntegrateLangevinStepKernel::computeKineticEnergy(ContextImpl& context, const LangevinIntegrator& integrator) {
+    return computeShiftedKineticEnergy(context, masses, 0.5*integrator.getStepSize());
+}
--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Authors: Peter Eastman
+ * Contributors: 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMLog.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuLangevinDynamics.h"
+
+using namespace OpenMM;
+using namespace std;
+
+class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
+public:
+    Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadUpdate1(threadIndex);
+    }
+    CpuLangevinDynamics& owner;
+};
+
+class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
+public:
+    Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadUpdate2(threadIndex);
+    }
+    CpuLangevinDynamics& owner;
+};
+
+CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
+           ReferenceStochasticDynamics(numberOfAtoms, deltaT, tau, temperature), threads(threads), random(random) {
+}
+
+CpuLangevinDynamics::~CpuLangevinDynamics() {
+}
+
+void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
+                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+    // Record the parameters for the threads.
+    
+    this->numberOfAtoms = numberOfAtoms;
+    this->atomCoordinates = &atomCoordinates[0];
+    this->velocities = &velocities[0];
+    this->forces = &forces[0];
+    this->inverseMasses = &inverseMasses[0];
+    this->xPrime = &xPrime[0];
+    
+    // Signal the threads to start running and wait for them to finish.
+    
+    Update1Task task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
+}
+
+void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
+                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+    // Record the parameters for the threads.
+    
+    this->numberOfAtoms = numberOfAtoms;
+    this->atomCoordinates = &atomCoordinates[0];
+    this->velocities = &velocities[0];
+    this->forces = &forces[0];
+    this->inverseMasses = &inverseMasses[0];
+    this->xPrime = &xPrime[0];
+    
+    // Signal the threads to start running and wait for them to finish.
+    
+    Update2Task task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
+}
+
+void CpuLangevinDynamics::threadUpdate1(int threadIndex) {
+    const RealOpenMM tau = getTau();
+    const RealOpenMM vscale = EXP(-getDeltaT()/tau);
+    const RealOpenMM fscale = (1-vscale)*tau;
+    const RealOpenMM kT = BOLTZ*getTemperature();
+    const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
+    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
+    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
+
+    for (int i = start; i < end; i++) {
+        if (inverseMasses[i] != 0.0) {
+            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
+            RealVec noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
+            velocities[i]  = velocities[i]*vscale + forces[i]*(fscale*inverseMasses[i]) + noise*(noisescale*sqrtInvMass);
+        }
+   }
+}
+
+void CpuLangevinDynamics::threadUpdate2(int threadIndex) {
+    const RealOpenMM dt = getDeltaT();
+    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
+    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
+
+    for (int i = start; i < end; i++) {
+        if (inverseMasses[i] != 0.0) {
+            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
+            xPrime[i] = atomCoordinates[i]+velocities[i]*dt;
+        }
+   }
+}
--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
 #include "CpuNeighborList.h"
 #include "openmm/internal/hardware.h"
 #include "openmm/internal/vectorize.h"
@@ -12,27 +43,25 @@ using namespace std;

 namespace OpenMM {

-const int CpuNeighborList::BlockSize = 4;
-
 class VoxelIndex 
 {
 public:
+    VoxelIndex() : x(0), y(0) {
+    }
    VoxelIndex(int x, int y) : x(x), y(y) {
    }
    int x;
    int y;
 };

-typedef pair<const float*, int> VoxelItem;
-
 /**
 * This data structure organizes the particles spatially.  It divides them into bins along the x and y axes,
 * then sorts each bin along the z axis so ranges can be identified quickly with a binary search.
 */
 class CpuNeighborList::Voxels {
 public:
-    Voxels(float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
-            voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
+    Voxels(int blockSize, float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
+            blockSize(blockSize), voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
        if (usePeriodic) {
            nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f);
            ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f);
@@ -60,7 +89,7 @@ public:
     */
    void insert(const int& atom, const float* location) {
        VoxelIndex voxelIndex = getVoxelIndex(location);
-        bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], VoxelItem(location, atom)));
+        bins[voxelIndex.x][voxelIndex.y].push_back(make_pair(location[2], atom));
    }
    
    /**
@@ -76,7 +105,7 @@ public:
     * Find the index of the first particle in voxel (x,y) whose z coordinate in >= the specified value.
     */
    int findLowerBound(int x, int y, double z) const {
-        const vector<pair<float, VoxelItem> >& bin = bins[x][y];
+        const vector<pair<float, int> >& bin = bins[x][y];
        int lower = 0;
        int upper = bin.size();
        while (lower < upper) {
@@ -93,7 +122,7 @@ public:
     * Find the index of the first particle in voxel (x,y) whose z coordinate in greater than the specified value.
     */
    int findUpperBound(int x, int y, double z) const {
-        const vector<pair<float, VoxelItem> >& bin = bins[x][y];
+        const vector<pair<float, int> >& bin = bins[x][y];
        int lower = 0;
        int upper = bin.size();
        while (lower < upper) {
@@ -125,7 +154,7 @@ public:
        return VoxelIndex(x, y);
    }

-    void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
+    void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
        neighbors.resize(0);
        exclusions.resize(0);
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
@@ -135,8 +164,12 @@ public:
        float refineCutoff = maxDistance-max(max(blockWidth[0], blockWidth[1]), blockWidth[2]);
        float refineCutoffSquared = refineCutoff*refineCutoff;

-        int dIndexX = min(nx/2, int((maxDistance+blockWidth[0])/voxelSizeX)+1); // How may voxels away do we have to look?
-        int dIndexY = min(ny/2, int((maxDistance+blockWidth[1])/voxelSizeY)+1);
+        int dIndexX = int((maxDistance+blockWidth[0])/voxelSizeX)+1; // How may voxels away do we have to look?
+        int dIndexY = int((maxDistance+blockWidth[1])/voxelSizeY)+1;
+        if (usePeriodic) {
+            dIndexX = min(nx/2, dIndexX);
+            dIndexY = min(ny/2, dIndexY);
+        }
        float centerPos[4];
        blockCenter.store(centerPos);
        VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
@@ -148,63 +181,87 @@ public:
        if (usePeriodic) {
            endx = min(endx, centerVoxelIndex.x-dIndexX+nx-1);
            endy = min(endy, centerVoxelIndex.y-dIndexY+ny-1);
-            numRanges = 2;
        }
        else {
            startx = max(startx, 0);
            starty = max(starty, 0);
            endx = min(endx, nx-1);
            endy = min(endy, ny-1);
-            numRanges = 1;
        }
-        int lastSortedIndex = BlockSize*(blockIndex+1);
+        int lastSortedIndex = blockSize*(blockIndex+1);
        VoxelIndex voxelIndex(0, 0);
        for (int x = startx; x <= endx; ++x) {
            voxelIndex.x = x;
            if (usePeriodic)
                voxelIndex.x = (x < 0 ? x+nx : (x >= nx ? x-nx : x));
-            float dx = max(0.0f, voxelSizeX*max(0, abs(centerVoxelIndex.x-x)-1)-blockWidth[0]);
            for (int y = starty; y <= endy; ++y) {
                voxelIndex.y = y;
                if (usePeriodic)
                    voxelIndex.y = (y < 0 ? y+ny : (y >= ny ? y-ny : y));
-                float dy = max(0.0f, voxelSizeY*max(0, abs(centerVoxelIndex.y-y)-1)-blockWidth[1]);
                
                // Identify the range of atoms within this bin we need to search.  When using periodic boundary
                // conditions, there may be two separate ranges.
                
-                float dz = maxDistance+blockWidth[2];
-                dz = sqrtf(max(0.0f, dz*dz-dx*dx-dy*dy));
+                float minz = centerPos[2];
+                float maxz = centerPos[2];
+                fvec4 offset(voxelSizeX*x+(usePeriodic ? 0.0f : minx), voxelSizeY*y+(usePeriodic ? 0.0f : miny), 0, 0);
+                for (int k = 0; k < (int) blockAtoms.size(); k++) {
+                    const float* atomPos = &atomLocations[4*blockAtoms[k]];
+                    fvec4 posVec(atomPos);
+                    fvec4 delta1 = offset-posVec;
+                    fvec4 delta2 = delta1+fvec4(voxelSizeX, voxelSizeY, 0, 0);
+                    if (usePeriodic) {
+                        delta1 -= round(delta1*invBoxSize)*boxSize;
+                        delta2 -= round(delta2*invBoxSize)*boxSize;
+                    }
+                    fvec4 delta = min(abs(delta1), abs(delta2));
+                    float dx = (x == atomVoxelIndex[k].x ? 0.0f : delta[0]);
+                    float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
+                    float dist2 = maxDistanceSquared-dx*dx-dy*dy;
+                    if (dist2 > 0) {
+                        float dist = sqrtf(dist2);
+                        minz = min(minz, atomPos[2]-dist);
+                        maxz = max(maxz, atomPos[2]+dist);
+                    }
+                }
+                if (minz == maxz)
+                    continue;
+                bool needPeriodic = (centerPos[0]-blockWidth[0] < maxDistance || centerPos[0]+blockWidth[0] > periodicBoxSize[0]-maxDistance ||
+                                     centerPos[1]-blockWidth[1] < maxDistance || centerPos[1]+blockWidth[1] > periodicBoxSize[1]-maxDistance ||
+                                     minz < 0.0f || maxz > periodicBoxSize[2]);
                int rangeStart[2];
                int rangeEnd[2];
-                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz);
-                if (usePeriodic) {
-                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                rangeStart[0] = findLowerBound(voxelIndex.x, voxelIndex.y, minz);
+                if (needPeriodic) {
+                    numRanges = 2;
+                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
                    if (rangeStart[0] > 0) {
                        rangeStart[1] = 0;
-                        rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz-periodicBoxSize[2]), rangeStart[0]);
+                        rangeEnd[1] = min(findUpperBound(voxelIndex.x, voxelIndex.y, maxz-periodicBoxSize[2]), rangeStart[0]);
                    }
                    else {
-                        rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, centerPos[2]-dz+periodicBoxSize[2]), rangeEnd[0]);
+                        rangeStart[1] = max(findLowerBound(voxelIndex.x, voxelIndex.y, minz+periodicBoxSize[2]), rangeEnd[0]);
                        rangeEnd[1] = bins[voxelIndex.x][voxelIndex.y].size();
                    }
                }
-                else
-                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, centerPos[2]+dz);
+                else {
+                    numRanges = 1;
+                    rangeEnd[0] = findUpperBound(voxelIndex.x, voxelIndex.y, maxz);
+                }
                
                // Loop over atoms and check to see if they are neighbors of this block.
                
                for (int range = 0; range < numRanges; range++) {
                    for (int item = rangeStart[range]; item < rangeEnd[range]; item++) {
-                        const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second.second;
+                        const int sortedIndex = bins[voxelIndex.x][voxelIndex.y][item].second;

                        // Avoid duplicate entries.
                        if (sortedIndex >= lastSortedIndex)
                            continue;
                        
-                        fvec4 atomPos(bins[voxelIndex.x][voxelIndex.y][item].second.first);
+                        fvec4 atomPos(atomLocations+4*sortedAtoms[sortedIndex]);
                        fvec4 delta = atomPos-centerPos;
-                        if (usePeriodic) {
+                        if (needPeriodic) {
                            fvec4 base = round(delta*invBoxSize)*boxSize;
                            delta = delta-base;
                        }
@@ -221,7 +278,7 @@ public:
                            for (int k = 0; k < (int) blockAtoms.size(); k++) {
                                fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
                                delta = atomPos-pos1;
-                                if (usePeriodic) {
+                                if (needPeriodic) {
                                    fvec4 base = round(delta*invBoxSize)*boxSize;
                                    delta = delta-base;
                                }
@@ -238,10 +295,12 @@ public:
                        // Add this atom to the list of neighbors.
                        
                        neighbors.push_back(sortedAtoms[sortedIndex]);
-                        if (sortedIndex < BlockSize*blockIndex)
+                        if (sortedIndex < blockSize*blockIndex)
                            exclusions.push_back(0);
-                        else
-                            exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
+                        else {
+                            int mask = (1<<blockSize)-1;
+                            exclusions.push_back(mask & (mask<<(sortedIndex-blockSize*blockIndex)));
+                        }
                    }
                }
            }
@@ -249,63 +308,31 @@ public:
    }

 private:
+    int blockSize;
    float voxelSizeX, voxelSizeY;
    float minx, maxx, miny, maxy;
    int nx, ny;
    const float* periodicBoxSize;
    const bool usePeriodic;
-    vector<vector<vector<pair<float, VoxelItem> > > > bins;
+    vector<vector<vector<pair<float, int> > > > bins;
 };

-class CpuNeighborList::ThreadData {
+class CpuNeighborList::ThreadTask : public ThreadPool::Task {
 public:
-    ThreadData(int index, CpuNeighborList& owner) : index(index), owner(owner) {
+    ThreadTask(CpuNeighborList& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeNeighborList(threads, threadIndex);
    }
-    int index;
    CpuNeighborList& owner;
 };

-static void* threadBody(void* args) {
-    CpuNeighborList::ThreadData& data = *reinterpret_cast<CpuNeighborList::ThreadData*>(args);
-    data.owner.runThread(data.index);
-    delete &data;
-    return 0;
+CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }

-CpuNeighborList::CpuNeighborList() {
-    isDeleted = false;
-    numThreads = getNumProcessors();
-    pthread_cond_init(&startCondition, NULL);
-    pthread_cond_init(&endCondition, NULL);
-    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(i, *this);
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
-    }
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
-
-CpuNeighborList::~CpuNeighborList() {
-    isDeleted = true;
-    pthread_mutex_lock(&lock);
-    pthread_cond_broadcast(&startCondition);
-    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
-    pthread_mutex_destroy(&lock);
-    pthread_cond_destroy(&startCondition);
-    pthread_cond_destroy(&endCondition);
-}
-
-void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
-            const float* periodicBoxSize, bool usePeriodic, float maxDistance) {
-    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
+void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
+            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
+    int numBlocks = (numAtoms+blockSize-1)/blockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
    sortedAtoms.resize(numAtoms);
@@ -338,8 +365,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    // Sort the atoms based on a Hilbert curve.
    
    atomBins.resize(numAtoms);
-    pthread_mutex_lock(&lock);
-    advanceThreads();
+    ThreadTask task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());

    // Build the voxel hash.
@@ -348,10 +376,10 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    if (!usePeriodic)
        edgeSizeX = edgeSizeY = maxDistance; // TODO - adjust this as needed
    else {
-        edgeSizeX = 0.5f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
-        edgeSizeY = 0.5f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
+        edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
+        edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
    }
-    Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
+    Voxels voxels(blockSize, edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
    for (int i = 0; i < numAtoms; i++) {
        int atomIndex = atomBins[i].second;
        sortedAtoms[i] = atomIndex;
@@ -362,14 +390,14 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
    
    // Signal the threads to start running and wait for them to finish.
    
-    advanceThreads();
-    pthread_mutex_unlock(&lock);
+    threads.resumeThreads();
+    threads.waitForThreads();
    
    // Add padding atoms to fill up the last block.
    
-    int numPadding = numBlocks*BlockSize-numAtoms;
+    int numPadding = numBlocks*blockSize-numAtoms;
    if (numPadding > 0) {
-        char mask = (0xF0 >> numPadding) & 0xF;
+        char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding);
        for (int i = 0; i < numPadding; i++)
            sortedAtoms.push_back(0);
        vector<char>& exc = blockExclusions[blockExclusions.size()-1];
@@ -379,7 +407,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& ato
 }

 int CpuNeighborList::getNumBlocks() const {
-    return sortedAtoms.size()/BlockSize;
+    return sortedAtoms.size()/blockSize;
 }

 const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
@@ -395,82 +423,59 @@ const std::vector<char>& CpuNeighborList::getBlockExclusions(int blockIndex) con
    
 }

-void CpuNeighborList::runThread(int index) {
-    while (true) {
-        // Wait for the signal to start running.
-        
-        threadWait();
-        if (isDeleted)
-            break;
-        
-        // Compute the positions of atoms along the Hilbert curve.
+void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadIndex) {
+    // Compute the positions of atoms along the Hilbert curve.

-        float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
-        float invBinWidth = 1.0f/binWidth;
-        bitmask_t coords[3];
-        for (int i = index; i < numAtoms; i += numThreads) {
-            const float* pos = &atomLocations[4*i];
-            coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
-            coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
-            coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
-            int bin = (int) hilbert_c2i(3, 8, coords);
-            atomBins[i] = pair<int, int>(bin, i);
-        }
-        threadWait();
-        
-        // Compute this thread's subset of neighbors.
-        
-        int numBlocks = blockNeighbors.size();
-        vector<int> blockAtoms;
-        for (int i = index; i < numBlocks; i += numThreads) {
-            {
-            int firstIndex = BlockSize*i;
-            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
-            blockAtoms.resize(atomsInBlock);
-            for (int j = 0; j < atomsInBlock; j++)
-                blockAtoms[j] = sortedAtoms[firstIndex+j];
-            }
+    float binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0f;
+    float invBinWidth = 1.0f/binWidth;
+    bitmask_t coords[3];
+    int numThreads = threads.getNumThreads();
+    for (int i = threadIndex; i < numAtoms; i += numThreads) {
+        const float* pos = &atomLocations[4*i];
+        coords[0] = (bitmask_t) ((pos[0]-minx)*invBinWidth);
+        coords[1] = (bitmask_t) ((pos[1]-miny)*invBinWidth);
+        coords[2] = (bitmask_t) ((pos[2]-minz)*invBinWidth);
+        int bin = (int) hilbert_c2i(3, 8, coords);
+        atomBins[i] = pair<int, int>(bin, i);
+    }
+    threads.syncThreads();

-                        
-            int firstIndex = BlockSize*i;
-            fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
-            fvec4 maxPos = minPos;
-            int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
-            for (int j = 1; j < atomsInBlock; j++) {
-                fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
-                minPos = min(minPos, pos);
-                maxPos = max(maxPos, pos);
-            }
-            voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
-            
-            // Record the exclusions for this block.
-            
-            for (int j = 0; j < atomsInBlock; j++) {
-                const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
-                char mask = 1<<j;
-                for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
-                    int atomIndex = blockNeighbors[i][k];
-                    if (atomExclusions.find(atomIndex) != atomExclusions.end())
-                        blockExclusions[i][k] |= mask;
-                }
-            }
+    // Compute this thread's subset of neighbors.
+
+    int numBlocks = blockNeighbors.size();
+    vector<int> blockAtoms;
+    vector<VoxelIndex> atomVoxelIndex;
+    for (int i = threadIndex; i < numBlocks; i += numThreads) {
+        // Find the atoms in this block and compute their bounding box.
+        
+        int firstIndex = blockSize*i;
+        int atomsInBlock = min(blockSize, numAtoms-firstIndex);
+        blockAtoms.resize(atomsInBlock);
+        atomVoxelIndex.resize(atomsInBlock);
+        for (int j = 0; j < atomsInBlock; j++) {
+            blockAtoms[j] = sortedAtoms[firstIndex+j];
+            atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
        }
-    }
-}
+        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
+        fvec4 maxPos = minPos;
+        for (int j = 1; j < atomsInBlock; j++) {
+            fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
+            minPos = min(minPos, pos);
+            maxPos = max(maxPos, pos);
+        }
+        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);

-void CpuNeighborList::threadWait() {
-    pthread_mutex_lock(&lock);
-    waitCount++;
-    pthread_cond_signal(&endCondition);
-    pthread_cond_wait(&startCondition, &lock);
-    pthread_mutex_unlock(&lock);
-}
+        // Record the exclusions for this block.

-void CpuNeighborList::advanceThreads() {
-    waitCount = 0;
-    pthread_cond_broadcast(&startCondition);
-    while (waitCount < numThreads) {
-        pthread_cond_wait(&endCondition, &lock);
+        for (int j = 0; j < atomsInBlock; j++) {
+            const set<int>& atomExclusions = (*exclusions)[sortedAtoms[firstIndex+j]];
+            char mask = 1<<j;
+            for (int k = 0; k < (int) blockNeighbors[i].size(); k++) {
+                int atomIndex = blockNeighbors[i][k];
+                if (atomExclusions.find(atomIndex) != atomExclusions.end())
+                    blockExclusions[i][k] |= mask;
+            }
+        }
    }
 }


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -22,7 +22,6 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include <string.h>
 #include <complex>

 #include "SimTKOpenMMCommon.h"
@@ -30,9 +29,8 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/hardware.h"
-#include "openmm/internal/SplineFitter.h"
-#include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
+#include <algorithm>

 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -42,80 +40,43 @@ using namespace std;
 using namespace OpenMM;

 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
-const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
+const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;

-class CpuNonbondedForce::ThreadData {
+class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
 public:
-    ThreadData(int index, CpuNonbondedForce& owner) : index(index), owner(owner) {
+    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadComputeDirect(threads, threadIndex);
    }
-    int index;
    CpuNonbondedForce& owner;
-    vector<float> threadForce;
-    double threadEnergy;
 };

-static void* threadBody(void* args) {
-    CpuNonbondedForce::ThreadData& data = *reinterpret_cast<CpuNonbondedForce::ThreadData*>(args);
-    data.owner.runThread(data.index, data.threadForce, data.threadEnergy);
-    delete &data;
-    return 0;
-}
-
 /**---------------------------------------------------------------------------------------

   CpuNonbondedForce constructor

   --------------------------------------------------------------------------------------- */

-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
-    isDeleted = false;
-    numThreads = getNumProcessors();
-    pthread_cond_init(&startCondition, NULL);
-    pthread_cond_init(&endCondition, NULL);
-    pthread_mutex_init(&lock, NULL);
-    thread.resize(numThreads);
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    for (int i = 0; i < numThreads; i++) {
-        ThreadData* data = new ThreadData(i, *this);
-        threadData.push_back(data);
-        pthread_create(&thread[i], NULL, threadBody, data);
-    }
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
 }

-/**---------------------------------------------------------------------------------------
-
-   CpuNonbondedForce destructor
-
-   --------------------------------------------------------------------------------------- */
-
-CpuNonbondedForce::~CpuNonbondedForce(){
-    isDeleted = true;
-    pthread_mutex_lock(&lock);
-    pthread_cond_broadcast(&startCondition);
-    pthread_mutex_unlock(&lock);
-    for (int i = 0; i < (int) thread.size(); i++)
-        pthread_join(thread[i], NULL);
-    pthread_mutex_destroy(&lock);
-    pthread_cond_destroy(&startCondition);
-    pthread_cond_destroy(&endCondition);
+CpuNonbondedForce::~CpuNonbondedForce() {
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

-     Set the force to use a cutoff.
+   Set the force to use a cutoff.

-     @param distance            the cutoff distance
-     @param neighbors           the neighbor list to use
-     @param solventDielectric   the dielectric constant of the bulk solvent
+   @param distance            the cutoff distance
+   @param neighbors           the neighbor list to use
+   @param solventDielectric   the dielectric constant of the bulk solvent

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
-
+void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+    if (distance != cutoffDistance)
+        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
@@ -170,6 +131,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */

  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      numRx = kmaxx;
      numRy = kmaxy;
@@ -188,6 +151,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */

  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      meshDim[0] = meshSize[0];
      meshDim[1] = meshSize[1];
@@ -198,35 +163,27 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

  
 void CpuNonbondedForce::tabulateEwaldScaleFactor() {
-    ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
+    if (tableIsValid)
+        return;
+    tableIsValid = true;
+    ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
    ewaldDXInv = 1.0f/ewaldDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS+1);
-    vector<double> deriv;
-    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
+    ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
+    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
+        double r = i*ewaldDX;
        double alphaR = alphaEwald*r;
-        x[i] = r;
-        y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        ewaldScaleTable[4*i] = (float) y[i];
-        ewaldScaleTable[4*i+1] = (float) y[i+1];
-        ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
-        ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
+        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
 }
  
-void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, vector<RealVec>& atomCoordinates,
+void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
                                             const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
-                                             vector<RealVec>& forces, float* totalEnergy) const {
+                                             vector<RealVec>& forces, double* totalEnergy) const {
    typedef std::complex<float> d_complex;

    static const float epsilon     =  1.0;

-    int kmax                            = (ewald ? std::max(numRx, std::max(numRy,numRz)) : 0);
+    int kmax                       = (ewald ? max(numRx, max(numRy,numRz)) : 0);
    float factorEwald              = -1 / (4*alphaEwald*alphaEwald);
    float TWO_PI                   = 2.0 * PI_M;
    float recipCoeff               = (float)(ONE_4PI_EPS0*4*PI_M/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
@@ -333,111 +290,107 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, v
 }


-void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, float* forces, float* totalEnergy) {
+void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
+                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
    this->posq = posq;
+    this->atomCoordinates = &atomCoordinates[0];
    this->atomParameters = &atomParameters[0];
    this->exclusions = &exclusions[0];
+    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
+    threadEnergy.resize(threads.getNumThreads());
+    gmx_atomic_t counter;
+    gmx_atomic_set(&counter, 0);
+    this->atomicCounter = &counter;
    
    // Signal the threads to start running and wait for them to finish.
    
-    pthread_mutex_lock(&lock);
-    waitCount = 0;
-    pthread_cond_broadcast(&startCondition);
-    while (waitCount < numThreads)
-        pthread_cond_wait(&endCondition, &lock);
-    pthread_mutex_unlock(&lock);
+    ComputeDirectTask task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
    
-    // Combine the results from all the threads.
+    // Combine the energies from all the threads.
    
-    double directEnergy = 0;
-    for (int i = 0; i < numThreads; i++)
-        directEnergy += threadData[i]->threadEnergy;
-    for (int i = 0; i < numberOfAtoms; i++) {
-        fvec4 f(forces+4*i);
-        for (int j = 0; j < numThreads; j++)
-            f += fvec4(&threadData[j]->threadForce[4*i]);
-        f.store(forces+4*i);
+    if (totalEnergy != NULL) {
+        double directEnergy = 0;
+        int numThreads = threads.getNumThreads();
+        for (int i = 0; i < numThreads; i++)
+            directEnergy += threadEnergy[i];
+        *totalEnergy += directEnergy;
    }
-
-    if (totalEnergy != NULL)
-        *totalEnergy += (float) directEnergy;
 }

+void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex) {
+    // Compute this thread's subset of interactions.
+
+    int numThreads = threads.getNumThreads();
+    threadEnergy[threadIndex] = 0;
+    double* energyPtr = (includeEnergy ? &threadEnergy[threadIndex] : NULL);
+    float* forces = &(*threadForce)[threadIndex][0];
+    fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
+    fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
+    if (ewald || pme) {
+        // Compute the interactions from the neighbor list.
+
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
+
+        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.

-void CpuNonbondedForce::runThread(int index, vector<float>& threadForce, double& threadEnergy) {
-    while (true) {
-        // Wait for the signal to start running.
-        
-        pthread_mutex_lock(&lock);
-        waitCount++;
-        pthread_cond_signal(&endCondition);
-        pthread_cond_wait(&startCondition, &lock);
-        pthread_mutex_unlock(&lock);
-        if (isDeleted)
-            break;
-        
-        // Compute this thread's subset of interactions.
-        
-        threadEnergy = 0;
-        double* energyPtr = (includeEnergy ? &threadEnergy : NULL);
-        threadForce.resize(4*numberOfAtoms, 0.0f);
-        for (int i = 0; i < 4*numberOfAtoms; i++)
-            threadForce[i] = 0.0f;
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
        fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
-        if (ewald || pme) {
-            // Compute the interactions from the neighbor list.
-
-            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
-                calculateBlockEwaldIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
-            
-            // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
-
-            fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
-            fvec4 invBoxSize((1/periodicBoxSize[0]), (1/periodicBoxSize[1]), (1/periodicBoxSize[2]), 0);
-            for (int i = index; i < numberOfAtoms; i += numThreads)
-                for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
-                    if (*iter > i) {
-                        int j = *iter;
-                        fvec4 deltaR;
-                        fvec4 posI(posq+4*i);
-                        fvec4 posJ(posq+4*j);
-                        float r2;
-                        getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
-                        float r = sqrtf(r2);
-                        float inverseR = 1/r;
-                        float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
-                        float alphaR = alphaEwald*r;
-                        float erfcAlphaR = erfcApprox(alphaR)[0];
-                        float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
-                        dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
-                        fvec4 result = deltaR*dEdR;
-                        (fvec4(&threadForce[4*i])-result).store(&threadForce[4*i]);
-                        (fvec4(&threadForce[4*j])+result).store(&threadForce[4*j]);
-                        if (includeEnergy)
-                            threadEnergy -= chargeProd*inverseR*(1.0f-erfcAlphaR);
-                    }
+        for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
+            fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
+            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
+                if (*iter > i) {
+                    int j = *iter;
+                    fvec4 deltaR;
+                    fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
+                    float r2;
+                    getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
+                    float r = sqrtf(r2);
+                    float inverseR = 1/r;
+                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
+                    float alphaR = alphaEwald*r;
+                    float erfcAlphaR = erfcApprox(alphaR);
+                    float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
+                    dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
+                    fvec4 result = deltaR*dEdR;
+                    (fvec4(forces+4*i)-result).store(forces+4*i);
+                    (fvec4(forces+4*j)+result).store(forces+4*j);
+                    if (includeEnergy)
+                        threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
                }
+            }
        }
-        else if (cutoff) {
-            // Compute the interactions from the neighbor list.
-
-            for (int i = index; i < neighborList->getNumBlocks(); i += numThreads)
-                calculateBlockIxn(i, &threadForce[0], energyPtr, boxSize, invBoxSize);
+    }
+    else if (cutoff) {
+        // Compute the interactions from the neighbor list.
+
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
        }
-        else {
-            // Loop over all atom pairs
-
-            for (int i = index; i < numberOfAtoms; i += numThreads){
-                for (int j = i+1; j < numberOfAtoms; j++)
-                    if (exclusions[j].find(i) == exclusions[j].end())
-                        calculateOneIxn(i, j, &threadForce[0], energyPtr, boxSize, invBoxSize);
-            }
+    }
+    else {
+        // Loop over all atom pairs
+
+        while (true) {
+            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (i >= numberOfAtoms)
+                break;
+            for (int j = i+1; j < numberOfAtoms; j++)
+                if (exclusions[j].find(i) == exclusions[j].end())
+                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
        }
    }
 }
@@ -496,198 +449,6 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
  }

-void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Load the positions and parameters of the atoms in the block.
-    
-    int blockAtom[4];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForce[4];
-    for (int i = 0; i < 4; i++) {
-        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        blockAtomForce[i] = fvec4(0.0f);
-    }
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    float blockAtomR2[4];
-    bool include[4];
-    fvec4 blockAtomDelta[4];
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        fvec4 atomPosq(posq+4*atom);
-        
-        // Compute the distances to the block atoms.
-        
-        bool any = false;
-        for (int j = 0; j < 4; j++) {
-            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
-            include[j] = (((exclusions[i]>>j)&1) == 0 && (!cutoff || blockAtomR2[j] < cutoffDistance*cutoffDistance));
-            any |= include[j];
-        }
-        if (!any)
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 r2(blockAtomR2);
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 switchValue(1.0f), switchDeriv(0.0f);
-        if (useSwitch) {
-            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
-            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
-        }
-        fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-        fvec4 sig2 = inverseR*sig;
-        sig2 *= sig2;
-        fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
-        fvec4 dEdR = switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        if (cutoff)
-            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
-        else
-            dEdR += chargeProd*inverseR;
-        dEdR *= inverseR*inverseR;
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
-        if (useSwitch) {
-            dEdR -= energy*switchDeriv*inverseR;
-            energy *= switchValue;
-        }
-
-        // Accumulate energies.
-
-        if (totalEnergy) {
-            if (cutoff)
-                energy += chargeProd*(inverseR+krf*r2-crf);
-            else
-                energy += chargeProd*inverseR;
-            for (int j = 0; j < 4; j++)
-                if (include[j])
-                    *totalEnergy += energy[j];
-        }
-
-        // Accumulate forces.
-
-        fvec4 atomForce(forces+4*atom);
-        for (int j = 0; j < 4; j++) {
-            if (include[j]) {
-                fvec4 result = blockAtomDelta[j]*dEdR[j];
-                blockAtomForce[j] += result;
-                atomForce -= result;
-            }
-        }
-        atomForce.store(forces+4*atom);
-    }
-    
-    // Record the forces on the block atoms.
-    
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
-  }
-
-void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Load the positions and parameters of the atoms in the block.
-    
-    int blockAtom[4];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForce[4];
-    for (int i = 0; i < 4; i++) {
-        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        blockAtomForce[i] = fvec4(0.0f);
-    }
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    float blockAtomR2[4];
-    bool include[4];
-    fvec4 blockAtomDelta[4];
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        fvec4 atomPosq(posq+4*atom);
-        
-        // Compute the distances to the block atoms.
-        
-        bool any = false;
-        for (int j = 0; j < 4; j++) {
-            getDeltaR(atomPosq, blockAtomPosq[j], blockAtomDelta[j], blockAtomR2[j], periodic, boxSize, invBoxSize);
-            include[j] = (((exclusions[i]>>j)&1) == 0 && blockAtomR2[j] < cutoffDistance*cutoffDistance);
-            any |= include[j];
-        }
-        if (!any)
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 r2(blockAtomR2);
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 switchValue(1.0f), switchDeriv(0.0f);
-        if (useSwitch) {
-            fvec4 t = (r>switchingDistance) & ((r-switchingDistance)/(cutoffDistance-switchingDistance));
-            switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-            switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))/(cutoffDistance-switchingDistance);
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        fvec4 dEdR = chargeProd*inverseR*ewaldScaleFunction(r);
-        fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-        fvec4 sig2 = inverseR*sig;
-        sig2 *= sig2;
-        fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
-        dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
-        dEdR *= inverseR*inverseR;
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
-        if (useSwitch) {
-            dEdR -= energy*switchDeriv*inverseR;
-            energy *= switchValue;
-        }
-
-        // Accumulate energies.
-
-        if (totalEnergy) {
-            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            for (int j = 0; j < 4; j++)
-                if (include[j])
-                    *totalEnergy += energy[j];
-        }
-
-        // Accumulate forces.
-
-        fvec4 atomForce(forces+4*atom);
-        for (int j = 0; j < 4; j++) {
-            if (include[j]) {
-                fvec4 result = blockAtomDelta[j]*dEdR[j];
-                blockAtomForce[j] += result;
-                atomForce -= result;
-            }
-        }
-        atomForce.store(forces+4*atom);
-    }
-    
-    // Record the forces on the block atoms.
-    
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
-}
-
 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
    if (periodic) {
@@ -697,33 +458,15 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
    r2 = dot3(deltaR, deltaR);
 }

-fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
+float CpuNonbondedForce::erfcApprox(float x) {
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
    // error of 3e-7.

-    fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
    t *= t;
    t *= t;
    t *= t;
    return 1.0f/(t*t);
 }

-fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
-    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-
-    float y[4];
-    fvec4 x1 = x*ewaldDXInv;
-    ivec4 index = floor(x1);
-    fvec4 coeff[4];
-    coeff[1] = x1-index;
-    coeff[0] = 1.0f-coeff[1];
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    _MM_TRANSPOSE4_PS(coeff[0], coeff[1], coeff[2], coeff[3]);
-    for (int i = 0; i < 4; i++) {
-        if (index[i] < NUM_TABLE_POINTS)
-            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
-    }
-    return fvec4(y);
-}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuNonbondedForceVec4.h"
+
+using namespace std;
+using namespace OpenMM;
+
+/**
+ * Factory method to create a CpuNonbondedForceVec4.
+ */
+CpuNonbondedForce* createCpuNonbondedForceVec4() {
+    return new CpuNonbondedForceVec4();
+}
+
+/**---------------------------------------------------------------------------------------
+
+   CpuNonbondedForceVec4 constructor
+
+   --------------------------------------------------------------------------------------- */
+
+CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
+}
+
+void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[4];
+    fvec4 blockAtomPosq[4];
+    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    for (int i = 0; i < 4; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (cutoff)
+            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+        else
+            dEdR += chargeProd*inverseR;
+        dEdR *= inverseR*inverseR;
+
+        // Accumulate energies.
+
+        fvec4 one(1.0f);
+        if (totalEnergy) {
+            if (cutoff)
+                energy += chargeProd*(inverseR+krf*r2-crf);
+            else
+                energy += chargeProd*inverseR;
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot4(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec4 fx = dx*dEdR;
+        fvec4 fy = dy*dEdR;
+        fvec4 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot4(fx, one);
+        atomForce[1] -= dot4(fy, one);
+        atomForce[2] -= dot4(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+
+    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+    transpose(f[0], f[1], f[2], f[3]);
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+  }
+
+void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[4];
+    fvec4 blockAtomPosq[4];
+    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    for (int i = 0; i < 4; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
+        dEdR *= inverseR*inverseR;        
+
+        // Accumulate energies.
+
+        fvec4 one(1.0f);
+        if (totalEnergy) {
+            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot4(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec4 fx = dx*dEdR;
+        fvec4 fy = dy*dEdR;
+        fvec4 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot4(fx, one);
+        atomForce[1] -= dot4(fy, one);
+        atomForce[2] -= dot4(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+    
+    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+    transpose(f[0], f[1], f[2], f[3]);
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+}
+
+void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
+fvec4 CpuNonbondedForceVec4::erfcApprox(const fvec4& x) {
+    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+    // error of 3e-7.
+
+    fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    t *= t;
+    t *= t;
+    t *= t;
+    return 1.0f/(t*t);
+}
+
+fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
+    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+
+    fvec4 x1 = x*ewaldDXInv;
+    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&ewaldScaleTable[index[0]]);
+    fvec4 t2(&ewaldScaleTable[index[1]]);
+    fvec4 t3(&ewaldScaleTable[index[2]]);
+    fvec4 t4(&ewaldScaleTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuNonbondedForceVec8.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/internal/hardware.h"
+
+using namespace std;
+using namespace OpenMM;
+
+#ifndef __AVX__
+bool isVec8Supported() {
+    return false;
+}
+
+CpuNonbondedForce* createCpuNonbondedForceVec8() {
+    throw OpenMMException("Internal error: OpenMM was compiled without AVX support");
+}
+#else
+/**
+ * Check whether 8 component vectors are supported with the current CPU.
+ */
+bool isVec8Supported() {
+    // Make sure the CPU supports AVX.
+    
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+    if (cpuInfo[0] >= 1) {
+        cpuid(cpuInfo, 1);
+        return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
+    }
+    return false;
+}
+
+/**
+ * Factory method to create a CpuNonbondedForceVec8.
+ */
+CpuNonbondedForce* createCpuNonbondedForceVec8() {
+    return new CpuNonbondedForceVec8();
+}
+
+/**---------------------------------------------------------------------------------------
+
+   CpuNonbondedForceVec8 constructor
+
+   --------------------------------------------------------------------------------------- */
+
+CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
+}
+
+void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[8];
+    fvec4 blockAtomPosq[8];
+    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
+    for (int i = 0; i < 8; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
+    blockAtomCharge *= ONE_4PI_EPS0;
+    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
+    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec8 dx, dy, dz, r2;
+        getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec8 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec8 r = sqrt(r2);
+        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec8 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec8 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec8 sig6 = sig2*sig2*sig2;
+            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (cutoff)
+            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+        else
+            dEdR += chargeProd*inverseR;
+        dEdR *= inverseR*inverseR;
+
+        // Accumulate energies.
+
+        fvec8 one(1.0f);
+        if (totalEnergy) {
+            if (cutoff)
+                energy += chargeProd*(inverseR+krf*r2-crf);
+            else
+                energy += chargeProd*inverseR;
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot8(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec8 fx = dx*dEdR;
+        fvec8 fy = dy*dEdR;
+        fvec8 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot8(fx, one);
+        atomForce[1] -= dot8(fy, one);
+        atomForce[2] -= dot8(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+
+    fvec4 f[8];
+    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
+    for (int j = 0; j < 8; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+  }
+
+void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[8];
+    fvec4 blockAtomPosq[8];
+    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
+    for (int i = 0; i < 8; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
+    blockAtomCharge *= ONE_4PI_EPS0;
+    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
+    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec8 dx, dy, dz, r2;
+        getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec8 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec8 r = sqrt(r2);
+        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec8 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec8 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec8 sig6 = sig2*sig2*sig2;
+            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
+        dEdR *= inverseR*inverseR;        
+
+        // Accumulate energies.
+
+        fvec8 one(1.0f);
+        if (totalEnergy) {
+            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot8(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec8 fx = dx*dEdR;
+        fvec8 fy = dy*dEdR;
+        fvec8 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot8(fx, one);
+        atomForce[1] -= dot8(fy, one);
+        atomForce[2] -= dot8(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+    
+    fvec4 f[8];
+    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
+    for (int j = 0; j < 8; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+}
+
+void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
+fvec8 CpuNonbondedForceVec8::erfcApprox(const fvec8& x) {
+    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+    // error of 3e-7.
+
+    fvec8 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    t *= t;
+    t *= t;
+    t *= t;
+    return 1.0f/(t*t);
+}
+
+fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
+    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+
+    fvec8 x1 = x*ewaldDXInv;
+    ivec8 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec8 coeff2 = x1-index;
+    fvec8 coeff1 = 1.0f-coeff2;
+    ivec4 indexLower = index.lowerVec();
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&ewaldScaleTable[indexLower[0]]);
+    fvec4 t2(&ewaldScaleTable[indexLower[1]]);
+    fvec4 t3(&ewaldScaleTable[indexLower[2]]);
+    fvec4 t4(&ewaldScaleTable[indexLower[3]]);
+    fvec4 t5(&ewaldScaleTable[indexUpper[0]]);
+    fvec4 t6(&ewaldScaleTable[indexUpper[1]]);
+    fvec4 t7(&ewaldScaleTable[indexUpper[2]]);
+    fvec4 t8(&ewaldScaleTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
+    return coeff1*s1 + coeff2*s2;
+}
+#endif
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -32,9 +32,12 @@
 #include "CpuPlatform.h"
 #include "CpuKernelFactory.h"
 #include "CpuKernels.h"
+#include "CpuSETTLE.h"
+#include "ReferenceConstraints.h"
 #include "openmm/internal/hardware.h"

 using namespace OpenMM;
+using namespace std;

 extern "C" OPENMM_EXPORT_CPU void registerPlatforms() {
    // Only register this platform if the CPU supports SSE 4.1.
@@ -43,9 +46,16 @@ extern "C" OPENMM_EXPORT_CPU void registerPlatforms() {
        Platform::registerPlatform(new CpuPlatform());
 }

+map<ContextImpl*, CpuPlatform::PlatformData*> CpuPlatform::contextData;
+
 CpuPlatform::CpuPlatform() {
    CpuKernelFactory* factory = new CpuKernelFactory();
+    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
+    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
 }

 double CpuPlatform::getSpeed() const {
@@ -67,3 +77,33 @@ bool CpuPlatform::isProcessorSupported() {
    }
    return false;
 }
+
+void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
+    ReferencePlatform::contextCreated(context, properties);
+    PlatformData* data = new PlatformData(context.getSystem().getNumParticles());
+    contextData[&context] = data;
+    ReferenceConstraints& constraints = *(ReferenceConstraints*) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->constraints;
+    if (constraints.settle != NULL) {
+        CpuSETTLE* parallelSettle = new CpuSETTLE(context.getSystem(), *(ReferenceSETTLEAlgorithm*) constraints.settle, data->threads);
+        delete constraints.settle;
+        constraints.settle = parallelSettle;
+    }
+}
+
+void CpuPlatform::contextDestroyed(ContextImpl& context) const {
+    PlatformData* data = contextData[&context];
+    delete data;
+    contextData.erase(&context);
+}
+
+CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
+    return *contextData[&context];
+}
+
+CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
+    int numThreads = threads.getNumThreads();
+    threadForce.resize(numThreads);
+    for (int i = 0; i < numThreads; i++)
+        threadForce[i].resize(4*numParticles);
+    isPeriodic = false;
+}
--- a/platforms/cpu/src/CpuRandom.cpp
+++ b/platforms/cpu/src/CpuRandom.cpp
+/* Portions copyright (c) 2013 Stanford University and Simbios.
+ * Authors: Peter Eastman
+ * Contributors: 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuRandom.h"
+#include "openmm/OpenMMException.h"
+#include <cmath>
+
+using namespace std;
+using namespace OpenMM;
+
+CpuRandom::CpuRandom() : hasInitialized(false) {
+}
+
+CpuRandom::~CpuRandom() {
+    for (int i = 0; i < threadRandom.size(); i++)
+        delete threadRandom[i];
+}
+
+void CpuRandom::initialize(int seed, int numThreads) {
+    if (hasInitialized) {
+        if (seed == randomSeed)
+            return; // Already initialized with the same seed.
+        throw OpenMMException("Random number generator initialized twice with different seeds");
+    }
+    randomSeed = seed;
+    hasInitialized = true;
+    threadRandom.resize(numThreads);
+    nextGaussian.resize(numThreads);
+    nextGaussianIsValid.resize(numThreads, false);
+
+    // Use a quick and dirty RNG to pick seeds for the real random number generator.
+
+    unsigned int r = (unsigned int) seed;
+    for (int i = 0; i < numThreads; i++) {
+        r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+        threadRandom[i] = new OpenMM_SFMT::SFMT();
+        init_gen_rand(r, *threadRandom[i]);
+    }
+}
+
+float CpuRandom::getGaussianRandom(int threadIndex) {
+    if (nextGaussianIsValid[threadIndex]) {
+        nextGaussianIsValid[threadIndex] = false;
+        return nextGaussian[threadIndex];
+    }
+    
+    // Use the polar form of the Box-Muller transformation to generate two Gaussian random numbers.
+    
+    float x, y, r2;
+    do {
+        x = 2.0f*(float) genrand_real2(*threadRandom[threadIndex])-1.0f;
+        y = 2.0f*(float) genrand_real2(*threadRandom[threadIndex])-1.0f;
+        r2 = x*x + y*y;
+    } while (r2 >= 1.0f || r2 == 0.0f);
+    float multiplier = sqrtf((-2.0f*logf(r2))/r2);
+    nextGaussian[threadIndex] = y*multiplier;
+    nextGaussianIsValid[threadIndex] = true;
+    return x*multiplier;
+}
+
+float CpuRandom::getUniformRandom(int threadIndex) {
+    return genrand_real2(*threadRandom[threadIndex]);
+}
--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CpuSETTLE.h"
+
+using namespace OpenMM;
+using namespace std;
+
+class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
+public:
+    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
+            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
+            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+    }
+    vector<OpenMM::RealVec>& atomCoordinates;
+    vector<OpenMM::RealVec>& atomCoordinatesP;
+    vector<RealOpenMM>& inverseMasses;
+    RealOpenMM tolerance;
+    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+};
+
+class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
+public:
+    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
+            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
+            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+    }
+    vector<OpenMM::RealVec>& atomCoordinates;
+    vector<OpenMM::RealVec>& velocities;
+    vector<RealOpenMM>& inverseMasses;
+    RealOpenMM tolerance;
+    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
+};
+
+CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
+    int numThreads = threads.getNumThreads();
+    int numClusters = settle.getNumClusters();
+    vector<RealOpenMM> mass(system.getNumParticles());
+    for (int i = 0; i < system.getNumParticles(); i++)
+        mass[i] = system.getParticleMass(i);
+    for (int i = 0; i < numThreads; i++) {
+        int start = i*numClusters/numThreads;
+        int end = (i+1)*numClusters/numThreads;
+        if (start != end) {
+            int numThreadClusters = end-start;
+            vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
+            vector<RealOpenMM> distance1(numThreadClusters), distance2(numThreadClusters);
+            for (int j = 0; j < numThreadClusters; j++)
+                settle.getClusterParameters(start+j, atom1[j], atom2[j], atom3[j], distance1[j], distance2[j]);
+            threadSettle.push_back(new ReferenceSETTLEAlgorithm(atom1, atom2, atom3, distance1, distance2, mass));
+        }
+    }
+}
+
+CpuSETTLE::~CpuSETTLE() {
+    for (int i = 0; i < (int) threadSettle.size(); i++)
+        delete threadSettle[i];
+}
+
+void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
+    threads.execute(task);
+    threads.waitForThreads();
+}
+
+void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
+    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
+    threads.execute(task);
+    threads.waitForThreads();
+}
--- a/platforms/cpu/src/gmx_atomic.h
+++ b/platforms/cpu/src/gmx_atomic.h
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- 
+*
+* Copyright (c) 2004-2008, Erik Lindahl <lindahl@cbr.su.se>
+*
+*  Unfortunately, some of the constructs in this file are _very_ sensitive
+*  to compiler optimizations and architecture changes. If you find any such
+*  errors, please send a message to lindahl@cbr.su.se to help us fix the
+*  upstream version too.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+* And Hey:
+* Gnomes, ROck Monsters And Chili Sauce
+*/
+#ifndef _GMX_ATOMIC_H_
+#define _GMX_ATOMIC_H_
+
+/*! \file gmx_atomic.h
+ *
+ *  @brief Atomic operations for fast SMP synchronization
+ *
+ *  This file defines atomic integer operations and spinlocks for 
+ *  fast synchronization in performance-critical regions of gromacs.
+ *
+ *  In general, the best option is to use functions without explicit 
+ *  locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
+ *
+ *  Not all architecture support atomic operations though inline assembly,
+ *  and even if they do it might not be implemented here. In that case
+ *  we use a fallback mutex implementation, so you can always count on
+ *  the function interfaces working in Gromacs.
+ *
+ *  Don't use spinlocks in non-performance-critical regions like file I/O.
+ *  Since they always spin busy they would waste CPU cycles instead of 
+ *  properly yielding to a computation thread while waiting for the disk.
+ *
+ *  Finally, note that all our spinlock operations are defined to return
+ *  0 if initialization or locking completes successfully.
+ *  This is the opposite of some other implementations, but the same standard
+ *  as used for pthread mutexes. So, if e.g. are trying to lock a spinlock,
+ *  you will have gotten the lock if the return value is 0.
+ * 
+ *  gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
+ *  and 0 if it is available, though...
+ */
+
+
+
+#include <stdio.h>
+
+#include <pthread.h>
+
+#ifdef __cplusplus
+extern "C" 
+{  
+#endif
+#if 0
+} /* Avoids screwing up auto-indentation */
+#endif
+
+
+
+
+#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
+        (defined(i386) || defined(__x86_64__)) )                                      \
+      || defined (DOXYGEN) )
+
+/* This code is executed for x86 and x86-64, with these compilers:
+ * GNU
+ * Intel 
+ * Pathscale
+ * All these support GCC-style inline assembly. 
+ * We also use this section for the documentation.
+ */
+
+/*! \brief Memory barrier operation
+ *
+ *  Modern CPUs rely heavily on out-of-order execution, and one common feature
+ *  is that load/stores might be reordered. Also, when using inline assembly
+ *  the compiler might already have loaded the variable we are changing into
+ *  a register, so any update to memory won't be visible.
+ *
+ *  This command creates a memory barrier, i.e. all memory results before
+ *  it in the code should be visible to all memory operations after it - the
+ *  CPU cannot propagate load/stores across it.
+ */
+#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
+
+/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
+#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
+#define __builtin_constant_p(i) (1)
+#endif
+
+
+/*! \brief Gromacs atomic operations datatype
+ *
+ *  Portable synchronization primitives like mutexes are effective for
+ *  many purposes, but usually not very high performance.
+ *  One of the problem is that you have the overhead of a function call,
+ *  and another is that Mutexes often have extra overhead to make the
+ *  scheduling fair. Finally, if performance is important we don't want
+ *  to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
+ *  CPU usage until the resources is available (e.g. increment a counter).
+ *
+ *  These things can often be implemented with inline-assembly or other
+ *  system-dependent functions, and we provide such functionality for the
+ *  most common platforms. For portability we also have a fallback 
+ *  implementation using a mutex for locking.
+ *
+ *  Performance-wise, the fastest solution is always to avoid locking 
+ *  completely (obvious, but remember it!). If you cannot do that, the
+ *  next best thing is to use atomic operations that e.g. increment a
+ *  counter without explicit locking. Spinlocks are useful to lock an
+ *  entire region, but leads to more overhead and can be difficult to
+ *  debug - it is up to you to make sure that only the thread owning the
+ *  lock unlocks it!
+ *
+ *  You should normally NOT use atomic operations for things like 
+ *  I/O threads. These should yield to other threads while waiting for 
+ *  the disk instead of spinning at 100% CPU usage.
+ *
+ *  It is imperative that you use the provided routines for reading
+ *  and writing, since some implementations require memory barriers before
+ *  the CPU or memory sees an updated result. The structure contents is
+ *  only visible here so it can be inlined for performance - it might
+ *  change without further notice.
+ *
+ *  \note No initialization is required for atomic variables.
+ *
+ *  Currently, we have (real) atomic operations for:
+ *
+ *  - x86 or x86_64, using GNU compilers
+ *  - x86 or x86_64, using Intel compilers 
+ *  - x86 or x86_64, using Pathscale compilers
+ *  - Itanium, using GNU compilers 
+ *  - Itanium, using Intel compilers
+ *  - Itanium, using HP compilers
+ *  - PowerPC, using GNU compilers 
+ *  - PowerPC, using IBM AIX compilers 
+ *  - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
+ */
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+
+/*! \brief Gromacs spinlock
+ *
+ *  Spinlocks provide a faster synchronization than mutexes,
+ *  although they consume CPU-cycles while waiting. They are implemented
+ *  with atomic operations and inline assembly whenever possible, and
+ *  otherwise we use a fallback implementation where a spinlock is identical
+ *  to a mutex (this is one of the reasons why you have to initialize them).
+ *
+ *  There are no guarantees whatsoever about fair scheduling or
+ *  debugging if you make a mistake and unlock a variable somebody
+ *  else has locked - performance is the primary goal of spinlocks.
+ *
+ */
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+
+
+
+/*! \brief Spinlock static initializer
+ *
+ *  This is used for static spinlock initialization, and has the same
+ *  properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
+ *  This is only for inlining in the gmx_thread.h header file. Whether
+ *  it is 0, 1, or something else when unlocked depends on the platform.
+ *  Don't assume anything about it. It might even be a mutex when using the
+ * fallback implementation!
+ */
+#define GMX_SPINLOCK_INITIALIZER   { 1 }
+
+
+
+/*! \brief Return value of an atomic integer 
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param  a   Atomic variable to read
+ *  \return     Integer value of the atomic variable
+ */
+#define gmx_atomic_read(a)  ((a)->value) 
+
+ 
+/*! \brief Write value to an atomic integer 
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param  a   Atomic variable
+ *  \param  i   Integer to set the atomic variable to.
+ */
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+ 
+/*! \brief Add integer to atomic variable
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param a   atomic datatype to modify
+ *  \param i   integer to increment with. Use i<0 to subtract atomically.
+ *
+ *  \return The new value (after summation).
+ */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *     a, 
+                      volatile int       i)
+{
+    int __i;
+    
+    __i = i;
+    __asm__ __volatile__("lock ; xaddl %0, %1;"
+                         :"=r"(i) :"m"(a->value), "0"(i));
+    return i + __i;
+}  
+  
+
+/*! \brief Add to variable, return the old value.
+ *
+ *  This operation is quite useful for synchronization counters.
+ *  By performing a fetchadd with N, a thread can e.g. reserve a chunk 
+ *  with the next N iterations, and the return value is the index
+ *  of the first element to treat.
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param a   atomic datatype to modify
+ *  \param i   integer to increment with. Use i<0 to subtract atomically.
+ *
+ *  \return    The value of the atomic variable before addition.
+ */
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     volatile int       i)
+{
+    int __i;
+
+    __i = i;
+    __asm__ __volatile__("lock ; xaddl %0, %1;"
+                         :"=r"(i) :"m"(a->value), "0"(i));
+    return i;
+}
+
+
+/*! \brief Atomic compare-exchange operation
+ *
+ *   The \a old value is compared with the memory value in the atomic datatype.
+ *   If the are identical, the atomic type is updated to the new value, 
+ *   and otherwise left unchanged. 
+ *  
+ *   This is a very useful synchronization primitive: You can start by reading
+ *   a value (without locking anything), perform some calculations, and then
+ *   atomically try to update it in memory unless it has changed. If it has
+ *   changed you will get an error return code - reread the new value
+ *   an repeat the calculations in that case.
+ *
+ *   \param a        Atomic datatype ('memory' value)
+ *   \param oldval   Integer value read from the atomic type at an earlier point
+ *   \param newval   New value to write to the atomic type if it currently is
+ *                   identical to the old value.
+ *
+ *   \return The value of the atomic memory variable in memory when this 
+ *           instruction was executed. This, if the operation succeeded the
+ *           return value was identical to the \a old parameter, and if not
+ *           it returns the updated value in memory so you can repeat your
+ *           operations on it. 
+ *
+ *   \note   The exchange occured if the return value is identical to \a old.
+ */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *    a, 
+                   int               oldval,
+                   int               newval)
+{
+    volatile unsigned long prev;
+    
+    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
+                         : "=a"(prev)
+                         : "q"(newval), "m"(a->value), "0"(oldval)
+                         : "memory");
+    
+    return prev;
+}
+
+
+/*! \brief Initialize spinlock
+ *
+ *  In theory you can call this from multiple threads, but remember
+ *  that we don't check for errors. If the first thread proceeded to
+ *  lock the spinlock after initialization, the second will happily
+ *  overwrite the contents and unlock it without warning you.
+ *
+ *  \param x      Gromacs spinlock pointer.
+ */
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *   x)
+{
+    x->lock = 1;
+}
+
+
+
+/*! \brief Acquire spinlock
+ *
+ *  This routine blocks until the spinlock is available, and
+ *  the locks it again before returning.
+ *
+ *  \param x     Gromacs spinlock pointer
+ */
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+	__asm__ __volatile__("\n1:\t" 
+						 "lock ; decb %0\n\t" 
+						 "jns 3f\n" 
+						 "2:\t" 
+						 "rep;nop\n\t" 
+						 "cmpb $0,%0\n\t" 
+						 "jle 2b\n\t" 
+						 "jmp 1b\n" 
+						 "3:\n\t" 
+						 :"=m" (x->lock) : : "memory"); 
+}
+
+
+/*! \brief Attempt to acquire spinlock
+ *
+ * This routine acquires the spinlock if possible, but if 
+ * already locked it return an error code immediately.
+ *
+ *  \param x     Gromacs spinlock pointer
+ *
+ * \return 0 if the mutex was available so we could lock it,
+ *         otherwise a non-zero integer (1) if the lock is busy.
+ */
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+	char old_value;
+	
+    __asm__ __volatile__("xchgb %b0,%1"
+                         :"=q" (old_value), "=m" (x->lock)
+						 :"0" (0) : "memory");
+    return (old_value <= 0);
+}
+
+
+/*! \brief Release spinlock
+ *
+ *  \param x     Gromacs spinlock pointer
+ *
+ *  Unlocks the spinlock, regardless if which thread locked it.
+ */
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+	char old_value = 1;
+	
+	__asm__ __volatile__(
+                         "xchgb %b0, %1" 
+                         :"=q" (old_value), "=m" (x->lock) 
+                         :"0" (old_value) : "memory"
+                         );
+}
+ 
+
+/*! \brief Check if spinlock is locked
+ *
+ *  This routine returns immediately with the lock status.
+ *
+ *  \param x  Gromacs spinlock pointer
+ *
+ *  \return 1 if the spinlock is locked, 0 otherwise.
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *  x)
+{
+    return (*(volatile signed char *)(&(x)->lock) <= 0);
+}
+
+
+/*! \brief Wait for a spinlock to become available
+ *
+ *  This routine blocks until the spinlock is unlocked, 
+ *  but in contrast to gmx_spinlock_lock() it returns without 
+ *  trying to lock the spinlock.
+ *
+ *  \param x  Gromacs spinlock pointer
+ */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    do 
+    {
+        gmx_atomic_memory_barrier(); 
+    } 
+    while(gmx_spinlock_islocked(x));
+}
+
+
+#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))
+/* PowerPC using proper GCC inline assembly. 
+ * Recent versions of xlC (>=7.0) _partially_ support this, but since it is
+ * not 100% compatible we provide a separate implementation for xlC in
+ * the next section.
+ */
+
+/* Compiler-dependent stuff: GCC memory barrier */
+#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
+
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *    a, 
+                      int               i)
+{
+    int t;
+    
+	__asm__ __volatile__("1:     lwarx   %0,0,%2\n"
+                         "\tadd     %0,%1,%0\n"
+                         "\tstwcx.  %0,0,%2 \n"
+                         "\tbne-    1b"
+                         "\tisync\n"
+                         : "=&r" (t)
+						 : "r" (i), "r" (&a->value)
+						 : "cc" , "memory");
+    return t;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int t;
+    
+    __asm__ __volatile__("\teieio\n"
+                         "1:     lwarx   %0,0,%2\n"                         
+                         "\tadd     %0,%1,%0\n"
+                         "\tstwcx.  %0,0,%2 \n"
+                         "\tbne-    1b\n"
+                         "\tisync\n"
+                         : "=&r" (t)
+                         : "r" (i), "r" (&a->value)
+                         : "cc", "memory");
+    
+    return (t - i);    
+}
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *       a,
+                   int                  oldval,
+                   int                  newval)
+{
+    int prev;
+    
+    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
+                          "\tcmpw    0,%0,%3 \n"
+                          "\tbne     2f \n"
+                          "\tstwcx.  %4,0,%2 \n"
+                          "bne-    1b\n"
+                          "\tsync\n"
+                          "2:\n"
+                          : "=&r" (prev), "=m" (a->value)
+                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value)
+                          : "cc", "memory");
+    
+    return prev;
+}
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+    unsigned int tmp;
+    
+    __asm__ __volatile__("\tb      1f\n"
+                         "2:      lwzx    %0,0,%1\n"
+                         "\tcmpwi   0,%0,0\n"
+                         "\tbne+    2b\n"
+                         "1:      lwarx   %0,0,%1\n"
+                         "\tcmpwi   0,%0,0\n"
+                         "\tbne-    2b\n"
+                         "\tstwcx.  %2,0,%1\n"
+                         "bne-    2b\n"
+                         "\tisync\n"
+                         : "=&r"(tmp)
+                         : "r"(&x->lock), "r"(1)
+                         : "cr0", "memory");
+}
+
+
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+    unsigned int old, t;
+    unsigned int mask = 1;
+    volatile unsigned int *p = &x->lock;
+    
+    __asm__ __volatile__("\teieio\n"
+                         "1:      lwarx   %0,0,%4 \n"
+                         "\tor      %1,%0,%3 \n"
+                         "\tstwcx.  %1,0,%4 \n"
+                         "\tbne     1b\n"
+                         "\tsync\n"
+                         : "=&r" (old), "=&r" (t), "=m" (*p)
+                         : "r" (mask), "r" (p), "m" (*p)
+                         : "cc", "memory");
+    
+    return ((old & mask) != 0);    
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+    __asm__ __volatile__("\teieio\n": : :"memory");
+    x->lock = 0;
+}
+
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *x)
+{
+    do 
+    {
+        gmx_atomic_memory_barrier(); 
+    }
+    while(gmx_spinlock_islocked(x));
+}
+
+
+
+#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM))  && \
+        (defined(__powerpc__) || defined(__ppc__)))
+/* PowerPC using xlC inline assembly. 
+ * Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
+ * if you use the option -qasm=gcc but we have had to hack things a bit, in 
+ * particular when it comes to clobbered variables. Since this implementation
+ * _could_ be buggy, we have separated it from the known-to-be-working gcc
+ * one above.
+ */
+
+/* memory barrier - no idea how to create one with xlc! */
+#define gmx_atomic_memory_barrier()
+
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *    a, 
+                      int               i)
+{
+    int t;
+    
+	__asm__ __volatile__("1:     lwarx   %0,0,%2 \n"
+                         "\t add     %0,%1,%0 \n"
+                         "\t stwcx.  %0,0,%2 \n"
+                         "\t bne-    1b \n"
+                         "\t isync \n"
+                         : "=&r" (t)
+						 : "r" (i), "r" (&a->value) );
+    return t;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int t;
+    
+    __asm__ __volatile__("\t eieio\n"
+                         "1:     lwarx   %0,0,%2 \n"                         
+                         "\t add     %0,%1,%0 \n"
+                         "\t stwcx.  %0,0,%2 \n"
+                         "\t bne-    1b \n"
+                         "\t isync \n"
+                         : "=&r" (t)
+                         : "r" (i), "r" (&a->value));
+    
+    return (t - i);    
+}
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *       a,
+                   int                  oldval,
+                   int                  newval)
+{
+    int prev;
+    
+    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
+                          "\t cmpw    0,%0,%3 \n"
+                          "\t bne     2f \n"
+                          "\t stwcx.  %4,0,%2 \n"
+                          "\t bne-    1b \n"
+                          "\t sync \n"
+                          "2: \n"
+                          : "=&r" (prev), "=m" (a->value)
+                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value));
+    
+    return prev;
+}
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+    unsigned int tmp;
+    
+    __asm__ __volatile__("\t b      1f \n"
+                         "2:      lwzx    %0,0,%1 \n"
+                         "\t cmpwi   0,%0,0 \n"
+                         "\t bne+    2b \n"
+                         "1:      lwarx   %0,0,%1 \n"
+                         "\t cmpwi   0,%0,0 \n"
+                         "\t bne-    2b \n"
+                         "\t stwcx.  %2,0,%1 \n"
+                         "\t bne-    2b \n"
+                         "\t isync\n"
+                         : "=&r"(tmp)
+                         : "r"(&x->lock), "r"(1));
+}
+
+
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+    unsigned int old, t;
+    unsigned int mask = 1;
+    volatile unsigned int *p = &x->lock;
+    
+    __asm__ __volatile__("\t eieio\n"
+                         "1:      lwarx   %0,0,%4 \n"
+                         "\t or      %1,%0,%3 \n"
+                         "\t stwcx.  %1,0,%4 \n"
+                         "\t bne     1b \n"
+                         "\t sync \n"
+                         : "=&r" (old), "=&r" (t), "=m" (*p)
+                         : "r" (mask), "r" (p), "m" (*p));
+    
+    return ((old & mask) != 0);    
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+    __asm__ __volatile__("\t eieio \n");
+    x->lock = 0;
+}
+
+
+static inline void
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    
+    do 
+    {
+        gmx_atomic_memory_barrier();
+    }
+    while(gmx_spinlock_islocked(x));
+}
+
+
+
+
+#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))
+/* ia64 with GCC or Intel compilers. Since we need to define everything through
+* cmpxchg and fetchadd on ia64, we merge the different compilers and only provide 
+* different implementations for that single function. 
+* Documentation? Check the gcc/x86 section.
+*/
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+
+/* Compiler thingies */
+#ifdef __INTEL_COMPILER
+void __memory_barrier(void);
+int _InterlockedCompareExchange(volatile int *dest, int xchg, int comp);
+unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
+/* ia64 memory barrier */
+#  define gmx_atomic_memory_barrier() __memory_barrier()
+/* ia64 cmpxchg */
+#  define gmx_atomic_cmpxchg(a, oldval, newval) _InterlockedCompareExchange(&a->value,newval,oldval)
+/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
+#  define gmx_ia64_fetchadd(a, inc)  __fetchadd4_rel(a, inc)
+
+#elif defined __GNUC__  
+/* ia64 memory barrier */
+#  define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
+/* ia64 cmpxchg */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *   a,
+                   int              oldval,
+                   int              newval)
+{
+    volatile int res;
+    asm volatile ("mov ar.ccv=%0;;" :: "rO"(oldval));
+    asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv":                    
+                  "=r"(res) : "r"(&a->value), "r"(newval) : "memory"); 
+                          
+    return res;
+}
+
+
+/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16 */
+#define gmx_ia64_fetchadd(a, inc)                                             \
+({  unsigned long res;                                                        \
+    asm volatile ("fetchadd4.rel %0=[%1],%2"                                  \
+                  : "=r"(res) : "r"(a), "i" (inc) : "memory");                \
+                  res;                                                        \
+})
+
+
+#else /* Unknown compiler */
+#  error Unknown ia64 compiler (not GCC or ICC) - modify gmx_thread.h!
+#endif
+
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *       a, 
+                      volatile int         i)
+{
+    volatile int oldval,newval;    
+    volatile int __i = i;
+
+    /* Use fetchadd if, and only if, the increment value can be determined
+     * at compile time (otherwise this check is optimized away) and it is
+     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
+     */                         
+    if (__builtin_constant_p(i) &&
+        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
+    {
+        oldval = gmx_ia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return newval;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     volatile int       i)
+{
+    volatile int oldval,newval;    
+    volatile int __i = i;
+    
+    /* Use ia64 fetchadd if, and only if, the increment value can be determined
+     * at compile time (otherwise this check is optimized away) and it is
+     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
+     */                         
+    if (__builtin_constant_p(i) &&
+        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
+    {
+        oldval = gmx_ia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return oldval;
+}
+
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *   x)
+{
+    gmx_atomic_t *a = (gmx_atomic_t *) x;
+    unsigned long value;                                                 
+    value = gmx_atomic_cmpxchg(a, 0, 1);                             
+    if (value)                                                           
+    {                                                                    
+        do                                                               
+        {                                                                
+            while (a->value != 0)                                                 
+            {                                                            
+                gmx_atomic_memory_barrier();                             
+            }                                                            
+            value = gmx_atomic_cmpxchg(a, 0, 1);                       
+        }                                                                
+        while (value);                                                   
+    }                                                                    
+} 
+
+
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *   x)
+{
+    return (gmx_atomic_cmpxchg((gmx_atomic_t *)x, 0, 1) != 0);
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    do
+    {
+        gmx_atomic_memory_barrier(); 
+        x->lock = 0;
+    } 
+    while (0);
+}
+
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return (x->lock != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    
+    do 
+    {
+        gmx_atomic_memory_barrier();
+    }
+    while(gmx_spinlock_islocked(x));
+}
+
+
+#undef gmx_ia64_fetchadd
+
+
+
+#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
+/* HP compiler on ia64 */
+#include <machine/sys/inline.h>
+
+#define gmx_atomic_memory_barrier() _Asm_mf()
+
+#define gmx_hpia64_fetchadd(a, i)                           \
+    _Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL,    \
+                  (UInt32*)a,(unsigned int) i,              \
+                  (_Asm_ldhint)LDHINT_NONE)
+ 
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *   a,
+                   int              oldval,
+                   int              newval)
+{
+    int ret;
+    
+    _Asm_mov_to_ar((_Asm_app_reg)_AREG_CCV,(Uint32)oldval,                  
+                   (_Asm_fence)(_UP_CALL_FENCE | _UP_SYS_FENCE |         
+                                _DOWN_CALL_FENCE | _DOWN_SYS_FENCE));
+                   
+    ret = _Asm_cmpxchg((_Asm_sz)SZ_W,(_Asm_sem)_SEM_ACQ,(Uint32*)a,    
+                       (Uint32)newval,(_Asm_ldhint)_LDHINT_NONE);
+                   
+    return ret;
+}
+
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+static inline void 
+gmx_atomic_add_return(gmx_atomic_t *       a, 
+                      int                  i)
+{
+    int old,new;    
+    int __i = i;
+    
+    /* On HP-UX we don't know any macro to determine whether the increment
+     * is known at compile time, but hopefully the call uses something simple
+     * like a constant, and then the optimizer should be able to do the job.
+     */                         
+    if (  (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
+    {
+        oldval = gmx_hpia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return newval;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int oldval,newval;    
+    int __i = i;
+    
+    /* On HP-UX we don't know any macro to determine whether the increment
+     * is known at compile time, but hopefully the call uses something simple
+     * like a constant, and then the optimizer should be able to do the job.
+     */                         
+    if (  (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
+    {
+        oldval = gmx_hpia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return oldval;
+}
+
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+
+
+
+static inline void
+gmx_spinlock_trylock(gmx_spinlock_t *x)
+{
+    int rc;
+
+    rc = _Asm_xchg((_Asm_sz)_SZ_W, (unsigned int *)x, 1        
+                    (_Asm_ldhit)_LDHINT_NONE);
+    
+    return ( (rc>0) ? 1 : 0);
+}
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *x)
+{
+    int      status = 1;
+    
+    do
+    {
+        if( *((unsigned int *)x->lock) == 0 ) 
+        {
+            status = gmx_spinlock_trylock(x);
+        }
+    } while( status != 0);
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    _Asm_fetchadd((_Asm_fasz)_SZ_W,(_Asm_sem)_SEM_REL,                  
+                  (unsigned int *)x,-1,(_Asm_ldhint)_LDHINT_NONE);
+}
+
+
+
+static inline void
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0 );
+}
+
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    do
+    {
+        gmx_atomic_memory_barrier(); 
+    } 
+    while(gmx_spinlock_islocked(x));
+}
+
+
+#undef gmx_hpia64_fetchadd
+
+
+
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
+/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
+
+#include <windows.h>
+
+#define gmx_atomic_memory_barrier()
+
+
+typedef struct gmx_atomic
+{
+	LONG volatile	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    LONG volatile      lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+
+
+#define gmx_atomic_read(a)  ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+
+
+#define gmx_atomic_fetch_add(a, i)  \
+    InterlockedExchangeAdd((LONG volatile *)a, (LONG) i)
+
+#define gmx_atomic_add_return(a, i)  \
+    ( i + InterlockedExchangeAdd((LONG volatile *)a, (LONG) i) )
+
+#define gmx_atomic_cmpxchg(a, oldval, newval) \
+    InterlockedCompareExchange((LONG volatile *)a, (LONG) newval, (LONG) oldval)
+
+
+# define gmx_spinlock_lock(x)   \
+    while((InterlockedCompareExchange((LONG volatile *)&x, 1, 0))!=0)
+
+
+#define gmx_spinlock_trylock(x)   \
+    InterlockedCompareExchange((LONG volatile *)&x, 1, 0)
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    x->lock = 0;
+}
+
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return (*(volatile signed char *)(&(x)->lock) != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    while(gmx_spinlock_islocked(x))
+    {
+        Sleep(0);
+    }
+}
+
+
+
+#elif defined(__xlC__) && defined (_AIX)
+/* IBM xlC compiler on AIX */
+#include <sys/atomic_op.h>
+
+
+#define gmx_atomic_memory_barrier()
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int      lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *    a,
+                   int               oldval,
+                   int               newval)
+{
+    int t;
+    
+    if(__check_lock((atomic_p)&a->value, oldval, newval))
+    {
+        /* Not successful - value had changed in memory. Reload value. */
+        t = a->value;
+    }
+    else
+    {
+        /* replacement suceeded */
+        t = oldval;
+    }
+    return t;        
+}
+
+
+static inline void 
+gmx_atomic_add_return(gmx_atomic_t *       a, 
+                      int                  i)
+{
+    int oldval,newval;    
+    
+    do
+    {
+        oldval = gmx_atomic_read(a);
+        newval = oldval + i;
+    }
+    while(__check_lock((atomic_p)&a->value, oldval, newval));
+
+    return newval;
+}
+
+
+
+static inline void 
+gmx_atomic_fetch_add(gmx_atomic_t *       a, 
+                     int                  i)
+{
+    int oldval,newval;    
+    
+    do
+    {
+        oldval = gmx_atomic_read(a);
+        newval = oldval + i;
+    }
+    while(__check_lock((atomic_p)&a->value, oldval, newval));
+    
+    return oldval;
+}
+
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *   x)
+{
+    __clear_lock((atomic_p)x,0);
+}
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *   x)
+{
+    do
+    {
+        ;
+    }
+    while(__check_lock((atomic_p)x, 0, 1));
+}
+
+
+static inline void
+gmx_spinlock_trylock(gmx_spinlock_t *   x)
+{
+    /* Return 0 if we got the lock */
+    return (__check_lock((atomic_p)x, 0, 1) != 0)
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    __clear_lock((atomic_p)x,0);
+}
+
+
+static inline void
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return (*((atomic_p)x) != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *    x)
+{
+    while(gmx_spinlock_islocked(x)) { ; } 
+}
+
+
+#else
+/* No atomic operations, use mutex fallback. Documentation is in x86 section */
+
+
+#define gmx_atomic_memory_barrier()
+
+/* System mutex used for locking to guarantee atomicity */
+static pthread_mutex_t
+gmx_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+
+typedef struct gmx_atomic
+{
+	int	   value;
+}
+gmx_atomic_t;
+
+#define gmx_spinlock_t     pthread_mutex_t
+
+ 
+#  define GMX_SPINLOCK_INITIALIZER   PTHREAD_MUTEX_INITIALIZER
+
+/* Since mutexes guarantee memory barriers this works fine */
+#define gmx_atomic_read(a)   ((a)->value)
+
+
+static inline void
+gmx_atomic_set(gmx_atomic_t *   a, 
+               int              i)
+{
+    /* Mutexes here are necessary to guarantee memory visibility */
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    a->value = i;
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+}
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *   a, 
+                      int              i)
+{
+    int t;
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    t = a->value + i;
+    a->value = t;
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+    return t;
+}
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *   a,
+                     int              i)
+{
+    int old_value;
+    
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    old_value  = a->value;
+    a->value   = old_value + i;
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+    return old_value;
+}
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *           a, 
+                   int                      oldv,
+                   int                      newv)
+{
+    int t;
+    
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    t = a->value;
+    if (t == oldv)
+    {
+        a->value = newv;
+    }
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+    return t;
+}
+
+
+#define gmx_spinlock_init(lock)       pthread_mutex_init(lock)
+#define gmx_spinlock_lock(lock)       pthread_mutex_lock(lock)
+#define gmx_spinlock_trylock(lock)    pthread_mutex_trylock(lock)
+#define gmx_spinlock_unlock(lock)     pthread_mutex_unlock(lock)
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    int rc;
+    
+    if(gmx_spinlock_trylock(x) != 0)
+    {
+        /* It was locked */
+        return 1;
+    }
+    else
+    {
+        /* We just locked it */
+        gmx_spinlock_unlock(x);
+        return 0;
+    }
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    int rc;
+    
+    gmx_spinlock_lock(x);
+    /* Got the lock now, so the waiting is over */
+    gmx_spinlock_unlock(x);
+}
+
+
+#endif
+
+
+
+
+/*! \brief Spinlock-based barrier type
+ *
+ *  This barrier has the same functionality as the standard
+ *  gmx_thread_barrier_t, but since it is based on spinlocks
+ *  it provides faster synchronization at the cost of busy-waiting.
+ *
+ *  Variables of this type should be initialized by calling
+ *  gmx_spinlock_barrier_init() to set the number of threads
+ *  that should be synchronized.
+ */
+typedef struct gmx_spinlock_barrier
+{
+	gmx_atomic_t            count;     /*!< Number of threads remaining     */
+	int                     threshold; /*!< Total number of threads         */
+	volatile int            cycle;     /*!< Current cycle (alternating 0/1) */
+}
+gmx_spinlock_barrier_t;
+ 
+
+
+
+/*! \brief Initialize spinlock-based barrier
+ *
+ *  \param barrier  Pointer to _spinlock_ barrier. Note that this is not
+ *                  the same datatype as the full, thread based, barrier.
+ *  \param count    Number of threads to synchronize. All threads
+ *                  will be released after \a count calls to 
+ *                  gmx_spinlock_barrier_wait().  
+ */
+static inline void 
+gmx_spinlock_barrier_init(gmx_spinlock_barrier_t *         barrier,
+                          int                              count)
+{
+	barrier->threshold = count;
+	barrier->cycle     = 0;
+	gmx_atomic_set(&(barrier->count),count);
+}
+
+
+
+
+/*! \brief Perform busy-waiting barrier synchronization
+*
+*  This routine blocks until it has been called N times,
+*  where N is the count value the barrier was initialized with.
+*  After N total calls all threads return. The barrier automatically
+*  cycles, and thus requires another N calls to unblock another time.
+*
+*  Note that spinlock-based barriers are completely different from
+*  standard ones (using mutexes and condition variables), only the 
+*  functionality and names are similar.
+*
+*  \param barrier  Pointer to previously create barrier.
+*
+*  \return The last thread returns -1, all the others 0.
+*/
+static inline int
+gmx_spinlock_barrier_wait(gmx_spinlock_barrier_t *   barrier)
+{
+  int    cycle;
+  int    status;
+  
+  /* We don't need to lock or use atomic ops here, since the cycle index 
+	* cannot change until after the last thread has performed the check
+	* further down. Further, they cannot reach this point in the next 
+	* barrier iteration until all of them have been released, and that 
+	* happens after the cycle value has been updated.
+	*
+	* No synchronization == fast synchronization.
+	*/
+  cycle = barrier->cycle;
+  
+  /* Decrement the count atomically and check if it is zero.
+	* This will only be true for the last thread calling us.
+	*/
+  if( gmx_atomic_add_return( &(barrier->count), -1 ) == 0)
+  { 
+	gmx_atomic_set(&(barrier->count), barrier->threshold);
+	barrier->cycle = !barrier->cycle;
+    
+	status = -1;
+  }
+  else
+  {
+	/* Wait until the last thread changes the cycle index.
+	* We are both using a memory barrier, and explicit
+	* volatile pointer cast to make sure the compiler
+	* doesn't try to be smart and cache the contents.
+	*/
+	do
+	{ 
+	  gmx_atomic_memory_barrier();
+	} 
+	while( *(volatile int *)(&(barrier->cycle)) == cycle);
+	
+	status = 0;
+  }
+  return status;
+}
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _GMX_ATOMIC_H_ */
--- a/platforms/cpu/tests/CMakeLists.txt
+++ b/platforms/cpu/tests/CMakeLists.txt
@@ -20,6 +20,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)

 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/platforms/cpu/tests/TestCpuGBSAOBCForce.cpp
+++ b/platforms/cpu/tests/TestCpuGBSAOBCForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CPU implementation of GBSAOBCForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CpuPlatform.h"
+#include "openmm/GBSAOBCForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include "openmm/NonbondedForce.h"
+#include "SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+void testSingleParticle() {
+    CpuPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+    GBSAOBCForce* forceField = new GBSAOBCForce();
+    forceField->addParticle(0.5, 0.15, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(1);
+    positions[0] = Vec3(0, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+    double bornRadius = 0.15-0.009; // dielectric offset
+    double eps0 = EPSILON0;
+    double bornEnergy = (-0.5*0.5/(8*PI_M*eps0))*(1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric())/bornRadius;
+    double extendedRadius = 0.15+0.14; // probe radius
+    double nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.15/bornRadius, 6.0); // Where did this formula come from?  Just copied it from CpuImplicitSolvent.cpp
+    ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
+    
+    // Change the parameters and see if it is still correct.
+    
+    forceField->setParticleParameters(0, 0.4, 0.25, 1);
+    forceField->updateParametersInContext(context);
+    state = context.getState(State::Energy);
+    bornRadius = 0.25-0.009; // dielectric offset
+    bornEnergy = (-0.4*0.4/(8*PI_M*eps0))*(1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric())/bornRadius;
+    extendedRadius = bornRadius+0.14;
+    nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.25/bornRadius, 6.0);
+    ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
+}
+
+void testCutoffAndPeriodic() {
+    CpuPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+    GBSAOBCForce* gbsa = new GBSAOBCForce();
+    NonbondedForce* nonbonded = new NonbondedForce();
+    gbsa->addParticle(-1, 0.15, 1);
+    nonbonded->addParticle(-1, 1, 0);
+    gbsa->addParticle(1, 0.15, 1);
+    nonbonded->addParticle(1, 1, 0);
+    const double cutoffDistance = 3.0;
+    const double boxSize = 10.0;
+    nonbonded->setCutoffDistance(cutoffDistance);
+    gbsa->setCutoffDistance(cutoffDistance);
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    system.addForce(gbsa);
+    system.addForce(nonbonded);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(2, 0, 0);
+
+    // Calculate the forces for both cutoff and periodic with two different atom positions.
+
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State state1 = context.getState(State::Forces);
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state2 = context.getState(State::Forces);
+    positions[1][0]+= boxSize;
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state3 = context.getState(State::Forces);
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state4 = context.getState(State::Forces);
+
+    // All forces should be identical, exception state3 which should be zero.
+
+    ASSERT_EQUAL_VEC(state1.getForces()[0], state2.getForces()[0], 0.01);
+    ASSERT_EQUAL_VEC(state1.getForces()[1], state2.getForces()[1], 0.01);
+    ASSERT_EQUAL_VEC(state1.getForces()[0], state4.getForces()[0], 0.01);
+    ASSERT_EQUAL_VEC(state1.getForces()[1], state4.getForces()[1], 0.01);
+    ASSERT_EQUAL_VEC(state3.getForces()[0], Vec3(0, 0, 0), 0.01);
+    ASSERT_EQUAL_VEC(state3.getForces()[1], Vec3(0, 0, 0), 0.01);
+}
+
+void testForce(int numParticles, NonbondedForce::NonbondedMethod method, GBSAOBCForce::NonbondedMethod method2) {
+    CpuPlatform platform;
+    ReferencePlatform reference;
+    System system;
+    GBSAOBCForce* gbsa = new GBSAOBCForce();
+    NonbondedForce* nonbonded = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(1.0);
+        double charge = i%2 == 0 ? -1 : 1;
+        gbsa->addParticle(charge, 0.15, 1);
+        nonbonded->addParticle(charge, 1, 0);
+    }
+    nonbonded->setNonbondedMethod(method);
+    gbsa->setNonbondedMethod(method2);
+    nonbonded->setCutoffDistance(3.0);
+    gbsa->setCutoffDistance(3.0);
+    int grid = (int) floor(0.5+pow(numParticles, 1.0/3.0));
+    if (method == NonbondedForce::CutoffPeriodic) {
+        double boxSize = (grid+1)*1.1;
+        system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    }
+    system.addForce(gbsa);
+    system.addForce(nonbonded);
+    LangevinIntegrator integrator1(0, 0.1, 0.01);
+    LangevinIntegrator integrator2(0, 0.1, 0.01);
+    Context context(system, integrator1, platform);
+    Context refContext(system, integrator2, reference);
+
+    // Set random (but uniformly distributed) positions for all the particles.
+
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int i = 0; i < grid; i++)
+        for (int j = 0; j < grid; j++)
+            for (int k = 0; k < grid; k++)
+                positions[i*grid*grid+j*grid+k] = Vec3(i*1.1, j*1.1, k*1.1);
+    for (int i = 0; i < numParticles; ++i)
+        positions[i] = positions[i] + Vec3(0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt));
+    context.setPositions(positions);
+    refContext.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    State refState = refContext.getState(State::Forces | State::Energy);
+
+    // Make sure the CPU and Reference platforms agree.
+
+    double norm = 0.0;
+    double diff = 0.0;
+    for (int i = 0; i < numParticles; ++i) {
+        Vec3 f = state.getForces()[i];
+        norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+        Vec3 delta = f-refState.getForces()[i];
+        diff += delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
+    }
+    norm = std::sqrt(norm);
+    diff = std::sqrt(diff);
+    ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm);
+    ASSERT_EQUAL_TOL(state.getPotentialEnergy(), refState.getPotentialEnergy(), 1e-3);
+
+    // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
+    // (This doesn't work with cutoffs, since the energy changes discontinuously at the cutoff distance.)
+
+    if (method == NonbondedForce::NoCutoff)
+    {
+        const double delta = 0.3;
+        double step = 0.5*delta/norm;
+        vector<Vec3> positions2(numParticles), positions3(numParticles);
+        for (int i = 0; i < numParticles; ++i) {
+            Vec3 p = positions[i];
+            Vec3 f = state.getForces()[i];
+            positions2[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+            positions3[i] = Vec3(p[0]+f[0]*step, p[1]+f[1]*step, p[2]+f[2]*step);
+        }
+        context.setPositions(positions2);
+        State state2 = context.getState(State::Energy);
+        context.setPositions(positions3);
+        State state3 = context.getState(State::Energy);
+        ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/delta, 1e-2)
+    }
+}
+
+int main() {
+    try {
+        if (!CpuPlatform::isProcessorSupported()) {
+            cout << "CPU is not supported.  Exiting." << endl;
+            return 0;
+        }
+        testSingleParticle();
+        testCutoffAndPeriodic();
+        for (int i = 5; i < 11; i++) {
+            testForce(i*i*i, NonbondedForce::NoCutoff, GBSAOBCForce::NoCutoff);
+            testForce(i*i*i, NonbondedForce::CutoffNonPeriodic, GBSAOBCForce::CutoffNonPeriodic);
+            testForce(i*i*i, NonbondedForce::CutoffPeriodic, GBSAOBCForce::CutoffPeriodic);
+        }
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cpu/tests/TestCpuLangevinIntegrator.cpp
+++ b/platforms/cpu/tests/TestCpuLangevinIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the reference implementation of LangevinIntegrator.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CpuPlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include "SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+void testSingleBond() {
+    CpuPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    system.addParticle(2.0);
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    
+    // This is simply a damped harmonic oscillator, so compare it to the analytical solution.
+    
+    double freq = std::sqrt(1-0.05*0.05);
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Velocities);
+        double time = state.getTime();
+        double expectedDist = 1.5+0.5*std::exp(-0.05*time)*std::cos(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
+        double expectedSpeed = -0.5*std::exp(-0.05*time)*(0.05*std::cos(freq*time)+freq*std::sin(freq*time));
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.02);
+        integrator.step(1);
+    }
+    
+    // Not set the friction to a tiny value and see if it conserves energy.
+    
+    integrator.setFriction(5e-5);
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+    double initialEnergy = state.getKineticEnergy()+state.getPotentialEnergy();
+    for (int i = 0; i < 1000; ++i) {
+        state = context.getState(State::Energy);
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+
+void testTemperature() {
+    const int numParticles = 8;
+    const double temp = 100.0;
+    CpuPlatform platform;
+    System system;
+    LangevinIntegrator integrator(temp, 2.0, 0.01);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(2.0);
+        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
+    }
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
+    context.setPositions(positions);
+    
+    // Let it equilibrate.
+    
+    integrator.step(10000);
+    
+    // Now run it for a while and see if the temperature is correct.
+    
+    double ke = 0.0;
+    for (int i = 0; i < 10000; ++i) {
+        State state = context.getState(State::Energy);
+        ke += state.getKineticEnergy();
+        integrator.step(1);
+    }
+    ke /= 10000;
+    double expected = 0.5*numParticles*3*BOLTZ*temp;
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt(10000.0));
+}
+
+void testConstraints() {
+    const int numParticles = 8;
+    const double temp = 100.0;
+    CpuPlatform platform;
+    System system;
+    LangevinIntegrator integrator(temp, 2.0, 0.01);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    for (int i = 0; i < numParticles-1; ++i)
+        system.addConstraint(i, i+1, 1.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i/2, (i+1)/2, 0);
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    
+    // Simulate it and see whether the constraints remain satisfied.
+    
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions);
+        for (int j = 0; j < numParticles-1; ++j) {
+            Vec3 p1 = state.getPositions()[j];
+            Vec3 p2 = state.getPositions()[j+1];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(1.0, dist, 2e-5);
+        }
+        integrator.step(1);
+    }
+}
+
+void testConstrainedMasslessParticles() {
+    CpuPlatform platform;
+    System system;
+    system.addParticle(0.0);
+    system.addParticle(1.0);
+    system.addConstraint(0, 1, 1.5);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    LangevinIntegrator integrator(300.0, 2.0, 0.01);
+    bool failed = false;
+    try {
+        // This should throw an exception.
+        
+        Context context(system, integrator, platform);
+    }
+    catch (exception& ex) {
+        failed = true;
+    }
+    ASSERT(failed);
+    
+    // Now make both particles massless, which should work.
+    
+    system.setParticleMass(1, 0.0);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocitiesToTemperature(300.0);
+    integrator.step(1);
+    State state = context.getState(State::Velocities | State::Positions);
+    ASSERT_EQUAL(0.0, state.getVelocities()[0][0]);
+}
+
+void testRandomSeed() {
+    const int numParticles = 8;
+    const double temp = 100.0;
+    const double collisionFreq = 10.0;
+    CpuPlatform platform;
+    System system;
+    LangevinIntegrator integrator(temp, 2.0, 0.01);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(2.0);
+        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
+    }
+    system.addForce(forceField);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
+        velocities[i] = Vec3(0, 0, 0);
+    }
+
+    // Try twice with the same random seed.
+
+    integrator.setRandomNumberSeed(5);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state1 = context.getState(State::Positions);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state2 = context.getState(State::Positions);
+
+    // Try twice with a different random seed.
+
+    integrator.setRandomNumberSeed(10);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state3 = context.getState(State::Positions);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state4 = context.getState(State::Positions);
+
+    // Compare the results.
+
+    for (int i = 0; i < numParticles; i++) {
+        for (int j = 0; j < 3; j++) {
+            ASSERT_EQUAL_TOL(state1.getPositions()[i][j], state2.getPositions()[i][j], 1e-5);
+            ASSERT_EQUAL_TOL(state3.getPositions()[i][j], state4.getPositions()[i][j], 1e-5);
+            ASSERT(state1.getPositions()[i][j] != state3.getPositions()[i][j]);
+        }
+    }
+}
+
+int main() {
+    try {
+        if (!CpuPlatform::isProcessorSupported()) {
+            cout << "CPU is not supported.  Exiting." << endl;
+            return 0;
+        }
+        testSingleBond();
+        testTemperature();
+        testConstraints();
+        testConstrainedMasslessParticles();
+        testRandomSeed();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -34,6 +34,8 @@
 */

 #include "openmm/internal/AssertionUtilities.h"
+#include "openmm/internal/ThreadPool.h"
+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -49,9 +51,10 @@ void testNeighborList(bool periodic) {
    const int numParticles = 500;
    const float cutoff = 2.0f;
    const float boxSize[3] = {20.0f, 15.0f, 22.0f};
+    const int blockSize = 8;
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    vector<float> positions(4*numParticles);
+    AlignedArray<float> positions(4*numParticles);
    for (int i = 0; i < 4*numParticles; i++)
        if (i%4 < 3)
            positions[i] = boxSize[i%4]*genrand_real2(sfmt);
@@ -63,15 +66,16 @@ void testNeighborList(bool periodic) {
            exclusions[i-j].insert(i);
        }
    }
-    CpuNeighborList neighborList;
-    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff);
+    ThreadPool threads;
+    CpuNeighborList neighborList(blockSize);
+    neighborList.computeNeighborList(numParticles, positions, exclusions, boxSize, periodic, cutoff, threads);
    
    // Convert the neighbor list to a set for faster lookup.
    
    set<pair<int, int> > neighbors;
    for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
-        int blockIndex = i/CpuNeighborList::BlockSize;
-        int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
+        int blockIndex = i/blockSize;
+        int indexInBlock = i-blockIndex*blockSize;
        char mask = 1<<indexInBlock;
        for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
            if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {

--- a/platforms/cpu/tests/TestCpuNonbondedForce.cpp
+++ b/platforms/cpu/tests/TestCpuNonbondedForce.cpp
@@ -397,44 +397,44 @@ void testLargeSystem() {
    system.addForce(bonds);
    VerletIntegrator integrator1(0.01);
    VerletIntegrator integrator2(0.01);
-    Context cuContext(system, integrator1, platform);
+    Context cpuContext(system, integrator1, platform);
    Context referenceContext(system, integrator2, reference);
-    cuContext.setPositions(positions);
-    cuContext.setVelocities(velocities);
+    cpuContext.setPositions(positions);
+    cpuContext.setVelocities(velocities);
    referenceContext.setPositions(positions);
    referenceContext.setVelocities(velocities);
-    State cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    State cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    State referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++) {
-        ASSERT_EQUAL_VEC(cuState.getPositions()[i], referenceState.getPositions()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getPositions()[i], referenceState.getPositions()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
    }
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);

    // Now do the same thing with periodic boundary conditions.

    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
-    cuContext.reinitialize();
+    cpuContext.reinitialize();
    referenceContext.reinitialize();
-    cuContext.setPositions(positions);
-    cuContext.setVelocities(velocities);
+    cpuContext.setPositions(positions);
+    cpuContext.setVelocities(velocities);
    referenceContext.setPositions(positions);
    referenceContext.setVelocities(velocities);
-    cuState = cuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
+    cpuState = cpuContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    referenceState = referenceContext.getState(State::Positions | State::Velocities | State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++) {
-        double dx = cuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
-        double dy = cuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
-        double dz = cuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
-        ASSERT_EQUAL_TOL(fmod(cuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
-        ASSERT_EQUAL_VEC(cuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
+        double dx = cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0];
+        double dy = cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1];
+        double dz = cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2];
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][0]-referenceState.getPositions()[i][0], boxSize), 0, tol);
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][1]-referenceState.getPositions()[i][1], boxSize), 0, tol);
+        ASSERT_EQUAL_TOL(fmod(cpuState.getPositions()[i][2]-referenceState.getPositions()[i][2], boxSize), 0, tol);
+        ASSERT_EQUAL_VEC(cpuState.getVelocities()[i], referenceState.getVelocities()[i], tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
    }
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }

 void testDispersionCorrection() {
@@ -542,15 +542,15 @@ void testChangingParameters() {
    
    VerletIntegrator integrator1(0.01);
    VerletIntegrator integrator2(0.01);
-    Context cuContext(system, integrator1, platform);
+    Context cpuContext(system, integrator1, platform);
    Context referenceContext(system, integrator2, reference);
-    cuContext.setPositions(positions);
+    cpuContext.setPositions(positions);
    referenceContext.setPositions(positions);
-    State cuState = cuContext.getState(State::Forces | State::Energy);
+    State cpuState = cpuContext.getState(State::Forces | State::Energy);
    State referenceState = referenceContext.getState(State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++)
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
    
    // Now modify parameters and see if they still agree.

@@ -559,13 +559,13 @@ void testChangingParameters() {
        nonbonded->getParticleParameters(i, charge, sigma, epsilon);
        nonbonded->setParticleParameters(i, 1.5*charge, 1.1*sigma, 1.7*epsilon);
    }
-    nonbonded->updateParametersInContext(cuContext);
+    nonbonded->updateParametersInContext(cpuContext);
    nonbonded->updateParametersInContext(referenceContext);
-    cuState = cuContext.getState(State::Forces | State::Energy);
+    cpuState = cpuContext.getState(State::Forces | State::Energy);
    referenceState = referenceContext.getState(State::Forces | State::Energy);
    for (int i = 0; i < numParticles; i++)
-        ASSERT_EQUAL_VEC(cuState.getForces()[i], referenceState.getForces()[i], tol);
-    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
+        ASSERT_EQUAL_VEC(cpuState.getForces()[i], referenceState.getForces()[i], tol);
+    ASSERT_EQUAL_TOL(cpuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }

 void testSwitchingFunction(NonbondedForce::NonbondedMethod method) {

--- a/platforms/cpu/tests/TestCpuPeriodicTorsionForce.cpp
+++ b/platforms/cpu/tests/TestCpuPeriodicTorsionForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2014 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CPU implementation of PeriodicTorsionForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CpuPlatform.h"
+#include "openmm/PeriodicTorsionForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "SimTKOpenMMRealType.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+CpuPlatform platform;
+
+const double TOL = 1e-5;
+
+void testPeriodicTorsions() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    PeriodicTorsionForce* forceField = new PeriodicTorsionForce();
+    forceField->addTorsion(0, 1, 2, 3, 2, PI_M/3, 1.1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(4);
+    positions[0] = Vec3(0, 1, 0);
+    positions[1] = Vec3(0, 0, 0);
+    positions[2] = Vec3(1, 0, 0);
+    positions[3] = Vec3(1, 0, 2);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        double torque = -2*1.1*std::sin(2*PI_M/3);
+        ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, 0), forces[3], TOL);
+        ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
+        ASSERT_EQUAL_TOL(1.1*(1+std::cos(2*PI_M/3)), state.getPotentialEnergy(), TOL);
+    }
+    
+    // Try changing the torsion parameters and make sure it's still correct.
+    
+    forceField->setTorsionParameters(0, 0, 1, 2, 3, 3, PI_M/3.2, 1.3);
+    forceField->updateParametersInContext(context);
+    state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        double dtheta = (3*PI_M/2)-(PI_M/3.2);
+        double torque = -3*1.3*std::sin(dtheta);
+        ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, 0), forces[3], TOL);
+        ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
+        ASSERT_EQUAL_TOL(1.3*(1+std::cos(dtheta)), state.getPotentialEnergy(), TOL);
+    }
+}
+
+void testParallelComputation() {
+    System system;
+    const int numParticles = 200;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    PeriodicTorsionForce* force = new PeriodicTorsionForce();
+    for (int i = 3; i < numParticles; i++)
+        force->addTorsion(i-3, i-2, i-1, i, 2, 1.1, i);
+    system.addForce(force);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(i, i%2, i%3);
+    VerletIntegrator integrator1(0.01);
+    ReferencePlatform reference;
+    Context context1(system, integrator1, reference);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    VerletIntegrator integrator2(0.01);
+    Context context2(system, integrator2, platform);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
+}
+
+int main(int argc, char* argv[]) {
+    try {
+        testPeriodicTorsions();
+        testParallelComputation();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cpu/tests/TestCpuRBTorsionForce.cpp
+++ b/platforms/cpu/tests/TestCpuRBTorsionForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2014 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of RBTorsionForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CpuPlatform.h"
+#include "openmm/RBTorsionForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "SimTKOpenMMRealType.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+CpuPlatform platform;
+
+const double TOL = 1e-5;
+
+void testRBTorsions() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    RBTorsionForce* forceField = new RBTorsionForce();
+    forceField->addTorsion(0, 1, 2, 3, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(4);
+    positions[0] = Vec3(0, 1, 0);
+    positions[1] = Vec3(0, 0, 0);
+    positions[2] = Vec3(1, 0, 0);
+    positions[3] = Vec3(1, 1, 1);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        double psi = 0.25*PI_M - PI_M;
+        double torque = 0.0;
+        for (int i = 1; i < 6; ++i) {
+            double c = 0.1*(i+1);
+            torque += -c*i*std::pow(std::cos(psi), i-1)*std::sin(psi);
+        }
+        ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, -0.5*torque), forces[3], TOL);
+        ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
+        double energy = 0.0;
+        for (int i = 0; i < 6; ++i) {
+            double c = 0.1*(i+1);
+            energy += c*std::pow(std::cos(psi), i);
+        }
+        ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), TOL);
+    }
+    
+    // Try changing the torsion parameters and make sure it's still correct.
+    
+    forceField->setTorsionParameters(0, 0, 1, 2, 3, 0.11, 0.22, 0.33, 0.44, 0.55, 0.66);
+    forceField->updateParametersInContext(context);
+    state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        double psi = 0.25*PI_M - PI_M;
+        double torque = 0.0;
+        for (int i = 1; i < 6; ++i) {
+            double c = 0.11*(i+1);
+            torque += -c*i*std::pow(std::cos(psi), i-1)*std::sin(psi);
+        }
+        ASSERT_EQUAL_VEC(Vec3(0, 0, torque), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, 0.5*torque, -0.5*torque), forces[3], TOL);
+        ASSERT_EQUAL_VEC(Vec3(forces[0][0]+forces[1][0]+forces[2][0]+forces[3][0], forces[0][1]+forces[1][1]+forces[2][1]+forces[3][1], forces[0][2]+forces[1][2]+forces[2][2]+forces[3][2]), Vec3(0, 0, 0), TOL);
+        double energy = 0.0;
+        for (int i = 0; i < 6; ++i) {
+            double c = 0.11*(i+1);
+            energy += c*std::pow(std::cos(psi), i);
+        }
+        ASSERT_EQUAL_TOL(energy, state.getPotentialEnergy(), TOL);
+    }
+}
+
+void testParallelComputation() {
+    System system;
+    const int numParticles = 200;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    RBTorsionForce* force = new RBTorsionForce();
+    for (int i = 3; i < numParticles; i++)
+        force->addTorsion(i-3, i-2, i-1, i, 2, 0.1*i, 0.5*i, i, 1, 1);
+    system.addForce(force);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(i, i%2, i%3);
+    VerletIntegrator integrator1(0.01);
+    ReferencePlatform reference;
+    Context context1(system, integrator1, reference);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    VerletIntegrator integrator2(0.01);
+    Context context2(system, integrator2, platform);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
+}
+
+int main(int argc, char* argv[]) {
+    try {
+        testRBTorsions();
+        testParallelComputation();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}